[X86] Fix spill issue for fr16 #155225
Conversation
@llvm/pr-subscribers-backend-x86

Author: Luo, Yuanke (LuoYuanke)

Changes: When avx512fp16 is not available, we use MOVSS to spill fr16/fr16x registers.

Patch is 20.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155225.diff

8 Files Affected:
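For reference, the verifier failure this patch addresses can be observed by running the new MIR test (added below) through post-RA pseudo expansion with the verifier enabled; this mirrors the test's own RUN line. Without the patch, the run should fail machine verification on the fr16 spill; with it, the MOVSHP pseudos are expanded as shown in the CHECK lines:

  llc -mtriple=x86_64-unknown -start-before=twoaddressinstruction \
      -stop-after=postrapseudos -verify-machineinstrs \
      llvm/test/CodeGen/X86/fp16-reload.mir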
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index abf365eedec39..df94b4e6cd26f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4399,13 +4399,8 @@ static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
if (STI.hasFP16())
return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
if (Load)
- return STI.hasAVX512() ? X86::VMOVSSZrm
- : STI.hasAVX() ? X86::VMOVSSrm
- : X86::MOVSSrm;
- else
- return STI.hasAVX512() ? X86::VMOVSSZmr
- : STI.hasAVX() ? X86::VMOVSSmr
- : X86::MOVSSmr;
+ return X86::MOVSHPrm;
+ return X86::MOVSHPmr;
}
static unsigned getLoadStoreRegOpcode(Register Reg,
@@ -6131,6 +6126,25 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
return true;
}
+static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI,
+ const TargetInstrInfo &TII) {
+ unsigned NewOpc;
+ if (MI.getOpcode() == X86::MOVSHPrm) {
+ NewOpc = X86::MOVSSrm;
+ Register Reg = MI.getOperand(0).getReg();
+ if (Reg > X86::XMM15)
+ NewOpc = X86::VMOVSSZrm;
+ } else {
+ NewOpc = X86::MOVSSmr;
+ Register Reg = MI.getOperand(5).getReg();
+ if (Reg > X86::XMM15)
+ NewOpc = X86::VMOVSSZmr;
+ }
+
+ MIB->setDesc(TII.get(NewOpc));
+ return true;
+}
+
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -6203,6 +6217,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
}
+ case X86::MOVSHPmr:
+ case X86::MOVSHPrm:
+ return expandMOVSHP(MIB, MI, *this);
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB,
get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 1acc0cd8da205..b7926497c92ba 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -267,6 +267,18 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
}
}
+// pseudo instruction for fp16 spilling.
+let isPseudo = 1, Predicates = [HasSSE2] in {
+ let mayStore = 1 in
+ def MOVSHPmr : I<0, Pseudo, (outs), (ins f32mem:$dst, FR16X:$src), "",
+ [], SSEPackedSingle>,
+ Sched<[WriteFStore]>;
+ let mayLoad = 1 in
+ def MOVSHPrm : I<0, Pseudo, (outs FR16X:$dst), (ins f32mem:$src), "",
+ [], SSEPackedSingle>,
+ Sched<[WriteFLoad]>;
+}
+
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
SSEPackedSingle, UseSSE1>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll
index 6d22f669725a2..ecb551f8eb08f 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll
@@ -13,11 +13,11 @@ define <2 x half> @foo(<2 x half> %0) nounwind {
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: callq __truncsfhf2@PLT
-; AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index 8b3aa2964db02..c29adc8e192e4 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -100,10 +100,10 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
; AVX-LABEL: complex_canonicalize_fmul_half:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
-; AVX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
-; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index d59b12c6d1231..9a7d68b76622a 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -277,34 +277,34 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[1,0]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[1,0]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
@@ -312,7 +312,7 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -320,13 +320,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -336,13 +336,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -350,13 +350,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1111,7 +1111,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax
; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
@@ -1121,7 +1121,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/fp16-reload.mir b/llvm/test/CodeGen/X86/fp16-reload.mir
new file mode 100644
index 0000000000000..ddbd48cbf3ee7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp16-reload.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=x86_64-unknown -start-before=twoaddressinstruction -stop-after=postrapseudos -verify-machineinstrs -o - %s | FileCheck %s
+
+...
+---
+name: test
+alignment: 16
+tracksRegLiveness: true
+debugInstrRef: true
+registers:
+liveins:
+ - { reg: '$xmm0', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+ hasCalls: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $xmm0
+
+ ; CHECK-LABEL: name: test
+ ; CHECK: liveins: $xmm0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: MOVSSmr $rsp, 1, $noreg, -4, $noreg, $xmm0 :: (store (s32) into %stack.0, align 2)
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $xmm0, 12 /* clobber */, implicit-def dead early-clobber $xmm1, 12 /* clobber */, implicit-def dead early-clobber $xmm2, 12 /* clobber */, implicit-def dead early-clobber $xmm3, 12 /* clobber */, implicit-def dead early-clobber $xmm4, 12 /* clobber */, implicit-def dead early-clobber $xmm5, 12 /* clobber */, implicit-def dead early-clobber $xmm6, 12 /* clobber */, implicit-def dead early-clobber $xmm7, 12 /* clobber */, implicit-def dead early-clobber $xmm8, 12 /* clobber */, implicit-def dead early-clobber $xmm9, 12 /* clobber */, implicit-def dead early-clobber $xmm10, 12 /* clobber */, implicit-def dead early-clobber $xmm11, 12 /* clobber */, implicit-def dead early-clobber $xmm12, 12 /* clobber */, implicit-def dead early-clobber $xmm13, 12 /* clobber */, implicit-def dead early-clobber $xmm14, 12 /* clobber */, implicit-def dead early-clobber $xmm15, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+ ; CHECK-NEXT: renamable $xmm0 = MOVSSrm $rsp, 1, $noreg, -4, $noreg :: (load (s32) from %stack.0, align 2)
+ ; CHECK-NEXT: FNOP implicit-def $fpsw, implicit killed renamable $xmm0
+ ; CHECK-NEXT: RET 0
+ %0:fr16 = COPY killed $xmm0
+ INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $xmm0, 12 /* clobber */, implicit-def dead early-clobber $xmm1, 12 /* clobber */, implicit-def dead early-clobber $xmm2, 12 /* clobber */, implicit-def dead early-clobber $xmm3, 12 /* clobber */, implicit-def dead early-clobber $xmm4, 12 /* clobber */, implicit-def dead early-clobber $xmm5, 12 /* clobber */, implicit-def dead early-clobber $xmm6, 12 /* clobber */, implicit-def dead early-clobber $xmm7, 12 /* clobber */, implicit-def dead early-clobber $xmm8, 12 /* clobber */, implicit-def dead early-clobber $xmm9, 12 /* clobber */, implicit-def dead early-clobber $xmm10, 12 /* clobber */, implicit-def dead early-clobber $xmm11, 12 /* clobber */, implicit-def dead early-clobber $xmm12, 12 /* clobber */, implicit-def dead early-clobber $xmm13, 12 /* clobber */, implicit-def dead early-clobber $xmm14, 12 /* clobber */, implicit-def dead early-clobber $xmm15, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+ FNOP implicit-def $fpsw, implicit %0:fr16
+ RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/fp16-spill.ll b/llvm/test/CodeGen/X86/fp16-spill.ll
new file mode 100644
index 0000000000000..9c0b506fc0ef8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp16-spill.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
+
+define half @test(float %f, ptr %p) nounwind {
+; SSE2-LABEL: test:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: subq $16, %rsp
+; SSE2-NEXT: movq %rdi, %rbx
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: #APP
+; SSE2-NEXT: #NO_APP
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss %xmm0, (%rbx)
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: addq $16, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test:
+; AVX: # %bb.0:
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: subq $16, %rsp
+; AVX-NEXT: movq %rdi, %rbx
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT: callq __extendhfsf2@PLT
+; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT: #APP
+; AVX-NEXT: #NO_APP
+; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss %xmm0, (%rbx)
+; AVX-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: addq $16, %rsp
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: #APP
+; AVX512-NEXT: #NO_APP
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vmovss %xmm0, (%rdi)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: retq
+ %t = fptrunc float %f to half
+ %t2 = fpext half %t to float
+ tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
+ store float %t2, ptr %p
+ ret half %t
+}
diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll
index 959265d08299a..1edc22f503506 100644
--- a/llvm/test/CodeGen/X86/frem.ll
+++ b/llvm/test/CodeGen/X86/frem.ll
@@ -9,11 +9,11 @@ define void @frem_f16(half %a0, half %a1, ptr%p3) nounwind {
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $16, %rsp
; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEX...
[truncated]
When avx512fp16 is not available, we use MOVSS to spill fr16/fr16x registers. However, MOVSSmr requires the fr32 register class and MOVSSrm requires the vr128 register class, which causes the machine verifier to report a bad instruction. To fix this, the patch introduces a pseudo instruction, MOVSHP, for fr16 register spilling; MOVSHP is expanded to MOVSS or VMOVSSZ depending on the register number.
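In sketch form, the post-RA expansion rule amounts to the following compact restatement of the expandMOVSHP hunk above (getRealOpcode is a hypothetical helper name used here for illustration, not part of the patch). Registers above XMM15, i.e. xmm16-xmm31, have no legacy/VEX encoding, so they need the EVEX VMOVSSZ forms:

  // Pick the real opcode for a MOVSHP pseudo after register allocation.
  static unsigned getRealOpcode(const MachineInstr &MI) {
    bool IsLoad = MI.getOpcode() == X86::MOVSHPrm;
    // The spilled register is operand 0 for a load and operand 5 for a
    // store (operands 0-4 of the store encode the memory address).
    Register Reg = MI.getOperand(IsLoad ? 0 : 5).getReg();
    if (Reg > X86::XMM15) // EVEX-only registers xmm16-xmm31.
      return IsLoad ? X86::VMOVSSZrm : X86::VMOVSSZmr;
    return IsLoad ? X86::MOVSSrm : X86::MOVSSmr;
  }

This keeps the common case on the legacy MOVSS encodings while still handling the fr16x registers that only AVX-512 instructions can address.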
LGTM with one nit.
It seems I don't have permission to merge the patch. @phoebewang, could you help land it?