Skip to content

Commit 1084b32

Browse files
committed
[ARM] Always replace FP16 bitcasts with VMOVhr or VMOVrh
This changes the logic for lowering fp16 bitcasts to always produce either a VMOVhr or a VMOVrh, instead of only doing so when certain surrounding nodes are present. To preserve the same optimisations, demanded-bits and known-bits information has been added for these nodes. Differential Revision: https://reviews.llvm.org/D78587
1 parent 25a4b19 commit 1084b32

File tree

3 files changed

+44
-93
lines changed

3 files changed

+44
-93
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 34 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5752,57 +5752,25 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
57525752
SDLoc dl(N);
57535753
SDValue Op = N->getOperand(0);
57545754

5755-
// This function is only supposed to be called for i64 types, either as the
5756-
// source or destination of the bit convert.
5755+
// This function is only supposed to be called for i16 and i64 types, either
5756+
// as the source or destination of the bit convert.
57575757
EVT SrcVT = Op.getValueType();
57585758
EVT DstVT = N->getValueType(0);
5759-
const bool HasFullFP16 = Subtarget->hasFullFP16();
57605759

57615760
if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
5762-
if (!HasFullFP16)
5761+
if (!Subtarget->hasFullFP16())
57635762
return SDValue();
5764-
// SoftFP: read half-precision arguments:
5765-
//
5766-
// t2: i32,ch = ...
5767-
// t7: i16 = truncate t2 <~~~~ Op
5768-
// t8: f16 = bitcast t7 <~~~~ N
5769-
//
5770-
if (Op.getOperand(0).getValueType() == MVT::i32)
5771-
return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
5772-
MVT::f16, Op.getOperand(0));
5773-
5774-
return SDValue();
5763+
// f16 bitcast i16 -> VMOVhr
5764+
return DAG.getNode(ARMISD::VMOVhr, SDLoc(N), MVT::f16,
5765+
DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
57755766
}
57765767

5777-
// Half-precision return values
57785768
if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
5779-
if (!HasFullFP16)
5769+
if (!Subtarget->hasFullFP16())
57805770
return SDValue();
5781-
//
5782-
// t11: f16 = fadd t8, t10
5783-
// t12: i16 = bitcast t11 <~~~ SDNode N
5784-
// t13: i32 = zero_extend t12
5785-
// t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
5786-
// t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
5787-
//
5788-
// transform this into:
5789-
//
5790-
// t20: i32 = ARMISD::VMOVrh t11
5791-
// t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
5792-
//
5793-
auto ZeroExtend = N->use_begin();
5794-
if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
5795-
ZeroExtend->getValueType(0) != MVT::i32)
5796-
return SDValue();
5797-
5798-
auto Copy = ZeroExtend->use_begin();
5799-
if (Copy->getOpcode() == ISD::CopyToReg &&
5800-
Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
5801-
SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
5802-
DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
5803-
return Cvt;
5804-
}
5805-
return SDValue();
5771+
// i16 bitcast f16 -> VMOVrh
5772+
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16,
5773+
DAG.getNode(ARMISD::VMOVrh, SDLoc(N), MVT::i32, Op));
58065774
}
58075775

58085776
if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
@@ -13019,16 +12987,25 @@ static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &
1301912987
// t2: f32,ch = CopyFromReg t0, Register:f32 %0
1302012988
// t5: i32 = bitcast t2
1302112989
// t18: f16 = ARMISD::VMOVhr t5
13022-
SDValue BC = N->getOperand(0);
13023-
if (BC->getOpcode() != ISD::BITCAST)
13024-
return SDValue();
13025-
SDValue Copy = BC->getOperand(0);
13026-
if (Copy.getValueType() != MVT::f32 || Copy->getOpcode() != ISD::CopyFromReg)
13027-
return SDValue();
12990+
SDValue Op0 = N->getOperand(0);
12991+
if (Op0->getOpcode() == ISD::BITCAST) {
12992+
SDValue Copy = Op0->getOperand(0);
12993+
if (Copy.getValueType() == MVT::f32 &&
12994+
Copy->getOpcode() == ISD::CopyFromReg) {
12995+
SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
12996+
SDValue NewCopy =
12997+
DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), MVT::f16, Ops);
12998+
return NewCopy;
12999+
}
13000+
}
1302813001

13029-
SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
13030-
SDValue NewCopy = DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), MVT::f16, Ops);
13031-
return NewCopy;
13002+
// Only the bottom 16 bits of the source register are used.
13003+
APInt DemandedMask = APInt::getLowBitsSet(32, 16);
13004+
const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
13005+
if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
13006+
return SDValue(N, 0);
13007+
13008+
return SDValue();
1303213009
}
1303313010

1303413011
static SDValue PerformVMOVrhCombine(SDNode *N,
@@ -16393,6 +16370,12 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
1639316370
assert(DstSz == Known.getBitWidth());
1639416371
break;
1639516372
}
16373+
case ARMISD::VMOVrh: {
16374+
KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
16375+
assert(KnownOp.getBitWidth() == 16);
16376+
Known = KnownOp.zext(32);
16377+
break;
16378+
}
1639616379
}
1639716380
}
1639816381

llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -220,14 +220,11 @@ entry:
220220
define arm_aapcs_vfpcc float @test_vminnmvq_f16(float %a.coerce, <8 x half> %b) {
221221
; CHECK-LABEL: test_vminnmvq_f16:
222222
; CHECK: @ %bb.0: @ %entry
223-
; CHECK-NEXT: sub sp, #4
224223
; CHECK-NEXT: vmov r0, s0
225224
; CHECK-NEXT: vminnmv.f16 r0, q1
226225
; CHECK-NEXT: vmov s0, r0
227-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
228-
; CHECK-NEXT: ldrh.w r0, [sp, #2]
226+
; CHECK-NEXT: vmov.f16 r0, s0
229227
; CHECK-NEXT: vmov s0, r0
230-
; CHECK-NEXT: add sp, #4
231228
; CHECK-NEXT: bx lr
232229
entry:
233230
%0 = bitcast float %a.coerce to i32
@@ -255,14 +252,11 @@ entry:
255252
define arm_aapcs_vfpcc float @test_vminnmavq_f16(float %a.coerce, <8 x half> %b) {
256253
; CHECK-LABEL: test_vminnmavq_f16:
257254
; CHECK: @ %bb.0: @ %entry
258-
; CHECK-NEXT: sub sp, #4
259255
; CHECK-NEXT: vmov r0, s0
260256
; CHECK-NEXT: vminnmav.f16 r0, q1
261257
; CHECK-NEXT: vmov s0, r0
262-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
263-
; CHECK-NEXT: ldrh.w r0, [sp, #2]
258+
; CHECK-NEXT: vmov.f16 r0, s0
264259
; CHECK-NEXT: vmov s0, r0
265-
; CHECK-NEXT: add sp, #4
266260
; CHECK-NEXT: bx lr
267261
entry:
268262
%0 = bitcast float %a.coerce to i32
@@ -290,14 +284,11 @@ entry:
290284
define arm_aapcs_vfpcc float @test_vmaxnmvq_f16(float %a.coerce, <8 x half> %b) {
291285
; CHECK-LABEL: test_vmaxnmvq_f16:
292286
; CHECK: @ %bb.0: @ %entry
293-
; CHECK-NEXT: sub sp, #4
294287
; CHECK-NEXT: vmov r0, s0
295288
; CHECK-NEXT: vmaxnmv.f16 r0, q1
296289
; CHECK-NEXT: vmov s0, r0
297-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
298-
; CHECK-NEXT: ldrh.w r0, [sp, #2]
290+
; CHECK-NEXT: vmov.f16 r0, s0
299291
; CHECK-NEXT: vmov s0, r0
300-
; CHECK-NEXT: add sp, #4
301292
; CHECK-NEXT: bx lr
302293
entry:
303294
%0 = bitcast float %a.coerce to i32
@@ -325,14 +316,11 @@ entry:
325316
define arm_aapcs_vfpcc float @test_vmaxnmavq_f16(float %a.coerce, <8 x half> %b) {
326317
; CHECK-LABEL: test_vmaxnmavq_f16:
327318
; CHECK: @ %bb.0: @ %entry
328-
; CHECK-NEXT: sub sp, #4
329319
; CHECK-NEXT: vmov r0, s0
330320
; CHECK-NEXT: vmaxnmav.f16 r0, q1
331321
; CHECK-NEXT: vmov s0, r0
332-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
333-
; CHECK-NEXT: ldrh.w r0, [sp, #2]
322+
; CHECK-NEXT: vmov.f16 r0, s0
334323
; CHECK-NEXT: vmov s0, r0
335-
; CHECK-NEXT: add sp, #4
336324
; CHECK-NEXT: bx lr
337325
entry:
338326
%0 = bitcast float %a.coerce to i32
@@ -648,16 +636,13 @@ entry:
648636
define arm_aapcs_vfpcc float @test_vminnmvq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
649637
; CHECK-LABEL: test_vminnmvq_p_f16:
650638
; CHECK: @ %bb.0: @ %entry
651-
; CHECK-NEXT: sub sp, #4
652639
; CHECK-NEXT: vmov r1, s0
653640
; CHECK-NEXT: vmsr p0, r0
654641
; CHECK-NEXT: vpst
655642
; CHECK-NEXT: vminnmvt.f16 r1, q1
656643
; CHECK-NEXT: vmov s0, r1
657-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
658-
; CHECK-NEXT: ldrh.w r0, [sp, #2]
644+
; CHECK-NEXT: vmov.f16 r0, s0
659645
; CHECK-NEXT: vmov s0, r0
660-
; CHECK-NEXT: add sp, #4
661646
; CHECK-NEXT: bx lr
662647
entry:
663648
%0 = bitcast float %a.coerce to i32
@@ -691,16 +676,13 @@ entry:
691676
define arm_aapcs_vfpcc float @test_vminnmavq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
692677
; CHECK-LABEL: test_vminnmavq_p_f16:
693678
; CHECK: @ %bb.0: @ %entry
694-
; CHECK-NEXT: sub sp, #4
695679
; CHECK-NEXT: vmov r1, s0
696680
; CHECK-NEXT: vmsr p0, r0
697681
; CHECK-NEXT: vpst
698682
; CHECK-NEXT: vminnmavt.f16 r1, q1
699683
; CHECK-NEXT: vmov s0, r1
700-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
701-
; CHECK-NEXT: ldrh.w r0, [sp, #2]
684+
; CHECK-NEXT: vmov.f16 r0, s0
702685
; CHECK-NEXT: vmov s0, r0
703-
; CHECK-NEXT: add sp, #4
704686
; CHECK-NEXT: bx lr
705687
entry:
706688
%0 = bitcast float %a.coerce to i32
@@ -734,16 +716,13 @@ entry:
734716
define arm_aapcs_vfpcc float @test_vmaxnmvq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
735717
; CHECK-LABEL: test_vmaxnmvq_p_f16:
736718
; CHECK: @ %bb.0: @ %entry
737-
; CHECK-NEXT: sub sp, #4
738719
; CHECK-NEXT: vmov r1, s0
739720
; CHECK-NEXT: vmsr p0, r0
740721
; CHECK-NEXT: vpst
741722
; CHECK-NEXT: vmaxnmvt.f16 r1, q1
742723
; CHECK-NEXT: vmov s0, r1
743-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
744-
; CHECK-NEXT: ldrh.w r0, [sp, #2]
724+
; CHECK-NEXT: vmov.f16 r0, s0
745725
; CHECK-NEXT: vmov s0, r0
746-
; CHECK-NEXT: add sp, #4
747726
; CHECK-NEXT: bx lr
748727
entry:
749728
%0 = bitcast float %a.coerce to i32
@@ -777,16 +756,13 @@ entry:
777756
define arm_aapcs_vfpcc float @test_vmaxnmavq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
778757
; CHECK-LABEL: test_vmaxnmavq_p_f16:
779758
; CHECK: @ %bb.0: @ %entry
780-
; CHECK-NEXT: sub sp, #4
781759
; CHECK-NEXT: vmov r1, s0
782760
; CHECK-NEXT: vmsr p0, r0
783761
; CHECK-NEXT: vpst
784762
; CHECK-NEXT: vmaxnmavt.f16 r1, q1
785763
; CHECK-NEXT: vmov s0, r1
786-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
787-
; CHECK-NEXT: ldrh.w r0, [sp, #2]
764+
; CHECK-NEXT: vmov.f16 r0, s0
788765
; CHECK-NEXT: vmov s0, r0
789-
; CHECK-NEXT: add sp, #4
790766
; CHECK-NEXT: bx lr
791767
entry:
792768
%0 = bitcast float %a.coerce to i32

llvm/test/CodeGen/Thumb2/mve-vdup.ll

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -127,15 +127,11 @@ entry:
127127
define arm_aapcs_vfpcc <8 x half> @vdup_f16_bc(half* %src1, half* %src2) {
128128
; CHECK-LABEL: vdup_f16_bc:
129129
; CHECK: @ %bb.0: @ %entry
130-
; CHECK-NEXT: .pad #4
131-
; CHECK-NEXT: sub sp, #4
132130
; CHECK-NEXT: vldr.16 s0, [r1]
133131
; CHECK-NEXT: vldr.16 s2, [r0]
134132
; CHECK-NEXT: vadd.f16 s0, s2, s0
135-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
136-
; CHECK-NEXT: ldrh.w r0, [sp, #2]
133+
; CHECK-NEXT: vmov.f16 r0, s0
137134
; CHECK-NEXT: vdup.16 q0, r0
138-
; CHECK-NEXT: add sp, #4
139135
; CHECK-NEXT: bx lr
140136
entry:
141137
%0 = load half, half *%src1, align 2
@@ -260,16 +256,12 @@ entry:
260256
define arm_aapcs_vfpcc half @vdup_f16_extract(half* %src1, half* %src2) {
261257
; CHECK-LABEL: vdup_f16_extract:
262258
; CHECK: @ %bb.0: @ %entry
263-
; CHECK-NEXT: .pad #4
264-
; CHECK-NEXT: sub sp, #4
265259
; CHECK-NEXT: vldr.16 s0, [r2]
266260
; CHECK-NEXT: vldr.16 s2, [r1]
267261
; CHECK-NEXT: vadd.f16 s0, s2, s0
268-
; CHECK-NEXT: vstr.16 s0, [sp, #2]
269-
; CHECK-NEXT: ldrh.w r1, [sp, #2]
262+
; CHECK-NEXT: vmov.f16 r1, s0
270263
; CHECK-NEXT: vdup.16 q0, r1
271264
; CHECK-NEXT: vstr.16 s1, [r0]
272-
; CHECK-NEXT: add sp, #4
273265
; CHECK-NEXT: bx lr
274266
entry:
275267
%0 = load half, half *%src1, align 2

0 commit comments

Comments
 (0)