
Commit bbf4af3

[X86][SSE] Remove XFormVExtractWithShuffleIntoLoad to prevent legalization infinite loops (PR43971)
As detailed in PR43971/D70267, the use of XFormVExtractWithShuffleIntoLoad causes issues where we end up in infinite loops of extract(targetshuffle(vecload)) -> extract(shuffle(vecload)) -> extract(vecload) -> extract(targetshuffle(vecload)); there are simply too many legalization checks at every stage for us to guarantee that the extract(shuffle(vecload)) -> scalarload fold can occur. For now we see a number of minor regressions because we don't fold extract(shuffle(vecload)) -> scalarload before legal ops; these can be addressed in future patches and by extending X86ISelLowering's combineExtractWithShuffle.
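
The pattern at issue is easiest to see in IR form. Below is a minimal sketch of the kind of function that exposed the problem; it mirrors the @PR43971 regression test added in this commit to llvm/test/CodeGen/X86/extractelement-load.ll, with the function name changed for illustration:

define void @pr43971_sketch(<8 x float>* %a0, float* %a1) {
entry:
  ; Extract a high lane of a loaded 256-bit vector; lowering reaches that lane
  ; through a target shuffle, which is where the extract/shuffle/load combines
  ; used to keep re-triggering each other.
  %v = load <8 x float>, <8 x float>* %a0, align 32
  %vecext = extractelement <8 x float> %v, i32 6
  %cmp = fcmp ogt float %vecext, 0.000000e+00
  %x = load float, float* %a1, align 4
  %cond = select i1 %cmp, float %x, float %vecext
  store float %cond, float* %a1, align 4
  ret void
}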
1 parent 96d814a commit bbf4af3

File tree: 5 files changed, +83 −139 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 122 deletions

@@ -35126,123 +35126,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
       Op, DemandedBits, DemandedElts, DAG, Depth);
 }
 
-/// Check if a vector extract from a target-specific shuffle of a load can be
-/// folded into a single element load.
-/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
-/// shuffles have been custom lowered so we need to handle those here.
-static SDValue
-XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
-                                 TargetLowering::DAGCombinerInfo &DCI) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  SDValue InVec = N->getOperand(0);
-  SDValue EltNo = N->getOperand(1);
-  EVT EltVT = N->getValueType(0);
-
-  if (!isa<ConstantSDNode>(EltNo))
-    return SDValue();
-
-  EVT OriginalVT = InVec.getValueType();
-  unsigned NumOriginalElts = OriginalVT.getVectorNumElements();
-
-  // Peek through bitcasts, don't duplicate a load with other uses.
-  InVec = peekThroughOneUseBitcasts(InVec);
-
-  EVT CurrentVT = InVec.getValueType();
-  if (!CurrentVT.isVector())
-    return SDValue();
-
-  unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
-  if ((NumOriginalElts % NumCurrentElts) != 0)
-    return SDValue();
-
-  if (!isTargetShuffle(InVec.getOpcode()))
-    return SDValue();
-
-  // Don't duplicate a load with other uses.
-  if (!InVec.hasOneUse())
-    return SDValue();
-
-  SmallVector<int, 16> ShuffleMask;
-  SmallVector<SDValue, 2> ShuffleOps;
-  bool UnaryShuffle;
-  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
-                            ShuffleOps, ShuffleMask, UnaryShuffle))
-    return SDValue();
-
-  unsigned Scale = NumOriginalElts / NumCurrentElts;
-  if (Scale > 1) {
-    SmallVector<int, 16> ScaledMask;
-    scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask);
-    ShuffleMask = std::move(ScaledMask);
-  }
-  assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");
-
-  // Select the input vector, guarding against out of range extract vector.
-  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
-  int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];
-
-  if (Idx == SM_SentinelZero)
-    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
-                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
-  if (Idx == SM_SentinelUndef)
-    return DAG.getUNDEF(EltVT);
-
-  // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
-  // won't handle it.
-  if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
-    return SDValue();
-
-  assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
-         "Shuffle index out of range");
-  SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];
-
-  // If inputs to shuffle are the same for both ops, then allow 2 uses
-  unsigned AllowedUses =
-      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
-
-  if (LdNode.getOpcode() == ISD::BITCAST) {
-    // Don't duplicate a load with other uses.
-    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
-      return SDValue();
-
-    AllowedUses = 1; // only allow 1 load use if we have a bitcast
-    LdNode = LdNode.getOperand(0);
-  }
-
-  if (!ISD::isNormalLoad(LdNode.getNode()))
-    return SDValue();
-
-  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
-
-  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
-    return SDValue();
-
-  // If there's a bitcast before the shuffle, check if the load type and
-  // alignment is valid.
-  unsigned Align = LN0->getAlignment();
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
-      EltVT.getTypeForEVT(*DAG.getContext()));
-
-  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
-    return SDValue();
-
-  // All checks match so transform back to vector_shuffle so that DAG combiner
-  // can finish the job
-  SDLoc dl(N);
-
-  // Create shuffle node taking into account the case that its a unary shuffle
-  SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
-                                 : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
-  Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
-                                 DAG.getBitcast(OriginalVT, ShuffleOps[0]),
-                                 Shuffle, ShuffleMask);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
-                     EltNo);
-}
-
 // Helper to peek through bitops/setcc to determine size of source vector.
 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
@@ -36496,14 +36379,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
     }
 
     // TODO - Remove this once we can handle the implicit zero-extension of
-    // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,
-    // combineHorizontalPredicateResult and combineBasicSADPattern.
+    // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
+    // combineBasicSADPattern.
     return SDValue();
   }
 
-  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
-    return NewOp;
-
   // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
       VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
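
For context, the removed routine tried to fold an extract of a target shuffle of a one-use vector load into a single scalar element load (by rebuilding a generic VECTOR_SHUFFLE so DAGCombiner could finish the job). A rough IR-level sketch of that intent, with illustrative types and lane numbers rather than the exact DAG-level form:

  ; Before (conceptual): extract lane 0 of a shuffle that selects lane 3 of a
  ; one-use vector load.
  %v = load <4 x float>, <4 x float>* %p, align 16
  %s = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 3>
  %e = extractelement <4 x float> %s, i32 0

  ; After (conceptual): load only the selected element.
  %q = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 3
  %e = load float, float* %q, align 4

With the routine gone, this fold has to happen before legal ops (in DAGCombiner or combineExtractWithShuffle), which is why several tests below temporarily pick up a vector load plus shuffle instead of a scalar load.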

llvm/test/CodeGen/X86/2011-05-09-loaduse.ll

Lines changed: 10 additions & 4 deletions

@@ -5,15 +5,21 @@
 define float @test(<4 x float>* %A) nounwind {
 ; X86-LABEL: test:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    flds 12(%eax)
-; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    movaps (%eax), %xmm0
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X86-NEXT:    xorps %xmm1, %xmm1
+; X86-NEXT:    movaps %xmm1, (%eax)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    movaps %xmm1, (%rdi)
 ; X64-NEXT:    retq

llvm/test/CodeGen/X86/extractelement-load.ll

Lines changed: 62 additions & 7 deletions

@@ -9,13 +9,20 @@ define i32 @t(<2 x i64>* %val) nounwind {
 ; X32-SSE2-LABEL: t:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    movl 8(%eax), %eax
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
+; X32-SSE2-NEXT:    movd %xmm0, %eax
 ; X32-SSE2-NEXT:    retl
 ;
-; X64-LABEL: t:
-; X64:       # %bb.0:
-; X64-NEXT:    movl 8(%rdi), %eax
-; X64-NEXT:    retq
+; X64-SSSE3-LABEL: t:
+; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
+; X64-SSSE3-NEXT:    movd %xmm0, %eax
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX-LABEL: t:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    movl 8(%rdi), %eax
+; X64-AVX-NEXT:    retq
   %tmp2 = load <2 x i64>, <2 x i64>* %val, align 16 ; <<2 x i64>> [#uses=1]
   %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32> ; <<4 x i32>> [#uses=1]
   %tmp4 = extractelement <4 x i32> %tmp3, i32 2 ; <i32> [#uses=1]
@@ -127,7 +134,8 @@ define float @t6(<8 x float> *%a0) {
 ; X32-SSE2-NEXT:    pushl %eax
 ; X32-SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    movaps (%eax), %xmm0
+; X32-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X32-SSE2-NEXT:    xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
 ; X32-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -142,7 +150,7 @@ define float @t6(<8 x float> *%a0) {
 ;
 ; X64-SSSE3-LABEL: t6:
 ; X64-SSSE3:       # %bb.0:
-; X64-SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
 ; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
 ; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -165,3 +173,50 @@ define float @t6(<8 x float> *%a0) {
   %cond = select i1 %cmp, float 1.000000e+00, float %vecext
   ret float %cond
 }
+
+define void @PR43971(<8 x float> *%a0, float *%a1) {
+; X32-SSE2-LABEL: PR43971:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT:    movaps 16(%ecx), %xmm0
+; X32-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-SSE2-NEXT:    xorps %xmm1, %xmm1
+; X32-SSE2-NEXT:    cmpltss %xmm0, %xmm1
+; X32-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    andps %xmm1, %xmm2
+; X32-SSE2-NEXT:    andnps %xmm0, %xmm1
+; X32-SSE2-NEXT:    orps %xmm2, %xmm1
+; X32-SSE2-NEXT:    movss %xmm1, (%eax)
+; X32-SSE2-NEXT:    retl
+;
+; X64-SSSE3-LABEL: PR43971:
+; X64-SSSE3:       # %bb.0: # %entry
+; X64-SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT:    xorps %xmm1, %xmm1
+; X64-SSSE3-NEXT:    cmpltss %xmm0, %xmm1
+; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT:    andps %xmm1, %xmm2
+; X64-SSSE3-NEXT:    andnps %xmm0, %xmm1
+; X64-SSSE3-NEXT:    orps %xmm2, %xmm1
+; X64-SSSE3-NEXT:    movss %xmm1, (%rsi)
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX-LABEL: PR43971:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1
+; X64-AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovss %xmm0, (%rsi)
+; X64-AVX-NEXT:    retq
+entry:
+  %0 = load <8 x float>, <8 x float>* %a0, align 32
+  %vecext = extractelement <8 x float> %0, i32 6
+  %cmp = fcmp ogt float %vecext, 0.000000e+00
+  %1 = load float, float* %a1, align 4
+  %cond = select i1 %cmp, float %1, float %vecext
+  store float %cond, float* %a1, align 4
+  ret void
+}

llvm/test/CodeGen/X86/insertps-combine.ll

Lines changed: 5 additions & 4 deletions

@@ -269,12 +269,12 @@ define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
 define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
 ; SSE-LABEL: extract_lane_insertps_5123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movshdup {{.*#+}} xmm0 = mem[1,1,3,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_5123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = mem[1,1,3,3]
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, <4 x float> *%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 64)
@@ -285,12 +285,13 @@ define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
 define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) {
 ; SSE-LABEL: extract_lane_insertps_6123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_6123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, <4 x float> *%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)

llvm/test/CodeGen/X86/vec_extract.ll

Lines changed: 4 additions & 2 deletions

@@ -57,13 +57,15 @@ define void @test3(float* %R, <4 x float>* %P1) nounwind {
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    movaps (%ecx), %xmm0
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; X32-NEXT:    movss %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movaps (%rsi), %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; X64-NEXT:    movss %xmm0, (%rdi)
 ; X64-NEXT:    retq
 entry:
