
Commit d9e9479

Merging r367412 and r367429:

------------------------------------------------------------------------
r367412 | rksimon | 2019-07-31 13:35:01 +0200 (Wed, 31 Jul 2019) | 1 line

[X86][AVX] Add reduced test case for PR42833

------------------------------------------------------------------------
------------------------------------------------------------------------
r367429 | rksimon | 2019-07-31 14:55:39 +0200 (Wed, 31 Jul 2019) | 3 lines

[X86][AVX] Ensure chained subvector insertions are the same size (PR42833)

Before combining insert_subvector(insert_subvector(vec, sub0, c0), sub1, c1) patterns, ensure that the subvectors are all the same type. On AVX512 targets especially we might have a mixture of 128/256 subvector insertions.

------------------------------------------------------------------------

llvm-svn: 369362
1 parent 90dc09f commit d9e9479
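
For context, the combine guarded here walks a chain of the form insert_subvector(insert_subvector(vec, sub0, c0), sub1, c1) and now requires sub0 and sub1 to be the same size before treating the chain as a concatenation. A minimal C++ sketch of that check, using a hypothetical Node struct in place of the real SDValue/SelectionDAG API, might look like:

// Hypothetical, simplified stand-ins for SDValue/EVT; the real code in
// X86ISelLowering.cpp queries SelectionDAG nodes via getOpcode(),
// getOperand() and getValueType()/getValueSizeInBits().
struct Node {
  unsigned Opcode = 0;        // e.g. INSERT_SUBVECTOR
  const Node *Vec = nullptr;  // operand 0: the vector being inserted into
  unsigned SubSizeInBits = 0; // size of the inserted subvector (operand 1)
  unsigned InsertIdx = 0;     // operand 2: insertion index
};

constexpr unsigned INSERT_SUBVECTOR = 1;

// Mirrors the extra guard this commit adds (PR42833): only treat
//   insert_subvector(insert_subvector(vec, sub0, 0), sub1, idx)
// as a concatenation candidate when sub0 and sub1 have the same size,
// rejecting e.g. a 128-bit insert chained under a 256-bit insert on AVX512.
bool chainedInsertsAreSameSize(const Node &Outer) {
  if (Outer.Opcode != INSERT_SUBVECTOR)
    return false;
  const Node *Inner = Outer.Vec;
  if (!Inner || Inner->Opcode != INSERT_SUBVECTOR || Inner->InsertIdx != 0)
    return false;
  return Inner->SubSizeInBits == Outer.SubSizeInBits; // the new condition
}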

File tree

2 files changed: +238 −0 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 0 deletions
@@ -5505,6 +5505,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
     if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
         Idx == (VT.getVectorNumElements() / 2) &&
         Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        Src.getOperand(1).getValueType() == SubVT &&
         isNullConstant(Src.getOperand(2))) {
       Ops.push_back(Src.getOperand(1));
       Ops.push_back(Sub);
@@ -43840,6 +43841,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
       OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
       isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
+      Vec.getOperand(1).getValueSizeInBits() == SubVecVT.getSizeInBits() &&
       Vec.hasOneUse()) {
     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
                       Vec.getOperand(1), Vec.getOperand(2));

llvm/test/CodeGen/X86/oddsubvector.ll

Lines changed: 236 additions & 0 deletions
@@ -190,3 +190,239 @@ define <16 x i32> @PR42819(<8 x i32>* %a0) {
   %3 = shufflevector <16 x i32> zeroinitializer, <16 x i32> %2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
   ret <16 x i32> %3
 }
+
+@b = dso_local local_unnamed_addr global i32 0, align 4
+@c = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16
+@d = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16
+
+define void @PR42833() {
+; SSE2-LABEL: PR42833:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa c+{{.*}}(%rip), %xmm1
+; SSE2-NEXT: movdqa c+{{.*}}(%rip), %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: addl {{.*}}(%rip), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movaps {{.*#+}} xmm3 = <u,1,1,1>
+; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: pslld $23, %xmm3
+; SSE2-NEXT: paddd {{.*}}(%rip), %xmm3
+; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pmuludq %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3]
+; SSE2-NEXT: movdqa d+{{.*}}(%rip), %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, c+{{.*}}(%rip)
+; SSE2-NEXT: movaps %xmm5, c+{{.*}}(%rip)
+; SSE2-NEXT: movdqa c+{{.*}}(%rip), %xmm1
+; SSE2-NEXT: movdqa c+{{.*}}(%rip), %xmm4
+; SSE2-NEXT: movdqa d+{{.*}}(%rip), %xmm5
+; SSE2-NEXT: movdqa d+{{.*}}(%rip), %xmm6
+; SSE2-NEXT: movdqa d+{{.*}}(%rip), %xmm7
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT: psubd %xmm0, %xmm7
+; SSE2-NEXT: psubd %xmm4, %xmm6
+; SSE2-NEXT: psubd %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm5, d+{{.*}}(%rip)
+; SSE2-NEXT: movdqa %xmm6, d+{{.*}}(%rip)
+; SSE2-NEXT: movdqa %xmm3, d+{{.*}}(%rip)
+; SSE2-NEXT: movdqa %xmm7, d+{{.*}}(%rip)
+; SSE2-NEXT: paddd %xmm4, %xmm4
+; SSE2-NEXT: paddd %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, c+{{.*}}(%rip)
+; SSE2-NEXT: movdqa %xmm4, c+{{.*}}(%rip)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: PR42833:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa c+{{.*}}(%rip), %xmm1
+; SSE42-NEXT: movdqa c+{{.*}}(%rip), %xmm0
+; SSE42-NEXT: movd %xmm0, %eax
+; SSE42-NEXT: addl {{.*}}(%rip), %eax
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = <u,1,1,1>
+; SSE42-NEXT: pinsrd $0, %eax, %xmm2
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: paddd %xmm2, %xmm3
+; SSE42-NEXT: pslld $23, %xmm2
+; SSE42-NEXT: paddd {{.*}}(%rip), %xmm2
+; SSE42-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE42-NEXT: pmulld %xmm0, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT: movdqa d+{{.*}}(%rip), %xmm3
+; SSE42-NEXT: psubd %xmm1, %xmm3
+; SSE42-NEXT: paddd %xmm1, %xmm1
+; SSE42-NEXT: movdqa %xmm1, c+{{.*}}(%rip)
+; SSE42-NEXT: movdqa %xmm2, c+{{.*}}(%rip)
+; SSE42-NEXT: movdqa c+{{.*}}(%rip), %xmm1
+; SSE42-NEXT: movdqa c+{{.*}}(%rip), %xmm2
+; SSE42-NEXT: movdqa d+{{.*}}(%rip), %xmm4
+; SSE42-NEXT: movdqa d+{{.*}}(%rip), %xmm5
+; SSE42-NEXT: movdqa d+{{.*}}(%rip), %xmm6
+; SSE42-NEXT: pinsrd $0, %eax, %xmm0
+; SSE42-NEXT: psubd %xmm0, %xmm6
+; SSE42-NEXT: psubd %xmm2, %xmm5
+; SSE42-NEXT: psubd %xmm1, %xmm4
+; SSE42-NEXT: movdqa %xmm4, d+{{.*}}(%rip)
+; SSE42-NEXT: movdqa %xmm5, d+{{.*}}(%rip)
+; SSE42-NEXT: movdqa %xmm3, d+{{.*}}(%rip)
+; SSE42-NEXT: movdqa %xmm6, d+{{.*}}(%rip)
+; SSE42-NEXT: paddd %xmm2, %xmm2
+; SSE42-NEXT: paddd %xmm1, %xmm1
+; SSE42-NEXT: movdqa %xmm1, c+{{.*}}(%rip)
+; SSE42-NEXT: movdqa %xmm2, c+{{.*}}(%rip)
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: PR42833:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa c+{{.*}}(%rip), %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: addl {{.*}}(%rip), %eax
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <u,1,1,1>
+; AVX1-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa c+{{.*}}(%rip), %xmm3
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpslld $1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovdqa d+{{.*}}(%rip), %xmm2
+; AVX1-NEXT: vpsubd c+{{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vmovups %ymm1, c+{{.*}}(%rip)
+; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa d+{{.*}}(%rip), %xmm1
+; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa d+{{.*}}(%rip), %xmm1
+; AVX1-NEXT: vmovdqa c+{{.*}}(%rip), %xmm3
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa d+{{.*}}(%rip), %xmm4
+; AVX1-NEXT: vmovdqa c+{{.*}}(%rip), %xmm5
+; AVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa %xmm2, d+{{.*}}(%rip)
+; AVX1-NEXT: vmovdqa %xmm4, d+{{.*}}(%rip)
+; AVX1-NEXT: vmovdqa %xmm1, d+{{.*}}(%rip)
+; AVX1-NEXT: vmovdqa %xmm0, d+{{.*}}(%rip)
+; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm0
+; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, c+{{.*}}(%rip)
+; AVX1-NEXT: vmovdqa %xmm0, c+{{.*}}(%rip)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR42833:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl {{.*}}(%rip), %eax
+; AVX2-NEXT: vmovdqu c+{{.*}}(%rip), %ymm0
+; AVX2-NEXT: addl c+{{.*}}(%rip), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],mem[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm2, c+{{.*}}(%rip)
+; AVX2-NEXT: vmovdqu c+{{.*}}(%rip), %ymm2
+; AVX2-NEXT: vmovdqu d+{{.*}}(%rip), %ymm3
+; AVX2-NEXT: vmovdqu d+{{.*}}(%rip), %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpsubd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpsubd %ymm2, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, d+{{.*}}(%rip)
+; AVX2-NEXT: vmovdqu %ymm0, d+{{.*}}(%rip)
+; AVX2-NEXT: vpaddd %ymm2, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, c+{{.*}}(%rip)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: PR42833:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl {{.*}}(%rip), %eax
+; AVX512-NEXT: vmovdqu c+{{.*}}(%rip), %ymm0
+; AVX512-NEXT: vmovdqu64 c+{{.*}}(%rip), %zmm1
+; AVX512-NEXT: addl c+{{.*}}(%rip), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],mem[1,2,3,4,5,6,7]
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm3
+; AVX512-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa c+{{.*}}(%rip), %xmm2
+; AVX512-NEXT: vmovdqu %ymm0, c+{{.*}}(%rip)
+; AVX512-NEXT: vmovdqu c+{{.*}}(%rip), %ymm0
+; AVX512-NEXT: vmovdqu64 d+{{.*}}(%rip), %zmm3
+; AVX512-NEXT: vpinsrd $0, %eax, %xmm2, %xmm2
+; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1
+; AVX512-NEXT: vpsubd %zmm1, %zmm3, %zmm1
+; AVX512-NEXT: vmovdqu64 %zmm1, d+{{.*}}(%rip)
+; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqu %ymm0, c+{{.*}}(%rip)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; XOP-LABEL: PR42833:
+; XOP: # %bb.0:
+; XOP-NEXT: vmovdqa c+{{.*}}(%rip), %xmm0
+; XOP-NEXT: vmovd %xmm0, %eax
+; XOP-NEXT: addl {{.*}}(%rip), %eax
+; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = <u,1,1,1>
+; XOP-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
+; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; XOP-NEXT: vmovdqa c+{{.*}}(%rip), %xmm3
+; XOP-NEXT: vpshld %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpslld $1, %xmm3, %xmm3
+; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7]
+; XOP-NEXT: vmovdqa d+{{.*}}(%rip), %xmm2
+; XOP-NEXT: vpsubd c+{{.*}}(%rip), %xmm2, %xmm2
+; XOP-NEXT: vmovups %ymm1, c+{{.*}}(%rip)
+; XOP-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
+; XOP-NEXT: vmovdqa d+{{.*}}(%rip), %xmm1
+; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vmovdqa d+{{.*}}(%rip), %xmm1
+; XOP-NEXT: vmovdqa c+{{.*}}(%rip), %xmm3
+; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1
+; XOP-NEXT: vmovdqa d+{{.*}}(%rip), %xmm4
+; XOP-NEXT: vmovdqa c+{{.*}}(%rip), %xmm5
+; XOP-NEXT: vpsubd %xmm5, %xmm4, %xmm4
+; XOP-NEXT: vmovdqa %xmm2, d+{{.*}}(%rip)
+; XOP-NEXT: vmovdqa %xmm4, d+{{.*}}(%rip)
+; XOP-NEXT: vmovdqa %xmm1, d+{{.*}}(%rip)
+; XOP-NEXT: vmovdqa %xmm0, d+{{.*}}(%rip)
+; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm0
+; XOP-NEXT: vpaddd %xmm5, %xmm5, %xmm1
+; XOP-NEXT: vmovdqa %xmm1, c+{{.*}}(%rip)
+; XOP-NEXT: vmovdqa %xmm0, c+{{.*}}(%rip)
+; XOP-NEXT: vzeroupper
+; XOP-NEXT: retq
+  %1 = load i32, i32* @b, align 4
+  %2 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
+  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %4 = extractelement <8 x i32> %2, i32 0
+  %5 = add i32 %1, %4
+  %6 = insertelement <8 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %5, i32 0
+  %7 = add <8 x i32> %2, %6
+  %8 = shl <8 x i32> %2, %6
+  %9 = shufflevector <8 x i32> %7, <8 x i32> %8, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <8 x i32> %9, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
+  %10 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
+  %11 = shufflevector <8 x i32> %10, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %12 = load <16 x i32>, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
+  %13 = insertelement <16 x i32> %3, i32 %5, i32 0
+  %14 = shufflevector <16 x i32> %13, <16 x i32> %11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %15 = sub <16 x i32> %12, %14
+  store <16 x i32> %15, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
+  %16 = shl <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  store <8 x i32> %16, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
+  ret void
+}
