Skip to content

Commit 5cbaa56

Browse files
committed
Merging r370592:
------------------------------------------------------------------------
r370592 | rksimon | 2019-08-31 18:21:31 +0200 (Sat, 31 Aug 2019) | 3 lines

[X86] EltsFromConsecutiveLoads - Don't confuse elt count with vector element count (PR43170)

EltsFromConsecutiveLoads was assuming that the number of input elts was the same as the number of elements in the output vector type when creating a zeroing shuffle, causing an assert when subvectors were being combined instead of just scalars.
------------------------------------------------------------------------
llvm-svn: 371382
1 parent b508b4b commit 5cbaa56

File tree

2 files changed

+54
-11
lines changed

2 files changed

+54
-11
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7650,17 +7650,22 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
76507650
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
76517651
// vector and a zero vector to clear out the zero elements.
76527652
if (!isAfterLegalize && VT.isVector()) {
7653-
SmallVector<int, 4> ClearMask(NumElems, -1);
7654-
for (unsigned i = 0; i < NumElems; ++i) {
7655-
if (ZeroMask[i])
7656-
ClearMask[i] = i + NumElems;
7657-
else if (LoadMask[i])
7658-
ClearMask[i] = i;
7659-
}
7660-
SDValue V = CreateLoad(VT, LDBase);
7661-
SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7662-
: DAG.getConstantFP(0.0, DL, VT);
7663-
return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7653+
unsigned NumMaskElts = VT.getVectorNumElements();
7654+
if ((NumMaskElts % NumElems) == 0) {
7655+
unsigned Scale = NumMaskElts / NumElems;
7656+
SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7657+
for (unsigned i = 0; i < NumElems; ++i) {
7658+
if (UndefMask[i])
7659+
continue;
7660+
int Offset = ZeroMask[i] ? NumMaskElts : 0;
7661+
for (unsigned j = 0; j != Scale; ++j)
7662+
ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7663+
}
7664+
SDValue V = CreateLoad(VT, LDBase);
7665+
SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7666+
: DAG.getConstantFP(0.0, DL, VT);
7667+
return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7668+
}
76647669
}
76657670
}
76667671

llvm/test/CodeGen/X86/vector-shuffle-avx512.ll

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -936,3 +936,41 @@ define <16 x float> @test_masked_permps_v16f32(<16 x float>* %vp, <16 x float> %
936936
%res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec2
937937
ret <16 x float> %res
938938
}
939+
940+
%union1= type { <16 x float> }
941+
@src1 = external dso_local local_unnamed_addr global %union1, align 64
942+
943+
define void @PR43170(<16 x float>* %a0) {
944+
; SKX64-LABEL: PR43170:
945+
; SKX64: # %bb.0: # %entry
946+
; SKX64-NEXT: vmovaps {{.*}}(%rip), %ymm0
947+
; SKX64-NEXT: vmovaps %zmm0, (%rdi)
948+
; SKX64-NEXT: vzeroupper
949+
; SKX64-NEXT: retq
950+
;
951+
; KNL64-LABEL: PR43170:
952+
; KNL64: # %bb.0: # %entry
953+
; KNL64-NEXT: vmovaps {{.*}}(%rip), %ymm0
954+
; KNL64-NEXT: vmovaps %zmm0, (%rdi)
955+
; KNL64-NEXT: retq
956+
;
957+
; SKX32-LABEL: PR43170:
958+
; SKX32: # %bb.0: # %entry
959+
; SKX32-NEXT: movl {{[0-9]+}}(%esp), %eax
960+
; SKX32-NEXT: vmovaps src1, %ymm0
961+
; SKX32-NEXT: vmovaps %zmm0, (%eax)
962+
; SKX32-NEXT: vzeroupper
963+
; SKX32-NEXT: retl
964+
;
965+
; KNL32-LABEL: PR43170:
966+
; KNL32: # %bb.0: # %entry
967+
; KNL32-NEXT: movl {{[0-9]+}}(%esp), %eax
968+
; KNL32-NEXT: vmovaps src1, %ymm0
969+
; KNL32-NEXT: vmovaps %zmm0, (%eax)
970+
; KNL32-NEXT: retl
971+
entry:
972+
%0 = load <8 x float>, <8 x float>* bitcast (%union1* @src1 to <8 x float>*), align 64
973+
%1 = shufflevector <8 x float> %0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
974+
store <16 x float> %1, <16 x float>* %a0, align 64
975+
ret void
976+
}

0 commit comments

Comments
 (0)