Commit c6e5531

[X86][AVX] Combine shuffles to TRUNCATE/VTRUNC patterns
Add support for combining shuffles into AVX512 truncation instructions - another step toward fixing D56387/D66004. It also fixes the SKX codegen for PR31443. We could probably extend this further to handle non-VLX truncation cases.
1 parent 078c863 commit c6e5531

17 files changed: +708 -462 lines
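For orientation, a minimal IR sketch (ours, not part of the patch) of the pattern this combine targets: a shuffle that keeps every other byte of its input and zeroes the upper half is now recognized as a truncate, so a VLX+BWI target such as SKX can select vpmovwb instead of a vpshufb with a constant-pool mask.

; Hypothetical example, assuming -mattr=+avx512vl,+avx512bw:
define <16 x i8> @shuffle_as_vpmovwb(<16 x i8> %x) {
  ; Lower 8 bytes = even bytes of %x (a <8 x i16> -> <8 x i8> truncate);
  ; upper 8 bytes = zero (indices >= 16 select from zeroinitializer).
  %r = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer,
         <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
                     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %r
}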

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 94 additions & 3 deletions
@@ -7498,6 +7498,20 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     createPackShuffleMask(VT, Mask, IsUnary);
     return true;
   }
+  case ISD::TRUNCATE:
+  case X86ISD::VTRUNC: {
+    SDValue Src = N.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+    unsigned NumSrcElts = SrcVT.getVectorNumElements();
+    unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
+    unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
+    assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
+    for (unsigned i = 0; i != NumSrcElts; ++i)
+      Mask.push_back(i * Scale);
+    Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
+    Ops.push_back(Src);
+    return true;
+  }
   case X86ISD::VSHLI:
   case X86ISD::VSRLI: {
     uint64_t ShiftVal = N.getConstantOperandVal(1);
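The new case models a truncate as a faux shuffle that takes every Scale-th element of the source and zero-fills any remainder. Worked example (ours): a trunc from <4 x i32> to <4 x i8> gives Scale = 32/8 = 4 and the byte mask [0,4,8,12]; for an X86ISD::VTRUNC whose 128-bit result has more elements than the source (e.g. vpmovdb's v4i32 -> v16i8), the tail is padded with SM_SentinelZero entries.

; Hypothetical example: the shuffle combiner can now see through this trunc
; as the faux byte mask [0,4,8,12] over its <4 x i32> source.
define <4 x i8> @faux_trunc_mask(<4 x i32> %v) {
  %t = trunc <4 x i32> %v to <4 x i8>
  ret <4 x i8> %t
}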
@@ -11062,6 +11076,45 @@ static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
   return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
 }
 
+// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
+// source into the lower elements and zeroing the upper elements.
+// TODO: Merge with matchShuffleAsVPMOV.
+static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
+                                 ArrayRef<int> Mask, const APInt &Zeroable,
+                                 const X86Subtarget &Subtarget) {
+  if (!VT.is512BitVector() && !Subtarget.hasVLX())
+    return false;
+
+  unsigned NumElts = Mask.size();
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  unsigned MaxScale = 64 / EltSizeInBits;
+
+  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+    unsigned SrcEltBits = EltSizeInBits * Scale;
+    if (SrcEltBits < 32 && !Subtarget.hasBWI())
+      continue;
+    unsigned NumSrcElts = NumElts / Scale;
+    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
+      continue;
+    unsigned UpperElts = NumElts - NumSrcElts;
+    if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+      continue;
+    SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
+    SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
+    DstVT = MVT::getIntegerVT(EltSizeInBits);
+    if ((NumSrcElts * EltSizeInBits) >= 128) {
+      // ISD::TRUNCATE
+      DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
+    } else {
+      // X86ISD::VTRUNC
+      DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
+    }
+    return true;
+  }
+
+  return false;
+}
+
 static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                 int Delta) {
   int Size = (int)Mask.size();
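Worked example (ours) for the matcher above: VT = v16i8 with mask [0,4,8,12] and the top 12 elements zeroable. Scale = 2 is rejected (mask[1] = 4, not 2, and 16-bit source elements would also require BWI); Scale = 4 matches, giving SrcVT = v4i32, and since NumSrcElts * EltSizeInBits = 32 < 128 the destination becomes the full 128-bit v16i8 and X86ISD::VTRUNC (vpmovdb) is used rather than ISD::TRUNCATE.

; Hypothetical example, assuming an AVX512VL target; matches with Scale = 4.
define <16 x i8> @match_vtrunc_dwords(<4 x i32> %v) {
  %b = bitcast <4 x i32> %v to <16 x i8>
  %r = shufflevector <16 x i8> %b, <16 x i8> zeroinitializer,
         <16 x i32> <i32 0, i32 4, i32 8, i32 12,
                     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
                     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %r
}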
@@ -33192,11 +33245,12 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
     return VTBits;
 
   case X86ISD::VTRUNC: {
-    // TODO: Add DemandedElts support.
     SDValue Src = Op.getOperand(0);
-    unsigned NumSrcBits = Src.getScalarValueSizeInBits();
+    MVT SrcVT = Src.getSimpleValueType();
+    unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
     assert(VTBits < NumSrcBits && "Illegal truncation input type");
-    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+    APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
     if (Tmp > (NumSrcBits - VTBits))
       return Tmp - (NumSrcBits - VTBits);
     return 1;
@@ -34094,6 +34148,43 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     }
   }
 
+  // Match shuffle against TRUNCATE patterns.
+  if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
+    // Match against a VTRUNC instruction, accounting for src/dst sizes.
+    if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
+                             Subtarget)) {
+      bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
+                        ShuffleSrcVT.getVectorNumElements();
+      unsigned Opc = IsTRUNCATE ? ISD::TRUNCATE : X86ISD::VTRUNC;
+      if (Depth == 0 && Root.getOpcode() == Opc)
+        return SDValue(); // Nothing to do!
+      V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+      Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
+      if (ShuffleVT.getSizeInBits() < RootSizeInBits)
+        Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
+      return DAG.getBitcast(RootVT, Res);
+    }
+
+    // Do we need a more general binary truncation pattern?
+    if (RootSizeInBits < 512 &&
+        ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
+         (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
+        (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
+        isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
+      if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
+        return SDValue(); // Nothing to do!
+      ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+      ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
+      V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+      V2 = DAG.getBitcast(ShuffleSrcVT, V2);
+      ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+      ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
+      Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
+      Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
+      return DAG.getBitcast(RootVT, Res);
+    }
+  }
+
   // Don't try to re-form single instruction chains under any circumstances now
   // that we've done encoding canonicalization for them.
   if (Depth < 1)
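The binary case covers two-input shuffles whose mask is a single stride-2 sequence across the concatenation of both operands; the chain is rebuilt as concat_vectors followed by one truncate. A sketch (ours) of a shuffle that should qualify, assuming a 128-bit root with VLX and, for byte elements, BWI:

; Hypothetical example: even bytes of %a then even bytes of %b, i.e. mask
; 0,2,4,...,30 over the 32-byte concatenation. Rebuilt as
; trunc(concat(%a, %b) : <16 x i16>) -> <16 x i8>, a single vpmovwb ymm->xmm.
define <16 x i8> @binary_trunc_shuffle(<16 x i8> %a, <16 x i8> %b) {
  %r = shufflevector <16 x i8> %a, <16 x i8> %b,
         <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
                     i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ret <16 x i8> %r
}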

llvm/test/CodeGen/X86/avx512-trunc.ll

Lines changed: 27 additions & 12 deletions
@@ -91,10 +91,15 @@ define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
 }
 
 define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 {
-; ALL-LABEL: trunc_qb_128:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; ALL-NEXT:    retq
+; KNL-LABEL: trunc_qb_128:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qb_128:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqb %xmm0, %xmm0
+; SKX-NEXT:    retq
   %x = trunc <2 x i64> %i to <2 x i8>
   ret <2 x i8> %x
 }
@@ -344,10 +349,15 @@ define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
 }
 
 define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 {
-; ALL-LABEL: trunc_db_128:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; ALL-NEXT:    retq
+; KNL-LABEL: trunc_db_128:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_db_128:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovdb %xmm0, %xmm0
+; SKX-NEXT:    retq
   %x = trunc <4 x i32> %i to <4 x i8>
   ret <4 x i8> %x
 }
@@ -537,10 +547,15 @@ define <16 x i8> @trunc_wb_256_mem_and_ret(<16 x i16> %i, <16 x i8>* %res) #0 {
 }
 
 define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 {
-; ALL-LABEL: trunc_wb_128:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; ALL-NEXT:    retq
+; KNL-LABEL: trunc_wb_128:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_wb_128:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovwb %xmm0, %xmm0
+; SKX-NEXT:    retq
   %x = trunc <8 x i16> %i to <8 x i8>
   ret <8 x i8> %x
 }

llvm/test/CodeGen/X86/avx512-vec-cmp.ll

Lines changed: 1 addition & 3 deletions
@@ -1554,9 +1554,7 @@ define <8 x i64> @cmp_swap_bug(<16 x i8>* %x, <8 x i64> %y, <8 x i64> %z) {
 ; SKX-LABEL: cmp_swap_bug:
 ; SKX:       ## %bb.0: ## %entry
 ; SKX-NEXT:    vmovdqa (%rdi), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x17]
-; SKX-NEXT:    vpshufb {{.*}}(%rip), %xmm2, %xmm2 ## EVEX TO VEX Compression xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SKX-NEXT:    ## encoding: [0xc4,0xe2,0x69,0x00,0x15,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 5, value: LCPI69_0-4, kind: reloc_riprel_4byte
+; SKX-NEXT:    vpmovwb %xmm2, %xmm2 ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0xd2]
 ; SKX-NEXT:    vpmovb2m %xmm2, %k1 ## encoding: [0x62,0xf2,0x7e,0x08,0x29,0xca]
 ; SKX-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x64,0xc0]
 ; SKX-NEXT:    retq ## encoding: [0xc3]

llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll

Lines changed: 1 addition & 1 deletion
@@ -736,7 +736,7 @@ define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) {
 define <2 x i64> @test_mm_cvtepi16_epi8(<2 x i64> %__A) {
 ; CHECK-LABEL: test_mm_cvtepi16_epi8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpmovwb %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %0 = bitcast <2 x i64> %__A to <8 x i16>

llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll

Lines changed: 5 additions & 5 deletions
@@ -3236,7 +3236,7 @@ entry:
 define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
 ; CHECK-LABEL: test_mm_cvtepi32_epi8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpmovdb %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %0 = bitcast <2 x i64> %__A to <4 x i32>
@@ -3249,7 +3249,7 @@ entry:
 define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
 ; CHECK-LABEL: test_mm_cvtepi32_epi16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpmovdw %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %0 = bitcast <2 x i64> %__A to <4 x i32>
@@ -3262,7 +3262,7 @@ entry:
 define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
 ; CHECK-LABEL: test_mm_cvtepi64_epi8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpmovqb %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %conv.i = trunc <2 x i64> %__A to <2 x i8>
@@ -3274,7 +3274,7 @@ entry:
 define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
 ; CHECK-LABEL: test_mm_cvtepi64_epi16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpmovqw %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %conv.i = trunc <2 x i64> %__A to <2 x i16>
@@ -3286,7 +3286,7 @@ entry:
 define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
 ; CHECK-LABEL: test_mm_cvtepi64_epi32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; CHECK-NEXT:    vpmovqd %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %conv.i = trunc <2 x i64> %__A to <2 x i32>

llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll

Lines changed: 81 additions & 18 deletions
@@ -37,12 +37,33 @@ define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpmovwb %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %L
   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -286,12 +307,33 @@ define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX-NEXT:    vmovd %xmm0, (%rsi)
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpmovdb %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpmovdb %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %L
   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -503,12 +545,33 @@ define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
 ; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpmovqb %xmm0, %xmm0
+; AVX512VL-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpmovqb %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %L
   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
   store <2 x i8> %strided.vec, <2 x i8>* %S
