Skip to content

Commit d9e174d

Browse files
committed
[X86][SSE] getFauxShuffleMask - account for PEXTRW/PEXTRB implicit zero-extension
The insert(truncate/extend(extract(vec0,c0)),vec1,c1) case in rGacbc5ede99 wasn't combining MinBitsPerElt (the minimum element size) with the source vector's element size, which may be smaller due to the implicit zero-extension performed during extraction. Test reduced from a case provided by @mstorsjo.
1 parent 0f16d66 commit d9e174d

File tree

2 files changed

+93
-2
lines changed

2 files changed

+93
-2
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7469,8 +7469,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
74697469
Scl.getOpcode() == ISD::ANY_EXTEND ||
74707470
Scl.getOpcode() == ISD::ZERO_EXTEND) {
74717471
Scl = Scl.getOperand(0);
7472-
if (MinBitsPerElt > Scl.getScalarValueSizeInBits())
7473-
MinBitsPerElt = Scl.getScalarValueSizeInBits();
7472+
MinBitsPerElt =
7473+
std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
74747474
}
74757475
if ((MinBitsPerElt % 8) != 0)
74767476
return false;
@@ -7493,6 +7493,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
74937493
unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
74947494
unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
74957495
unsigned DstByte = DstIdx * NumBytesPerElt;
7496+
MinBitsPerElt =
7497+
std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
74967498

74977499
// Create 'identity' byte level shuffle mask and then add inserted bytes.
74987500
if (Opcode == ISD::SCALAR_TO_VECTOR) {

llvm/test/CodeGen/X86/vector-shuffle-combining.ll

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2966,6 +2966,95 @@ define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa
29662966
ret <8 x i16> %7
29672967
}
29682968

2969+
define <8 x i16> @shuffle_scalar_to_vector_extract(<8 x i8>* %p0, i8* %p1, i8* %p2) {
2970+
; SSE2-LABEL: shuffle_scalar_to_vector_extract:
2971+
; SSE2: # %bb.0:
2972+
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2973+
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2974+
; SSE2-NEXT: psraw $8, %xmm1
2975+
; SSE2-NEXT: pextrw $7, %xmm1, %eax
2976+
; SSE2-NEXT: movd %eax, %xmm2
2977+
; SSE2-NEXT: movsbl (%rsi), %eax
2978+
; SSE2-NEXT: movd %eax, %xmm0
2979+
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2980+
; SSE2-NEXT: movsbl (%rdx), %eax
2981+
; SSE2-NEXT: movd %eax, %xmm0
2982+
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2983+
; SSE2-NEXT: pxor %xmm0, %xmm0
2984+
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2985+
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
2986+
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2987+
; SSE2-NEXT: retq
2988+
;
2989+
; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
2990+
; SSSE3: # %bb.0:
2991+
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2992+
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2993+
; SSSE3-NEXT: psraw $8, %xmm1
2994+
; SSSE3-NEXT: pextrw $7, %xmm1, %eax
2995+
; SSSE3-NEXT: movd %eax, %xmm2
2996+
; SSSE3-NEXT: movsbl (%rsi), %eax
2997+
; SSSE3-NEXT: movd %eax, %xmm0
2998+
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2999+
; SSSE3-NEXT: movsbl (%rdx), %eax
3000+
; SSSE3-NEXT: movd %eax, %xmm0
3001+
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3002+
; SSSE3-NEXT: pxor %xmm0, %xmm0
3003+
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3004+
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3005+
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3006+
; SSSE3-NEXT: retq
3007+
;
3008+
; SSE41-LABEL: shuffle_scalar_to_vector_extract:
3009+
; SSE41: # %bb.0:
3010+
; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
3011+
; SSE41-NEXT: pextrw $4, %xmm0, %eax
3012+
; SSE41-NEXT: pextrw $7, %xmm0, %ecx
3013+
; SSE41-NEXT: pxor %xmm0, %xmm0
3014+
; SSE41-NEXT: pinsrw $1, %eax, %xmm0
3015+
; SSE41-NEXT: movl $65531, %eax # imm = 0xFFFB
3016+
; SSE41-NEXT: pinsrw $2, %eax, %xmm0
3017+
; SSE41-NEXT: pinsrw $4, %ecx, %xmm0
3018+
; SSE41-NEXT: movsbl (%rsi), %eax
3019+
; SSE41-NEXT: pinsrw $5, %eax, %xmm0
3020+
; SSE41-NEXT: movsbl (%rdx), %eax
3021+
; SSE41-NEXT: pinsrw $6, %eax, %xmm0
3022+
; SSE41-NEXT: retq
3023+
;
3024+
; AVX-LABEL: shuffle_scalar_to_vector_extract:
3025+
; AVX: # %bb.0:
3026+
; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
3027+
; AVX-NEXT: vpextrw $4, %xmm0, %eax
3028+
; AVX-NEXT: vpextrw $7, %xmm0, %ecx
3029+
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
3030+
; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
3031+
; AVX-NEXT: movl $65531, %eax # imm = 0xFFFB
3032+
; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
3033+
; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
3034+
; AVX-NEXT: movsbl (%rsi), %eax
3035+
; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
3036+
; AVX-NEXT: movsbl (%rdx), %eax
3037+
; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
3038+
; AVX-NEXT: retq
3039+
%tmp = load <8 x i8>, <8 x i8>* %p0, align 1
3040+
%tmp1 = sext <8 x i8> %tmp to <8 x i16>
3041+
%tmp2 = load i8, i8* %p1, align 1
3042+
%cvt1 = sext i8 %tmp2 to i16
3043+
%tmp3 = load i8, i8* %p2, align 1
3044+
%cvt2 = sext i8 %tmp3 to i16
3045+
%tmp4 = extractelement <8 x i16> %tmp1, i32 4
3046+
%tmp5 = extractelement <8 x i16> %tmp1, i32 7
3047+
%tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0
3048+
%tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1
3049+
%tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3
3050+
%tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4
3051+
%tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5
3052+
%tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6
3053+
%tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7
3054+
%tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
3055+
ret <8 x i16> %tmp13
3056+
}
3057+
29693058
define void @PR43024() {
29703059
; SSE2-LABEL: PR43024:
29713060
; SSE2: # %bb.0:

0 commit comments

Comments
 (0)