Skip to content

Commit a780b94

Browse files
committed
[X86][SSE] Convert computeZeroableShuffleElements to emit KnownUndef and KnownZero
1 parent 1eb04d2 commit a780b94

File tree

1 file changed

+35
-23
lines changed

1 file changed

+35
-23
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 35 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6732,7 +6732,7 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
67326732
V1 = peekThroughBitcasts(V1);
67336733
V2 = peekThroughBitcasts(V2);
67346734

6735-
assert((VT.getSizeInBits() % Mask.size()) == 0 &&
6735+
assert((VT.getSizeInBits() % Size) == 0 &&
67366736
"Illegal split of shuffle value type");
67376737
unsigned EltSizeInBits = VT.getSizeInBits() / Size;
67386738

@@ -10423,24 +10423,31 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
1042310423
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
1042410424
/// as many lanes with this technique as possible to simplify the remaining
1042510425
/// shuffle.
10426-
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
10427-
SDValue V1, SDValue V2) {
10428-
APInt Zeroable(Mask.size(), 0);
10426+
static void computeZeroableShuffleElements(ArrayRef<int> Mask,
10427+
SDValue V1, SDValue V2,
10428+
APInt &KnownUndef, APInt &KnownZero) {
10429+
int Size = Mask.size();
10430+
KnownUndef = KnownZero = APInt::getNullValue(Size);
10431+
1042910432
V1 = peekThroughBitcasts(V1);
1043010433
V2 = peekThroughBitcasts(V2);
1043110434

1043210435
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
1043310436
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
1043410437

1043510438
int VectorSizeInBits = V1.getValueSizeInBits();
10436-
int ScalarSizeInBits = VectorSizeInBits / Mask.size();
10439+
int ScalarSizeInBits = VectorSizeInBits / Size;
1043710440
assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
1043810441

10439-
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10442+
for (int i = 0; i < Size; ++i) {
1044010443
int M = Mask[i];
1044110444
// Handle the easy cases.
10442-
if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
10443-
Zeroable.setBit(i);
10445+
if (M < 0) {
10446+
KnownUndef.setBit(i);
10447+
continue;
10448+
}
10449+
if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
10450+
KnownZero.setBit(i);
1044410451
continue;
1044510452
}
1044610453

@@ -10457,20 +10464,20 @@ static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
1045710464
if ((Size % V.getNumOperands()) == 0) {
1045810465
int Scale = Size / V->getNumOperands();
1045910466
SDValue Op = V.getOperand(M / Scale);
10460-
if (Op.isUndef() || X86::isZeroNode(Op))
10461-
Zeroable.setBit(i);
10467+
if (Op.isUndef())
10468+
KnownUndef.setBit(i);
10469+
if (X86::isZeroNode(Op))
10470+
KnownZero.setBit(i);
1046210471
else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1046310472
APInt Val = Cst->getAPIntValue();
10464-
Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
10465-
Val = Val.getLoBits(ScalarSizeInBits);
10473+
Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
1046610474
if (Val == 0)
10467-
Zeroable.setBit(i);
10475+
KnownZero.setBit(i);
1046810476
} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
1046910477
APInt Val = Cst->getValueAPF().bitcastToAPInt();
10470-
Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
10471-
Val = Val.getLoBits(ScalarSizeInBits);
10478+
Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
1047210479
if (Val == 0)
10473-
Zeroable.setBit(i);
10480+
KnownZero.setBit(i);
1047410481
}
1047510482
continue;
1047610483
}
@@ -10479,18 +10486,20 @@ static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
1047910486
// elements must be UNDEF or ZERO.
1048010487
if ((V.getNumOperands() % Size) == 0) {
1048110488
int Scale = V->getNumOperands() / Size;
10482-
bool AllZeroable = true;
10489+
bool AllUndef = true;
10490+
bool AllZero = true;
1048310491
for (int j = 0; j < Scale; ++j) {
1048410492
SDValue Op = V.getOperand((M * Scale) + j);
10485-
AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
10493+
AllUndef &= Op.isUndef();
10494+
AllZero &= X86::isZeroNode(Op);
1048610495
}
10487-
if (AllZeroable)
10488-
Zeroable.setBit(i);
10496+
if (AllUndef)
10497+
KnownUndef.setBit(i);
10498+
if (AllZero)
10499+
KnownZero.setBit(i);
1048910500
continue;
1049010501
}
1049110502
}
10492-
10493-
return Zeroable;
1049410503
}
1049510504

1049610505
// The Shuffle result is as follow:
@@ -17077,7 +17086,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
1707717086
// We actually see shuffles that are entirely re-arrangements of a set of
1707817087
// zero inputs. This mostly happens while decomposing complex shuffles into
1707917088
// simple ones. Directly lower these as a buildvector of zeros.
17080-
APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2);
17089+
APInt KnownUndef, KnownZero;
17090+
computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17091+
17092+
APInt Zeroable = KnownUndef | KnownZero;
1708117093
if (Zeroable.isAllOnesValue())
1708217094
return getZeroVector(VT, Subtarget, DAG, DL);
1708317095

0 commit comments

Comments
 (0)