mwerschy
diff --git a/‎llvm/include/llvm/CodeGen/TargetLowering.h
Lines changed: 4 additions & 0 deletions b/‎llvm/include/llvm/CodeGen/TargetLowering.h
Lines changed: 4 additions & 0 deletions
diff --git a/‎llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Lines changed: 1 addition & 1 deletion b/‎llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Lines changed: 53 additions & 2 deletions b/‎llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Lines changed: 53 additions & 2 deletions
diff --git a/‎llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Lines changed: 12 additions & 0 deletions b/‎llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Lines changed: 12 additions & 0 deletions
diff --git a/‎llvm/lib/Target/X86/X86ISelLowering.cpp
Lines changed: 12 additions & 4 deletions b/‎llvm/lib/Target/X86/X86ISelLowering.cpp
Lines changed: 12 additions & 4 deletions
diff --git a/‎llvm/lib/Target/X86/X86ISelLowering.h
Lines changed: 2 additions & 0 deletions b/‎llvm/lib/Target/X86/X86ISelLowering.h
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/test/CodeGen/X86/avx512-vec-cmp.ll
Lines changed: 2 additions & 3 deletions b/‎llvm/test/CodeGen/X86/avx512-vec-cmp.ll
Lines changed: 2 additions & 3 deletions
diff --git a/‎llvm/test/CodeGen/X86/bitreverse.ll
Lines changed: 6 additions & 12 deletions b/‎llvm/test/CodeGen/X86/bitreverse.ll
Lines changed: 6 additions & 12 deletions
diff --git a/‎llvm/test/CodeGen/X86/combine-bitreverse.ll
Lines changed: 6 additions & 12 deletions b/‎llvm/test/CodeGen/X86/combine-bitreverse.ll
Lines changed: 6 additions & 12 deletions
diff --git a/‎llvm/test/CodeGen/X86/combine-shl.ll
Lines changed: 3 additions & 8 deletions b/‎llvm/test/CodeGen/X86/combine-shl.ll
Lines changed: 3 additions & 8 deletions
@@ -3119,6 +3119,10 @@ class TargetLowering : public TargetLoweringBase {
                                                  TargetLoweringOpt &TLO,
                                                  unsigned Depth = 0) const;
 
+  /// This method returns the constant pool value that will be loaded by LD.
+  /// NOTE: You must check for implicit extensions of the constant by LD.
+  virtual const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const;
+
   /// If \p SNaN is false, \returns true if \p Op is known to never be any
   /// NaN. If \p sNaN is true, returns if \p Op is known to never be a signaling
   /// NaN.
 
@@ -10110,7 +10110,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
 
   // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
   if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
-      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
+      (!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
       TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
     SDValue Amt = N0.getOperand(1);
     KnownBits Known = DAG.computeKnownBits(Amt);
 
@@ -2886,8 +2886,59 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
   }
   case ISD::LOAD: {
     LoadSDNode *LD = cast<LoadSDNode>(Op);
-    // If this is a ZEXTLoad and we are looking at the loaded value.
-    if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
+    const Constant *Cst = TLI->getTargetConstantFromLoad(LD);
+    if (ISD::isNON_EXTLoad(LD) && Cst) {
+      // Determine any common known bits from the loaded constant pool value.
+      Type *CstTy = Cst->getType();
+      if ((NumElts * BitWidth) == CstTy->getPrimitiveSizeInBits()) {
+        // If its a vector splat, then we can (quickly) reuse the scalar path.
+        // NOTE: We assume all elements match and none are UNDEF.
+        if (CstTy->isVectorTy()) {
+          if (const Constant *Splat = Cst->getSplatValue()) {
+            Cst = Splat;
+            CstTy = Cst->getType();
+          }
+        }
+        // TODO - do we need to handle different bitwidths?
+        if (CstTy->isVectorTy() && BitWidth == CstTy->getScalarSizeInBits()) {
+          // Iterate across all vector elements finding common known bits.
+          Known.One.setAllBits();
+          Known.Zero.setAllBits();
+          for (unsigned i = 0; i != NumElts; ++i) {
+            if (!DemandedElts[i])
+              continue;
+            if (Constant *Elt = Cst->getAggregateElement(i)) {
+              if (auto *CInt = dyn_cast<ConstantInt>(Elt)) {
+                const APInt &Value = CInt->getValue();
+                Known.One &= Value;
+                Known.Zero &= ~Value;
+                continue;
+              }
+              if (auto *CFP = dyn_cast<ConstantFP>(Elt)) {
+                APInt Value = CFP->getValueAPF().bitcastToAPInt();
+                Known.One &= Value;
+                Known.Zero &= ~Value;
+                continue;
+              }
+            }
+            Known.One.clearAllBits();
+            Known.Zero.clearAllBits();
+            break;
+          }
+        } else if (BitWidth == CstTy->getPrimitiveSizeInBits()) {
+          if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
+            const APInt &Value = CInt->getValue();
+            Known.One = Value;
+            Known.Zero = ~Value;
+          } else if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
+            APInt Value = CFP->getValueAPF().bitcastToAPInt();
+            Known.One = Value;
+            Known.Zero = ~Value;
+          }
+        }
+      }
+    } else if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
+      // If this is a ZEXTLoad and we are looking at the loaded value.
       EVT VT = LD->getMemoryVT();
       unsigned MemBits = VT.getScalarSizeInBits();
       Known.Zero.setBitsFrom(MemBits);
 
@@ -659,6 +659,14 @@ bool TargetLowering::SimplifyDemandedBits(
       Known.Zero &= Known2.Zero;
     }
     return false; // Don't fall through, will infinitely loop.
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    if (getTargetConstantFromLoad(LD)) {
+      Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+      return false; // Don't fall through, will infinitely loop.
+    }
+    break;
+  }
   case ISD::INSERT_VECTOR_ELT: {
     SDValue Vec = Op.getOperand(0);
     SDValue Scl = Op.getOperand(1);
@@ -2314,6 +2322,10 @@ bool TargetLowering::SimplifyDemandedBitsForTargetNode(
   return false;
 }
 
+const Constant *TargetLowering::getTargetConstantFromLoad(LoadSDNode*) const {
+  return nullptr;
+}
+
 bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                   const SelectionDAG &DAG,
                                                   bool SNaN,
 
@@ -5731,10 +5731,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
 }
 
-static const Constant *getTargetConstantFromNode(SDValue Op) {
-  Op = peekThroughBitcasts(Op);
-
-  auto *Load = dyn_cast<LoadSDNode>(Op);
+static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
   if (!Load)
     return nullptr;
 
@@ -5750,6 +5747,17 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
   return CNode->getConstVal();
 }
 
+static const Constant *getTargetConstantFromNode(SDValue Op) {
+  Op = peekThroughBitcasts(Op);
+  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
+}
+
+const Constant *
+X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
+  assert(LD && "Unexpected null LoadSDNode");
+  return getTargetConstantFromNode(LD);
+}
+
 // Extract raw constant bits from constant pools.
 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                           APInt &UndefElts,
 
@@ -908,6 +908,8 @@ namespace llvm {
                                            TargetLoweringOpt &TLO,
                                            unsigned Depth) const override;
 
+    const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
+
     SDValue unwrapAddress(SDValue N) const override;
 
     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
 
@@ -940,9 +940,8 @@ define <2 x i64> @test46(<2 x float> %x, <2 x float> %y) #0 {
 ; AVX512-LABEL: test46:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
-; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x14,0xc1]
-; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT:    vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
+; AVX512-NEXT:    ## xmm0 = xmm0[0,1,1,3]
 ; AVX512-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x54,0x05,A,A,A,A]
 ; AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI47_0-4, kind: reloc_riprel_4byte
 ; AVX512-NEXT:    retq ## encoding: [0xc3]
 
@@ -61,31 +61,25 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
-; X64-NEXT:    packuswb %xmm2, %xmm1
-; X64-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X64-NEXT:    movdqa %xmm1, %xmm2
-; X64-NEXT:    pand %xmm0, %xmm2
-; X64-NEXT:    psllw $4, %xmm2
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; X64-NEXT:    packuswb %xmm2, %xmm0
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psllw $4, %xmm1
 ; X64-NEXT:    pand {{.*}}(%rip), %xmm1
-; X64-NEXT:    psrlw $4, %xmm1
-; X64-NEXT:    pand %xmm0, %xmm1
-; X64-NEXT:    pandn %xmm2, %xmm0
+; X64-NEXT:    psrlw $4, %xmm0
+; X64-NEXT:    pand {{.*}}(%rip), %xmm0
 ; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; X64-NEXT:    pand %xmm0, %xmm1
 ; X64-NEXT:    psllw $2, %xmm1
-; X64-NEXT:    pand {{.*}}(%rip), %xmm1
 ; X64-NEXT:    pand {{.*}}(%rip), %xmm0
 ; X64-NEXT:    psrlw $2, %xmm0
-; X64-NEXT:    pand {{.*}}(%rip), %xmm0
 ; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; X64-NEXT:    pand %xmm0, %xmm1
 ; X64-NEXT:    paddb %xmm1, %xmm1
 ; X64-NEXT:    pand {{.*}}(%rip), %xmm0
 ; X64-NEXT:    psrlw $1, %xmm0
-; X64-NEXT:    pand {{.*}}(%rip), %xmm0
 ; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    psrlq $48, %xmm0
 ; X64-NEXT:    retq
 
@@ -47,31 +47,25 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind {
 ; X86-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X86-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; X86-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
-; X86-NEXT:    packuswb %xmm2, %xmm1
-; X86-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X86-NEXT:    movdqa %xmm1, %xmm2
-; X86-NEXT:    pand %xmm0, %xmm2
-; X86-NEXT:    psllw $4, %xmm2
+; X86-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; X86-NEXT:    packuswb %xmm2, %xmm0
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psllw $4, %xmm1
 ; X86-NEXT:    pand {{\.LCPI.*}}, %xmm1
-; X86-NEXT:    psrlw $4, %xmm1
-; X86-NEXT:    pand %xmm0, %xmm1
-; X86-NEXT:    pandn %xmm2, %xmm0
+; X86-NEXT:    psrlw $4, %xmm0
+; X86-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    por %xmm1, %xmm0
 ; X86-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; X86-NEXT:    pand %xmm0, %xmm1
 ; X86-NEXT:    psllw $2, %xmm1
-; X86-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X86-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    psrlw $2, %xmm0
-; X86-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    por %xmm1, %xmm0
 ; X86-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; X86-NEXT:    pand %xmm0, %xmm1
 ; X86-NEXT:    paddb %xmm1, %xmm1
 ; X86-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    psrlw $1, %xmm0
-; X86-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    por %xmm1, %xmm0
 ; X86-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    retl
 
@@ -268,16 +268,11 @@ define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
 ; SSE2-LABEL: combine_vec_shl_ext_shl1:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pslld $29, %xmm2
-; SSE2-NEXT:    pslld $28, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE2-NEXT:    pslld $30, %xmm0
-; SSE2-NEXT:    xorpd %xmm2, %xmm2
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm1[0,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_shl_ext_shl1: