Commit e9c1982
[ARM] Basic gather scatter cost model
This is a very basic MVE gather/scatter cost model, based roughly on the code that we will currently produce. It does not handle truncating scatters or extending gathers correctly yet, as it is difficult to tell that they are going to be correctly extended/truncated from the limited information in the cost function. This can be improved as we extend support for these in the future.

Based on code originally written by David Sherwood.

Differential Revision: https://reviews.llvm.org/D73021
Parent: 0b83e14
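As a rough worked example of the heuristic (a standalone sketch, not code from this commit; gatherCost, LegalizationFactor and PerLaneOverhead are hypothetical stand-ins for the real LT.first and BaseT::getScalarizationOverhead values):

// Hypothetical model of the costing: one load per lane either way, plus
// per-lane extract/insert overhead when the gather must be scalarised.
#include <cstdio>

unsigned gatherCost(unsigned NumElems, unsigned EltSizeBits, bool VectorFormOk,
                    unsigned LegalizationFactor, unsigned PerLaneOverhead) {
  unsigned VectorCost = NumElems * LegalizationFactor;
  unsigned ScalarCost =
      NumElems * LegalizationFactor + NumElems * PerLaneOverhead;
  // Only a full 128-bit vector with at least 4 lanes can use the MVE gather.
  if (EltSizeBits * NumElems != 128 || NumElems < 4 || !VectorFormOk)
    return ScalarCost;
  return VectorCost;
}

int main() {
  // <4 x i32>, aligned: qualifies for the vector form, cost 4 * 1 = 4.
  std::printf("v4i32 vector:     %u\n", gatherCost(4, 32, true, 1, 3));
  // The same type when it fails a check (e.g. underaligned): 4 + 12 = 16.
  std::printf("v4i32 scalarised: %u\n", gatherCost(4, 32, false, 1, 3));
}

That 4-versus-16 gap per gather is why, as the message says, loops still look profitable to vectorise even under this deliberately conservative serialised model.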

3 files changed: 189 additions, 112 deletions

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 61 additions & 0 deletions

@@ -860,6 +860,67 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
                                            UseMaskForCond, UseMaskForGaps);
 }
 
+unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+                                            Value *Ptr, bool VariableMask,
+                                            unsigned Alignment) {
+  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
+    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+                                         Alignment);
+
+  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
+  VectorType *VTy = cast<VectorType>(DataTy);
+
+  // TODO: Splitting, once we do that.
+  // TODO: trunc/sext/zext the result/input
+
+  unsigned NumElems = VTy->getNumElements();
+  unsigned EltSize = VTy->getScalarSizeInBits();
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
+
+  // For now, it is assumed that for the MVE gather instructions the loads are
+  // all effectively serialised. This means the cost is the scalar cost
+  // multiplied by the number of elements being loaded. This is possibly very
+  // conservative, but even so we still end up vectorising loops because the
+  // cost per iteration for many loops is lower than for scalar loops.
+  unsigned VectorCost = NumElems * LT.first;
+  // The scalarization cost should be a lot higher. We use the number of vector
+  // elements plus the scalarization overhead.
+  unsigned ScalarCost =
+      NumElems * LT.first + BaseT::getScalarizationOverhead(DataTy, {});
+
+  // TODO: Cost extended gathers or trunc stores correctly.
+  if (EltSize * NumElems != 128 || NumElems < 4)
+    return ScalarCost;
+  if (Alignment < EltSize / 8)
+    return ScalarCost;
+
+  // Any (aligned) i32 gather will not need to be scalarised.
+  if (EltSize == 32)
+    return VectorCost;
+  // For smaller types, we need to ensure that the gep's inputs are correctly
+  // extended from a small enough value. Other sizes (including i64) are
+  // scalarized for now.
+  if (EltSize != 8 && EltSize != 16)
+    return ScalarCost;
+
+  if (auto BC = dyn_cast<BitCastInst>(Ptr))
+    Ptr = BC->getOperand(0);
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+    if (GEP->getNumOperands() != 2)
+      return ScalarCost;
+    unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
+    // Scale needs to be correct (which is only relevant for i16s).
+    if (Scale != 1 && Scale * 8 != EltSize)
+      return ScalarCost;
+    // And we need to zext (not sext) the indexes from a small enough type.
+    if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1)))
+      if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= EltSize)
+        return VectorCost;
+    return ScalarCost;
+  }
+  return ScalarCost;
+}
+
 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
   if (!F->isIntrinsic())
     return BaseT::isLoweredToCall(F);
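The i8/i16 fast path at the bottom of the new function wants a getelementptr with exactly one vector index, a scale matching the element size, and indexes zero-extended from a type no wider than the element. For reference (a sketch, not from the commit; it assumes MVE and the EnableMaskedGatherScatters option are enabled, and the names are illustrative), a source loop of roughly this shape produces that pattern once vectorised:

// Indexing a 16-bit array with unsigned 16-bit indexes: the vectorised GEP
// gets a 2-byte scale (Scale * 8 == EltSize) and indexes zero-extended from
// i16, so the cost model above returns VectorCost rather than ScalarCost.
void gather16(short *dst, const short *src, const unsigned short *idx, int n) {
  for (int i = 0; i < n; i++)
    dst[i] = src[idx[i]];
}

If idx were signed, the index extension would be a sext rather than the zext the check looks for, and the gather would keep the scalarised cost.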

llvm/lib/Target/ARM/ARMTargetTransformInfo.h

Lines changed: 3 additions & 0 deletions

@@ -202,6 +202,9 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
 
+  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                                  bool VariableMask, unsigned Alignment);
+
   bool isLoweredToCall(const Function *F);
   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                 AssumptionCache &AC,
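One note on the declaration (context, not part of the diff): BasicTTIImplBase is a CRTP base, so this is a non-virtual shadow of the base-class hook, and the BaseT:: calls in the .cpp file statically delegate to the generic costing. A minimal sketch of that dispatch pattern, with made-up names:

#include <cstdio>

// Illustrative CRTP shape only; not the real LLVM class hierarchy.
template <typename DerivedT> struct GenericTTI {
  unsigned getGatherScatterOpCost(unsigned NumElems) {
    return NumElems * 4; // pessimistic generic fallback
  }
};

struct MveTTI : GenericTTI<MveTTI> {
  using BaseT = GenericTTI<MveTTI>;
  unsigned getGatherScatterOpCost(unsigned NumElems) {
    if (NumElems == 4)
      return NumElems;                              // cheap vector form
    return BaseT::getGatherScatterOpCost(NumElems); // delegate to the base
  }
};

int main() {
  MveTTI TTI;
  // Prints "4 8": the 4-lane case takes the cheap path, the 2-lane case
  // falls back to the generic cost.
  std::printf("%u %u\n", TTI.getGatherScatterOpCost(4),
              TTI.getGatherScatterOpCost(2));
}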
