-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[Headers][X86] VectorExprEvaluator::VisitCallExpr - allow SSE/AVX2/AVX512 pack intrinsics to be used in constexpr #156003
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
…X512 pack intrinsics to be used in constexpr
@llvm/pr-subscribers-clang Author: woruyu (woruyu) ChangesSummaryThis PR resolves #154283 Patch is 29.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156003.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 527acd9ef086e..737be0f673e96 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -95,9 +95,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
let Features = "sse2" in {
def pavgb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def pavgw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
- def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
- def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
- def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
@@ -108,6 +105,9 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
+ def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
+ def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
+ def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
}
let Features = "sse3" in {
@@ -314,7 +314,6 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
- def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
@@ -333,6 +332,7 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+ def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -568,10 +568,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
- def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
- def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
- def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
- def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
def pavgb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def pavgw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -636,6 +632,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+
+ def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+ def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+ def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+ def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
@@ -1355,15 +1356,18 @@ let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWi
let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
- def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
- def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
- def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
- def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
def pavgb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
def pavgw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+ def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+ def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+ def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+ def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
}
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a71cb8b0143be..8f649eca776ec 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11599,6 +11599,89 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
return false;
}
+enum class PackKind {
+ SSWB,
+ USWB,
+ SSDW,
+ USDW
+}; // 16→8 signed/unsigned; 32→16 signed/unsigned
+
+static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
+ PackKind K) {
+ APValue L, R;
+ if (!EvaluateAsRValue(Info, E->getArg(0), L) ||
+ !EvaluateAsRValue(Info, E->getArg(1), R))
+ return false;
+
+ unsigned SrcBits = (K == PackKind::SSWB || K == PackKind::USWB) ? 16 : 32;
+ unsigned DstBits = SrcBits / 2;
+
+ unsigned NL = L.getVectorLength();
+ unsigned NR = R.getVectorLength();
+ if (NL == 0 || NR == 0 || NL != NR)
+ return false;
+
+ // Bounds for saturation (extended to SrcBits for compares).
+ APInt Lo = (K == PackKind::USWB || K == PackKind::USDW)
+ ? APInt(SrcBits, 0)
+ : APInt::getSignedMinValue(DstBits).sext(SrcBits);
+ APInt Hi = (K == PackKind::USWB || K == PackKind::USDW)
+ ? APInt::getMaxValue(DstBits).zext(SrcBits)
+ : APInt::getSignedMaxValue(DstBits).sext(SrcBits);
+
+ // Result element signedness follows the builtin's return vector element type.
+ QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+ bool DestIsUnsigned = DestEltTy->isUnsignedIntegerType();
+
+ // Clamp one source element to the target range and narrow to DstBits.
+ auto clampOne = [&](const APSInt &X) -> APSInt {
+ APInt V = X;
+ if (V.getBitWidth() != SrcBits)
+ V = V.sextOrTrunc(SrcBits);
+
+ if (K == PackKind::USWB || K == PackKind::USDW) {
+ if (V.isNegative())
+ V = Lo;
+ else if (V.ugt(Hi))
+ V = Hi;
+ APInt Narrow = V.zextOrTrunc(DstBits);
+ return APSInt(Narrow, /*isUnsigned=*/true);
+ } else {
+ if (V.sgt(Hi))
+ V = Hi;
+ else if (V.slt(Lo))
+ V = Lo;
+ APInt Narrow = V.sextOrTrunc(DstBits);
+ return APSInt(Narrow, /*isUnsigned=*/DestIsUnsigned);
+ }
+ };
+
+ SmallVector<APValue, 64> Out;
+ Out.reserve(NL + NR);
+
+ // Process per 128-bit lane (MMX 64-bit uses a single lane).
+ unsigned VectorBits = NL * SrcBits;
+ unsigned srcPerLane, lanes;
+ if (VectorBits >= 128) {
+ srcPerLane = 128 / SrcBits; // 8 (16→8) or 4 (32→16)
+ lanes = VectorBits / 128; // 1 (128b), 2 (256b), 4 (512b)
+ } else {
+ srcPerLane = NL; // MMX
+ lanes = 1;
+ }
+
+ for (unsigned lane = 0; lane != lanes; ++lane) {
+ unsigned base = lane * srcPerLane;
+ for (unsigned i = 0; i != srcPerLane; ++i)
+ Out.push_back(APValue(clampOne(L.getVectorElt(base + i).getInt())));
+ for (unsigned i = 0; i != srcPerLane; ++i)
+ Out.push_back(APValue(clampOne(R.getVectorElt(base + i).getInt())));
+ }
+
+ Result = APValue(Out.data(), Out.size());
+ return true;
+}
+
bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (!IsConstantEvaluatedBuiltinCall(E))
return ExprEvaluatorBaseTy::VisitCallExpr(E);
@@ -11752,6 +11835,22 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_packsswb128:
+ case X86::BI__builtin_ia32_packsswb256:
+ case X86::BI__builtin_ia32_packsswb512:
+ return evalPackBuiltin(E, Info, Result, PackKind::SSWB);
+ case X86::BI__builtin_ia32_packuswb128:
+ case X86::BI__builtin_ia32_packuswb256:
+ case X86::BI__builtin_ia32_packuswb512:
+ return evalPackBuiltin(E, Info, Result, PackKind::USWB);
+ case X86::BI__builtin_ia32_packssdw128:
+ case X86::BI__builtin_ia32_packssdw256:
+ case X86::BI__builtin_ia32_packssdw512:
+ return evalPackBuiltin(E, Info, Result, PackKind::SSDW);
+ case X86::BI__builtin_ia32_packusdw128:
+ case X86::BI__builtin_ia32_packusdw256:
+ case X86::BI__builtin_ia32_packusdw512:
+ return evalPackBuiltin(E, Info, Result, PackKind::USDW);
case clang::X86::BI__builtin_ia32_pmuldq128:
case clang::X86::BI__builtin_ia32_pmuldq256:
case clang::X86::BI__builtin_ia32_pmuldq512:
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index ce5b2b7544d8c..22d7157f0db58 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -177,9 +177,8 @@ _mm256_abs_epi32(__m256i __a)
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
}
@@ -209,9 +208,8 @@ _mm256_packs_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi32(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
}
@@ -240,9 +238,8 @@ _mm256_packs_epi32(__m256i __a, __m256i __b)
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
}
@@ -272,9 +269,8 @@ _mm256_packus_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
}
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 9263f7af3ee2f..b56f53107fa92 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -525,9 +525,8 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi32(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
}
@@ -547,9 +546,8 @@ _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
}
@@ -569,9 +567,8 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
(__v64qi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi32(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
}
@@ -591,9 +588,8 @@ _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
}
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 8b6b62458dac1..f3c9eb7158c1f 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4166,8 +4166,8 @@ void _mm_mfence(void);
/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}
@@ -4189,8 +4189,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
/// are written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi32(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}
@@ -4212,8 +4212,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 6fe9d67b8976d..aa6845f60ee99 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -162,11 +162,10 @@ _mm_cvtm64_si64(__m64 __m)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packsswb128(
- (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packsswb128(
+ (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
@@ -188,11 +187,10 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packssdw128(
- (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi32(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packssdw128(
+ (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
}
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -214,11 +212,10 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packuswb128(
- (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pu16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packuswb128(
+ (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 57d0d329312af..8acf4929d1125 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1475,8 +1475,8 @@ _mm_cvtepu32_epi64(__m128i __V) {
/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
- __m128i __V2) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi32(__m128i __V1, __m128i __V2) {
return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 29cb3e8860be9..d2d8b53c24a96 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/...
[truncated]
|
@llvm/pr-subscribers-backend-x86 Author: woruyu (woruyu) ChangesSummaryThis PR resolves #154283 Patch is 29.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156003.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 527acd9ef086e..737be0f673e96 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -95,9 +95,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
let Features = "sse2" in {
def pavgb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def pavgw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
- def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
- def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
- def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
@@ -108,6 +105,9 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
+ def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
+ def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
+ def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
}
let Features = "sse3" in {
@@ -314,7 +314,6 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
- def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
@@ -333,6 +332,7 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+ def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -568,10 +568,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
- def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
- def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
- def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
- def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
def pavgb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def pavgw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -636,6 +632,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+
+ def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+ def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+ def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+ def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
@@ -1355,15 +1356,18 @@ let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWi
let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
- def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
- def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
- def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
- def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
def pavgb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
def pavgw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+ def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+ def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+ def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+ def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
}
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a71cb8b0143be..8f649eca776ec 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11599,6 +11599,89 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
return false;
}
+enum class PackKind {
+ SSWB,
+ USWB,
+ SSDW,
+ USDW
+}; // 16→8 signed/unsigned; 32→16 signed/unsigned
+
+static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
+ PackKind K) {
+ APValue L, R;
+ if (!EvaluateAsRValue(Info, E->getArg(0), L) ||
+ !EvaluateAsRValue(Info, E->getArg(1), R))
+ return false;
+
+ unsigned SrcBits = (K == PackKind::SSWB || K == PackKind::USWB) ? 16 : 32;
+ unsigned DstBits = SrcBits / 2;
+
+ unsigned NL = L.getVectorLength();
+ unsigned NR = R.getVectorLength();
+ if (NL == 0 || NR == 0 || NL != NR)
+ return false;
+
+ // Bounds for saturation (extended to SrcBits for compares).
+ APInt Lo = (K == PackKind::USWB || K == PackKind::USDW)
+ ? APInt(SrcBits, 0)
+ : APInt::getSignedMinValue(DstBits).sext(SrcBits);
+ APInt Hi = (K == PackKind::USWB || K == PackKind::USDW)
+ ? APInt::getMaxValue(DstBits).zext(SrcBits)
+ : APInt::getSignedMaxValue(DstBits).sext(SrcBits);
+
+ // Result element signedness follows the builtin's return vector element type.
+ QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+ bool DestIsUnsigned = DestEltTy->isUnsignedIntegerType();
+
+ // Clamp one source element to the target range and narrow to DstBits.
+ auto clampOne = [&](const APSInt &X) -> APSInt {
+ APInt V = X;
+ if (V.getBitWidth() != SrcBits)
+ V = V.sextOrTrunc(SrcBits);
+
+ if (K == PackKind::USWB || K == PackKind::USDW) {
+ if (V.isNegative())
+ V = Lo;
+ else if (V.ugt(Hi))
+ V = Hi;
+ APInt Narrow = V.zextOrTrunc(DstBits);
+ return APSInt(Narrow, /*isUnsigned=*/true);
+ } else {
+ if (V.sgt(Hi))
+ V = Hi;
+ else if (V.slt(Lo))
+ V = Lo;
+ APInt Narrow = V.sextOrTrunc(DstBits);
+ return APSInt(Narrow, /*isUnsigned=*/DestIsUnsigned);
+ }
+ };
+
+ SmallVector<APValue, 64> Out;
+ Out.reserve(NL + NR);
+
+ // Process per 128-bit lane (MMX 64-bit uses a single lane).
+ unsigned VectorBits = NL * SrcBits;
+ unsigned srcPerLane, lanes;
+ if (VectorBits >= 128) {
+ srcPerLane = 128 / SrcBits; // 8 (16→8) or 4 (32→16)
+ lanes = VectorBits / 128; // 1 (128b), 2 (256b), 4 (512b)
+ } else {
+ srcPerLane = NL; // MMX
+ lanes = 1;
+ }
+
+ for (unsigned lane = 0; lane != lanes; ++lane) {
+ unsigned base = lane * srcPerLane;
+ for (unsigned i = 0; i != srcPerLane; ++i)
+ Out.push_back(APValue(clampOne(L.getVectorElt(base + i).getInt())));
+ for (unsigned i = 0; i != srcPerLane; ++i)
+ Out.push_back(APValue(clampOne(R.getVectorElt(base + i).getInt())));
+ }
+
+ Result = APValue(Out.data(), Out.size());
+ return true;
+}
+
bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (!IsConstantEvaluatedBuiltinCall(E))
return ExprEvaluatorBaseTy::VisitCallExpr(E);
@@ -11752,6 +11835,22 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_packsswb128:
+ case X86::BI__builtin_ia32_packsswb256:
+ case X86::BI__builtin_ia32_packsswb512:
+ return evalPackBuiltin(E, Info, Result, PackKind::SSWB);
+ case X86::BI__builtin_ia32_packuswb128:
+ case X86::BI__builtin_ia32_packuswb256:
+ case X86::BI__builtin_ia32_packuswb512:
+ return evalPackBuiltin(E, Info, Result, PackKind::USWB);
+ case X86::BI__builtin_ia32_packssdw128:
+ case X86::BI__builtin_ia32_packssdw256:
+ case X86::BI__builtin_ia32_packssdw512:
+ return evalPackBuiltin(E, Info, Result, PackKind::SSDW);
+ case X86::BI__builtin_ia32_packusdw128:
+ case X86::BI__builtin_ia32_packusdw256:
+ case X86::BI__builtin_ia32_packusdw512:
+ return evalPackBuiltin(E, Info, Result, PackKind::USDW);
case clang::X86::BI__builtin_ia32_pmuldq128:
case clang::X86::BI__builtin_ia32_pmuldq256:
case clang::X86::BI__builtin_ia32_pmuldq512:
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index ce5b2b7544d8c..22d7157f0db58 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -177,9 +177,8 @@ _mm256_abs_epi32(__m256i __a)
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
}
@@ -209,9 +208,8 @@ _mm256_packs_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi32(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
}
@@ -240,9 +238,8 @@ _mm256_packs_epi32(__m256i __a, __m256i __b)
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
}
@@ -272,9 +269,8 @@ _mm256_packus_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
}
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 9263f7af3ee2f..b56f53107fa92 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -525,9 +525,8 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi32(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
}
@@ -547,9 +546,8 @@ _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
}
@@ -569,9 +567,8 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
(__v64qi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi32(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
}
@@ -591,9 +588,8 @@ _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
}
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 8b6b62458dac1..f3c9eb7158c1f 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4166,8 +4166,8 @@ void _mm_mfence(void);
/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}
@@ -4189,8 +4189,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
/// are written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi32(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}
@@ -4212,8 +4212,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 6fe9d67b8976d..aa6845f60ee99 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -162,11 +162,10 @@ _mm_cvtm64_si64(__m64 __m)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packsswb128(
- (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packsswb128(
+ (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
@@ -188,11 +187,10 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packssdw128(
- (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi32(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packssdw128(
+ (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
}
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -214,11 +212,10 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packuswb128(
- (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pu16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packuswb128(
+ (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 57d0d329312af..8acf4929d1125 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1475,8 +1475,8 @@ _mm_cvtepu32_epi64(__m128i __V) {
/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
- __m128i __V2) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi32(__m128i __V1, __m128i __V2) {
return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 29cb3e8860be9..d2d8b53c24a96 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@@ -11599,6 +11599,89 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO, | |||
return false; | |||
} | |||
|
|||
enum class PackKind { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pack is such an overloaded word in C++, can we get something more specific, I don't have a good suggestion, I am going to throw out SSEAVXPackKind as a strawman but please find something better if you can.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm hoping we can lose this enum entirely and just use a callback
: APInt::getSignedMaxValue(DstBits).sext(SrcBits); | ||
|
||
// Result element signedness follows the builtin's return vector element type. | ||
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we assert that E->getType()
is actually a Vector or is that verified before we get here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These target builtins are already constrained to specific types - the assertion inside castAs should be enough
|
||
unsigned NL = L.getVectorLength(); | ||
unsigned NR = R.getVectorLength(); | ||
if (NL == 0 || NR == 0 || NL != NR) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we have a test that covers these cases?
if (V.getBitWidth() != SrcBits) | ||
V = V.sextOrTrunc(SrcBits); | ||
|
||
if (K == PackKind::USWB || K == PackKind::USDW) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please make sure to verify that you cover each branch, I see a lot of combination but I am sure we cover them all in the tests.
Summary
This PR resolves #154283