-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[X86] Allow AVX2 per-element shift intrinsics to be used in constexpr #154780
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This handles constant folding for the AVX2 per-element shift intrinsics, which handle out of bounds shift amounts (logical result = 0, arithmetic result = signbit splat) AVX512 intrinsics will follow in follow up patches First stage of llvm#154287
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesThis handles constant folding for the AVX2 per-element shift intrinsics, which handle out of bounds shift amounts (logical result = 0, arithmetic result = signbit splat) AVX512 intrinsics will follow in follow up patches First stage of #154287 Patch is 20.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154780.diff 4 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index cc1da937455a2..527acd9ef086e 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -627,11 +627,23 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
-}
-let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+
+ def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+ def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+ def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+ def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+ def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+ def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+ def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+ def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+ def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+ def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
@@ -654,46 +666,6 @@ let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def maskstoreq : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, _Vector<2, long long int>)">;
}
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
-}
-
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
-}
-
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
-}
-
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
-}
-
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
-}
-
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
-}
-
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
-}
-
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
-}
-
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
-}
-
-let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
-}
-
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def gatherd_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<4, int>, _Vector<2, double>, _Constant char)">;
}
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a03e64fcffde2..9b934753bcc3c 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11669,13 +11669,24 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_pmulhuw512:
case clang::X86::BI__builtin_ia32_pmulhw128:
case clang::X86::BI__builtin_ia32_pmulhw256:
- case clang::X86::BI__builtin_ia32_pmulhw512: {
+ case clang::X86::BI__builtin_ia32_pmulhw512:
+ case clang::X86::BI__builtin_ia32_psllv2di:
+ case clang::X86::BI__builtin_ia32_psllv4di:
+ case clang::X86::BI__builtin_ia32_psllv4si:
+ case clang::X86::BI__builtin_ia32_psllv8si:
+ case clang::X86::BI__builtin_ia32_psrav4si:
+ case clang::X86::BI__builtin_ia32_psrav8si:
+ case clang::X86::BI__builtin_ia32_psrlv2di:
+ case clang::X86::BI__builtin_ia32_psrlv4di:
+ case clang::X86::BI__builtin_ia32_psrlv4si:
+ case clang::X86::BI__builtin_ia32_psrlv8si:{
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+ bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
unsigned SourceLen = SourceLHS.getVectorLength();
SmallVector<APValue, 4> ResultElements;
ResultElements.reserve(SourceLen);
@@ -11687,12 +11698,12 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case Builtin::BI__builtin_elementwise_add_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS),
- DestEltTy->isUnsignedIntegerOrEnumerationType())));
+ DestUnsigned)));
break;
case Builtin::BI__builtin_elementwise_sub_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS),
- DestEltTy->isUnsignedIntegerOrEnumerationType())));
+ DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
@@ -11706,6 +11717,40 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
ResultElements.push_back(APValue(APSInt(llvm::APIntOps::mulhs(LHS, RHS),
/*isUnsigned=*/false)));
break;
+ case clang::X86::BI__builtin_ia32_psllv2di:
+ case clang::X86::BI__builtin_ia32_psllv4di:
+ case clang::X86::BI__builtin_ia32_psllv4si:
+ case clang::X86::BI__builtin_ia32_psllv8si:
+ if (RHS.uge(RHS.getBitWidth())) {
+ ResultElements.push_back(
+ APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
+ break;
+ }
+ ResultElements.push_back(
+ APValue(APSInt(LHS.shl(RHS.getZExtValue()), DestUnsigned)));
+ break;
+ case clang::X86::BI__builtin_ia32_psrav4si:
+ case clang::X86::BI__builtin_ia32_psrav8si:
+ if (RHS.uge(RHS.getBitWidth())) {
+ ResultElements.push_back(
+ APValue(APSInt(LHS.ashr(RHS.getBitWidth() - 1), DestUnsigned)));
+ break;
+ }
+ ResultElements.push_back(
+ APValue(APSInt(LHS.ashr(RHS.getZExtValue()), DestUnsigned)));
+ break;
+ case clang::X86::BI__builtin_ia32_psrlv2di:
+ case clang::X86::BI__builtin_ia32_psrlv4di:
+ case clang::X86::BI__builtin_ia32_psrlv4si:
+ case clang::X86::BI__builtin_ia32_psrlv8si:
+ if (RHS.uge(RHS.getBitWidth())) {
+ ResultElements.push_back(
+ APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
+ break;
+ }
+ ResultElements.push_back(
+ APValue(APSInt(LHS.lshr(RHS.getZExtValue()), DestUnsigned)));
+ break;
}
}
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index c7e1c4446e85d..ce5b2b7544d8c 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -3721,7 +3721,7 @@ _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_sllv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
@@ -3743,7 +3743,7 @@ _mm256_sllv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_sllv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
@@ -3765,7 +3765,7 @@ _mm_sllv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_sllv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
@@ -3787,7 +3787,7 @@ _mm256_sllv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_sllv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
@@ -3810,7 +3810,7 @@ _mm_sllv_epi64(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srav_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
@@ -3833,7 +3833,7 @@ _mm256_srav_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srav_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
@@ -3855,7 +3855,7 @@ _mm_srav_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srlv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
@@ -3877,7 +3877,7 @@ _mm256_srlv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srlv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
@@ -3899,7 +3899,7 @@ _mm_srlv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srlv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
@@ -3921,7 +3921,7 @@ _mm256_srlv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srlv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 5b252fa315ef8..29cb3e8860be9 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -327,7 +327,6 @@ __m256i test_mm256_cvtepi8_epi16(__m128i a) {
// CHECK: sext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepi8_epi16(a);
}
-
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepi8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12));
__m256i test_mm256_cvtepi8_epi32(__m128i a) {
@@ -336,7 +335,6 @@ __m256i test_mm256_cvtepi8_epi32(__m128i a) {
// CHECK: sext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepi8_epi32(a);
}
-
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi8_epi64(__m128i a) {
@@ -345,7 +343,6 @@ __m256i test_mm256_cvtepi8_epi64(__m128i a) {
// CHECK: sext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepi8_epi64(a);
}
-
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0));
__m256i test_mm256_cvtepi16_epi32(__m128i a) {
@@ -353,7 +350,6 @@ __m256i test_mm256_cvtepi16_epi32(__m128i a) {
// CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
return _mm256_cvtepi16_epi32(a);
}
-
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi16_epi64(__m128i a) {
@@ -362,7 +358,6 @@ __m256i test_mm256_cvtepi16_epi64(__m128i a) {
// CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepi16_epi64(a);
}
-
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0));
__m256i test_mm256_cvtepi32_epi64(__m128i a) {
@@ -370,7 +365,6 @@ __m256i test_mm256_cvtepi32_epi64(__m128i a) {
// CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepi32_epi64(a);
}
-
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), -70000, 2, -1, 0));
__m256i test_mm256_cvtepu8_epi16(__m128i a) {
@@ -378,7 +372,6 @@ __m256i test_mm256_cvtepu8_epi16(__m128i a) {
// CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepu8_epi16(a);
}
-
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepu8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252, 5, 250, 7, 248, 9, 246, 11, 244));
__m256i test_mm256_cvtepu8_epi32(__m128i a) {
@@ -387,7 +380,6 @@ __m256i test_mm256_cvtepu8_epi32(__m128i a) {
// CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepu8_epi32(a);
}
-
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252));
__m256i test_mm256_cvtepu8_epi64(__m128i a) {
@@ -396,7 +388,6 @@ __m256i test_mm256_cvtepu8_epi64(__m128i a) {
// CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepu8_epi64(a);
}
-
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0));
__m256i test_mm256_cvtepu16_epi32(__m128i a) {
@@ -404,7 +395,6 @@ __m256i test_mm256_cvtepu16_epi32(__m128i a) {
// CHECK: zext <8 x i16> {{.*}} to <8 x i32>
return _mm256_cvtepu16_epi32(a);
}
-
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0, 1, 65534, 3, 65532));
__m256i test_mm256_cvtepu16_epi64(__m128i a) {
@@ -413,7 +403,6 @@ __m256i test_mm256_cvtepu16_epi64(__m128i a) {
// CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepu16_epi64(a);
}
-
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0));
__m256i test_mm256_cvtepu32_epi64(__m128i a) {
@@ -421,7 +410,6 @@ __m256i test_mm256_cvtepu32_epi64(__m128i a) {
// CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepu32_epi64(a);
}
-
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), 4294897296, 2, 4294967295, 0));
__m128i test0_mm256_extracti128_si256_0(__m256i a) {
@@ -1120,24 +1108,28 @@ __m128i test_mm_sllv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_sllv_epi32(a, b);
}
+TEST_CONSTEXPR(match_v4si(_mm_sllv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_sllv_epi32(a, b);
}
+TEST_CONSTEXPR(match_v8si(_mm256_sllv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 2, -8, 24, -64, 0, 0, 0, 0));
__m128i test_mm_sllv_epi64(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_sllv_epi64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_sllv_epi64(a, b);
}
+TEST_CONSTEXPR(match_m128i(_mm_sllv_epi64((__m128i)(__v2di){1, -3}, (__m128i)(__v2di){8, 63}), 256, 0x8000000000000000ULL));
__m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_sllv_epi64(a, b);
}
+TEST_CONSTEXPR(match_m256i(_mm256_sllv_epi64((__m256i)(__v4di){1, -2, 3, -4}, (__m256i)(__v4di){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_sra_epi16
@@ -1180,12 +1172,14 @@ __m128i test_mm_srav_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srav_epi32(a, b);
}
+TEST_CONSTEXPR(match_v4si(_mm_srav_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, -1, 0, -1));
__m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srav_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srav_epi32(a, b);
}
+TEST_CONSTEXPR(match_v8si(_mm256_srav_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 0, -1, 0, -1, 0, -1, 0, -1));
__m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_srl_epi16
@@ -1252,24 +1246,28 @@ __m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srlv_epi32(a, b);
}
+TEST_CONSTEXPR(match_v4si(_mm_srlv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, 1073741823, 0, 0));
__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srlv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srlv_epi32(a, b);
}
+TEST_CONSTEXPR(match_v8si(_mm256_srlv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3,...
[truncated]
|
You can test this locally with the following command:git-clang-format --diff HEAD~1 HEAD --extensions c,cpp,h -- clang/lib/AST/ExprConstant.cpp clang/lib/Headers/avx2intrin.h clang/test/CodeGen/X86/avx2-builtins.c View the diff from clang-format here.diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 9b934753b..261b3f490 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11679,7 +11679,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_psrlv2di:
case clang::X86::BI__builtin_ia32_psrlv4di:
case clang::X86::BI__builtin_ia32_psrlv4si:
- case clang::X86::BI__builtin_ia32_psrlv8si:{
+ case clang::X86::BI__builtin_ia32_psrlv8si: {
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index ce5b2b754..a23b8e0bb 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -3722,8 +3722,7 @@ _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_sllv_epi32(__m256i __X, __m256i __Y)
-{
+_mm256_sllv_epi32(__m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
}
@@ -3744,8 +3743,7 @@ _mm256_sllv_epi32(__m256i __X, __m256i __Y)
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_sllv_epi32(__m128i __X, __m128i __Y)
-{
+_mm_sllv_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
}
@@ -3766,8 +3764,7 @@ _mm_sllv_epi32(__m128i __X, __m128i __Y)
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_sllv_epi64(__m256i __X, __m256i __Y)
-{
+_mm256_sllv_epi64(__m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
}
@@ -3788,8 +3785,7 @@ _mm256_sllv_epi64(__m256i __X, __m256i __Y)
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_sllv_epi64(__m128i __X, __m128i __Y)
-{
+_mm_sllv_epi64(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
}
@@ -3811,8 +3807,7 @@ _mm_sllv_epi64(__m128i __X, __m128i __Y)
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_srav_epi32(__m256i __X, __m256i __Y)
-{
+_mm256_srav_epi32(__m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
}
@@ -3834,8 +3829,7 @@ _mm256_srav_epi32(__m256i __X, __m256i __Y)
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_srav_epi32(__m128i __X, __m128i __Y)
-{
+_mm_srav_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
}
@@ -3856,8 +3850,7 @@ _mm_srav_epi32(__m128i __X, __m128i __Y)
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_srlv_epi32(__m256i __X, __m256i __Y)
-{
+_mm256_srlv_epi32(__m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
}
@@ -3878,8 +3871,7 @@ _mm256_srlv_epi32(__m256i __X, __m256i __Y)
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_srlv_epi32(__m128i __X, __m128i __Y)
-{
+_mm_srlv_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
}
@@ -3900,8 +3892,7 @@ _mm_srlv_epi32(__m128i __X, __m128i __Y)
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_srlv_epi64(__m256i __X, __m256i __Y)
-{
+_mm256_srlv_epi64(__m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
}
@@ -3922,8 +3913,7 @@ _mm256_srlv_epi64(__m256i __X, __m256i __Y)
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_srlv_epi64(__m128i __X, __m128i __Y)
-{
+_mm_srlv_epi64(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
…sics to be used in constexpr Followup to llvm#154780
This handles constant folding for the AVX2 per-element shift intrinsics, which handle out of bounds shift amounts (logical result = 0, arithmetic result = signbit splat)
AVX512 intrinsics will follow in follow up patches
First stage of #154287