Commit dcf11c5

[ARM][AArch64] Complex addition Neon intrinsics for Armv8.3-A
Summary: Add support for the vcadd_* family of intrinsics. This set of intrinsics is available in Armv8.3-A. The fp16 versions require the FP16 extension, which has been available (opt-in) since Armv8.2-A.

Reviewers: t.p.northover

Reviewed By: t.p.northover

Subscribers: t.p.northover, kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D70862
1 parent af0babc commit dcf11c5
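For context, a minimal usage sketch of the new intrinsics (not part of the commit; it assumes a compiler carrying this patch and a target such as -march=armv8.3-a+fp16):

#include <arm_neon.h>

// Complex numbers are stored as interleaved {real, imag} pairs in the vectors.
float32x4_t add_rot90(float32x4_t a, float32x4_t b) {
  // a + i*b per complex pair; lowers to a single FCADD (AArch64) / VCADD (ARM).
  return vcaddq_rot90_f32(a, b);
}

float16x4_t add_rot270(float16x4_t a, float16x4_t b) {
  // FP16 variant; requires the FP16 vector arithmetic extension.
  return vcadd_rot270_f16(a, b);
}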

13 files changed (+328 −1 lines)

clang/include/clang/Basic/arm_neon.td

Lines changed: 18 additions & 0 deletions
@@ -1673,3 +1673,21 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {
   def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi>;
   def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
 }
+
+// v8.3-A Vector complex addition intrinsics
+let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
+  def VCADD_ROT90_FP16   : SInst<"vcadd_rot90", "...", "h">;
+  def VCADD_ROT270_FP16  : SInst<"vcadd_rot270", "...", "h">;
+  def VCADDQ_ROT90_FP16  : SInst<"vcaddq_rot90", "QQQ", "h">;
+  def VCADDQ_ROT270_FP16 : SInst<"vcaddq_rot270", "QQQ", "h">;
+}
+let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in {
+  def VCADD_ROT90   : SInst<"vcadd_rot90", "...", "f">;
+  def VCADD_ROT270  : SInst<"vcadd_rot270", "...", "f">;
+  def VCADDQ_ROT90  : SInst<"vcaddq_rot90", "QQQ", "f">;
+  def VCADDQ_ROT270 : SInst<"vcaddq_rot270", "QQQ", "f">;
+}
+let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
+  def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
+  def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
+}
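For reference, these TableGen definitions drive the arm_neon.h generator to emit intrinsics with roughly the following C prototypes (a sketch; the generated header also attaches the usual target attributes and always_inline wrappers):

float16x4_t vcadd_rot90_f16(float16x4_t a, float16x4_t b);
float16x4_t vcadd_rot270_f16(float16x4_t a, float16x4_t b);
float16x8_t vcaddq_rot90_f16(float16x8_t a, float16x8_t b);
float16x8_t vcaddq_rot270_f16(float16x8_t a, float16x8_t b);
float32x2_t vcadd_rot90_f32(float32x2_t a, float32x2_t b);
float32x2_t vcadd_rot270_f32(float32x2_t a, float32x2_t b);
float32x4_t vcaddq_rot90_f32(float32x4_t a, float32x4_t b);
float32x4_t vcaddq_rot270_f32(float32x4_t a, float32x4_t b);
float64x2_t vcaddq_rot90_f64(float64x2_t a, float64x2_t b);  /* AArch64 only */
float64x2_t vcaddq_rot270_f64(float64x2_t a, float64x2_t b); /* AArch64 only */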

clang/lib/Basic/Targets/AArch64.cpp

Lines changed: 1 addition & 0 deletions
@@ -158,6 +158,7 @@ void AArch64TargetInfo::getTargetDefinesARMV82A(const LangOptions &Opts,
 
 void AArch64TargetInfo::getTargetDefinesARMV83A(const LangOptions &Opts,
                                                 MacroBuilder &Builder) const {
+  Builder.defineMacro("__ARM_FEATURE_COMPLEX", "1");
   Builder.defineMacro("__ARM_FEATURE_JCVT", "1");
   // Also include the Armv8.2 defines
   getTargetDefinesARMV82A(Opts, Builder);

clang/lib/Basic/Targets/ARM.cpp

Lines changed: 12 additions & 0 deletions
@@ -580,6 +580,13 @@ void ARMTargetInfo::getTargetDefinesARMV82A(const LangOptions &Opts,
   getTargetDefinesARMV81A(Opts, Builder);
 }
 
+void ARMTargetInfo::getTargetDefinesARMV83A(const LangOptions &Opts,
+                                            MacroBuilder &Builder) const {
+  // Also include the ARMv8.2-A defines
+  Builder.defineMacro("__ARM_FEATURE_COMPLEX", "1");
+  getTargetDefinesARMV82A(Opts, Builder);
+}
+
 void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
                                      MacroBuilder &Builder) const {
   // Target identification.
@@ -809,6 +816,11 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
   case llvm::ARM::ArchKind::ARMV8_2A:
     getTargetDefinesARMV82A(Opts, Builder);
     break;
+  case llvm::ARM::ArchKind::ARMV8_3A:
+  case llvm::ARM::ArchKind::ARMV8_4A:
+  case llvm::ARM::ArchKind::ARMV8_5A:
+    getTargetDefinesARMV83A(Opts, Builder);
+    break;
   }
 }
 
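Since __ARM_FEATURE_COMPLEX is now defined for Armv8.3-A and later on both targets, user code can guard its use of the complex-arithmetic intrinsics on that macro. A sketch of such a guard (the scalar fallback below is illustrative only and is not part of the commit):

#include <arm_neon.h>

float32x2_t complex_add_rot90(float32x2_t a, float32x2_t b) {
#if defined(__ARM_FEATURE_COMPLEX)
  return vcadd_rot90_f32(a, b);          // single FCADD / VCADD.F32 instruction
#else
  // Fallback: rotate b by 90 degrees (multiply by i) by hand, then add.
  // For the pair {re, im}: result = {a.re - b.im, a.im + b.re}.
  float32x2_t b_neg_im = vset_lane_f32(-vget_lane_f32(b, 1), b, 1); // {b.re, -b.im}
  float32x2_t b_rot90  = vrev64_f32(b_neg_im);                      // {-b.im, b.re}
  return vadd_f32(a, b_rot90);
#endif
}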

clang/lib/Basic/Targets/ARM.h

Lines changed: 2 additions & 1 deletion
@@ -148,9 +148,10 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo {
 
   void getTargetDefinesARMV81A(const LangOptions &Opts,
                                MacroBuilder &Builder) const;
-
   void getTargetDefinesARMV82A(const LangOptions &Opts,
                                MacroBuilder &Builder) const;
+  void getTargetDefinesARMV83A(const LangOptions &Opts,
+                               MacroBuilder &Builder) const;
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override;
 

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 8 additions & 0 deletions
@@ -4454,6 +4454,10 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
   NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
   NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
   NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
+  NEONMAP1(vcadd_rot270_v, arm_neon_vcadd_rot270, Add1ArgType),
+  NEONMAP1(vcadd_rot90_v, arm_neon_vcadd_rot90, Add1ArgType),
+  NEONMAP1(vcaddq_rot270_v, arm_neon_vcadd_rot270, Add1ArgType),
+  NEONMAP1(vcaddq_rot90_v, arm_neon_vcadd_rot90, Add1ArgType),
   NEONMAP1(vcage_v, arm_neon_vacge, 0),
   NEONMAP1(vcageq_v, arm_neon_vacge, 0),
   NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
@@ -4727,6 +4731,10 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
   NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0),
   NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0),
   NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0),
+  NEONMAP1(vcadd_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),
+  NEONMAP1(vcadd_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),
+  NEONMAP1(vcaddq_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),
+  NEONMAP1(vcaddq_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),
   NEONMAP1(vcage_v, aarch64_neon_facge, 0),
   NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
   NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
clang/test/CodeGen/aarch64-neon-vcadd.c

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.3-a+fp16 %s -S -emit-llvm -o - | FileCheck %s
+
+#include <arm_neon.h>
+
+void foo16x4_rot90(float16x4_t a, float16x4_t b)
+{
+  // CHECK: call <4 x half> @llvm.aarch64.neon.vcadd.rot90.v4f16
+  float16x4_t result = vcadd_rot90_f16(a, b);
+}
+
+void foo32x2_rot90(float32x2_t a, float32x2_t b)
+{
+  // CHECK: call <2 x float> @llvm.aarch64.neon.vcadd.rot90.v2f32
+  float32x2_t result = vcadd_rot90_f32(a, b);
+}
+
+void foo16x8_rot90(float16x8_t a, float16x8_t b)
+{
+  // CHECK: call <8 x half> @llvm.aarch64.neon.vcadd.rot90.v8f16
+  float16x8_t result = vcaddq_rot90_f16(a, b);
+}
+
+void foo32x4_rot90(float32x4_t a, float32x4_t b)
+{
+  // CHECK: call <4 x float> @llvm.aarch64.neon.vcadd.rot90.v4f32
+  float32x4_t result = vcaddq_rot90_f32(a, b);
+}
+
+void foo64x2_rot90(float64x2_t a, float64x2_t b)
+{
+  // CHECK: call <2 x double> @llvm.aarch64.neon.vcadd.rot90.v2f64
+  float64x2_t result = vcaddq_rot90_f64(a, b);
+}
+
+void foo16x4_rot270(float16x4_t a, float16x4_t b)
+{
+  // CHECK: call <4 x half> @llvm.aarch64.neon.vcadd.rot270.v4f16
+  float16x4_t result = vcadd_rot270_f16(a, b);
+}
+
+void foo32x2_rot270(float32x2_t a, float32x2_t b)
+{
+  // CHECK: call <2 x float> @llvm.aarch64.neon.vcadd.rot270.v2f32
+  float32x2_t result = vcadd_rot270_f32(a, b);
+}
+
+void foo16x8_rot270(float16x8_t a, float16x8_t b)
+{
+  // CHECK: call <8 x half> @llvm.aarch64.neon.vcadd.rot270.v8f16
+  float16x8_t result = vcaddq_rot270_f16(a, b);
+}
+
+void foo32x4_rot270(float32x4_t a, float32x4_t b)
+{
+  // CHECK: call <4 x float> @llvm.aarch64.neon.vcadd.rot270.v4f32
+  float32x4_t result = vcaddq_rot270_f32(a, b);
+}
+
+void foo64x2_rot270(float64x2_t a, float64x2_t b)
+{
+  // CHECK: call <2 x double> @llvm.aarch64.neon.vcadd.rot270.v2f64
+  float64x2_t result = vcaddq_rot270_f64(a, b);
+}

clang/test/CodeGen/arm-neon-vcadd.c

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+// RUN: %clang -target arm-arm-none-eabi -march=armv8.3-a+fp16 %s -S -emit-llvm -o - | opt -S -sroa | FileCheck %s
+
+#include <arm_neon.h>
+
+void foo16x4_rot90(float16x4_t a, float16x4_t b)
+{
+  // CHECK: call <4 x half> @llvm.arm.neon.vcadd.rot90.v4f16
+  float16x4_t result = vcadd_rot90_f16(a, b);
+}
+
+void foo32x2_rot90(float32x2_t a, float32x2_t b)
+{
+  // CHECK: call <2 x float> @llvm.arm.neon.vcadd.rot90.v2f32
+  float32x2_t result = vcadd_rot90_f32(a, b);
+}
+
+void foo16x8_rot90(float16x8_t a, float16x8_t b)
+{
+  // CHECK: call <8 x half> @llvm.arm.neon.vcadd.rot90.v8f16
+  float16x8_t result = vcaddq_rot90_f16(a, b);
+}
+
+void foo32x4_rot90(float32x4_t a, float32x4_t b)
+{
+  // CHECK: call <4 x float> @llvm.arm.neon.vcadd.rot90.v4f32
+  float32x4_t result = vcaddq_rot90_f32(a, b);
+}
+
+void foo16x4_rot270(float16x4_t a, float16x4_t b)
+{
+  // CHECK: call <4 x half> @llvm.arm.neon.vcadd.rot270.v4f16
+  float16x4_t result = vcadd_rot270_f16(a, b);
+}
+
+void foo32x2_rot270(float32x2_t a, float32x2_t b)
+{
+  // CHECK: call <2 x float> @llvm.arm.neon.vcadd.rot270.v2f32
+  float32x2_t result = vcadd_rot270_f32(a, b);
+}
+
+void foo16x8_rot270(float16x8_t a, float16x8_t b)
+{
+  // CHECK: call <8 x half> @llvm.arm.neon.vcadd.rot270.v8f16
+  float16x8_t result = vcaddq_rot270_f16(a, b);
+}
+
+void foo32x4_rot270(float32x4_t a, float32x4_t b)
+{
+  // CHECK: call <4 x float> @llvm.arm.neon.vcadd.rot270.v4f32
+  float32x4_t result = vcaddq_rot270_f32(a, b);
+}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 4 additions & 0 deletions
@@ -446,6 +446,10 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
   def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
   def int_aarch64_neon_fmlal2 : AdvSIMD_FP16FML_Intrinsic;
   def int_aarch64_neon_fmlsl2 : AdvSIMD_FP16FML_Intrinsic;
+
+  // v8.3-A Floating-point complex add
+  def int_aarch64_neon_vcadd_rot90 : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_vcadd_rot270 : AdvSIMD_2VectorArg_Intrinsic;
 }
 
 let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".

llvm/include/llvm/IR/IntrinsicsARM.td

Lines changed: 4 additions & 0 deletions
@@ -778,6 +778,10 @@ def int_arm_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_arm_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_arm_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>;
 
+// v8.3-A Floating-point complex add
+def int_arm_neon_vcadd_rot90 : Neon_2Arg_Intrinsic;
+def int_arm_neon_vcadd_rot270 : Neon_2Arg_Intrinsic;
+
 // GNU eabi mcount
 def int_arm_gnu_eabi_mcount : Intrinsic<[],
                                         [],

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 23 additions & 0 deletions
@@ -757,6 +757,29 @@ defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
 defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla",
                                        null_frag>;
 
+let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
+  def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot90 (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+            (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 0))>;
+  def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot270 (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+            (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 1))>;
+  def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot90 (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+            (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 0))>;
+  def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot270 (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+            (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>;
+}
+let Predicates = [HasComplxNum, HasNEON] in {
+  def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+            (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>;
+  def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot270 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+            (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 1))>;
+  foreach Ty = [v4f32, v2f64] in {
+    def : Pat<(Ty (int_aarch64_neon_vcadd_rot90 (Ty V128:$Rn), (Ty V128:$Rm))),
+              (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 0))>;
+    def : Pat<(Ty (int_aarch64_neon_vcadd_rot270 (Ty V128:$Rn), (Ty V128:$Rm))),
+              (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 1))>;
+  }
+}
+
 // v8.3a Pointer Authentication
 // These instructions inhabit part of the hint space and so can be used for
 // armv8 targets

llvm/lib/Target/ARM/ARMInstrNEON.td

Lines changed: 21 additions & 0 deletions
@@ -5012,6 +5012,27 @@ defm VCMLA : N3VCP8ComplexTied<1, 0, "vcmla", null_frag>;
 defm VCADD : N3VCP8ComplexOdd<1, 0, 0, "vcadd", null_frag>;
 defm VCMLA : N3VCP8ComplexTiedLane<0, "vcmla", null_frag>;
 
+let Predicates = [HasNEON,HasV8_3a,HasFullFP16] in {
+  def : Pat<(v4f16 (int_arm_neon_vcadd_rot90 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))),
+            (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 0))>;
+  def : Pat<(v4f16 (int_arm_neon_vcadd_rot270 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))),
+            (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 1))>;
+  def : Pat<(v8f16 (int_arm_neon_vcadd_rot90 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))),
+            (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 0))>;
+  def : Pat<(v8f16 (int_arm_neon_vcadd_rot270 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))),
+            (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 1))>;
+}
+let Predicates = [HasNEON,HasV8_3a] in {
+  def : Pat<(v2f32 (int_arm_neon_vcadd_rot90 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))),
+            (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 0))>;
+  def : Pat<(v2f32 (int_arm_neon_vcadd_rot270 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))),
+            (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 1))>;
+  def : Pat<(v4f32 (int_arm_neon_vcadd_rot90 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))),
+            (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 0))>;
+  def : Pat<(v4f32 (int_arm_neon_vcadd_rot270 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))),
+            (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 1))>;
+}
+
 // Vector Subtract Operations.
 
 // VSUB : Vector Subtract (integer and floating-point)
llvm/test/CodeGen/AArch64/neon-vcadd.ll

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+; RUN: llc %s -mtriple=aarch64 -mattr=+v8.3a,+fullfp16 -o - | FileCheck %s
+
+define <4 x half> @foo16x4_rot(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: foo16x4_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #270
+  %vcadd_rot90_v2.i = tail call <4 x half> @llvm.aarch64.neon.vcadd.rot90.v4f16(<4 x half> %a, <4 x half> %b)
+  %vcadd_rot270_v2.i = tail call <4 x half> @llvm.aarch64.neon.vcadd.rot270.v4f16(<4 x half> %a, <4 x half> %b)
+  %add = fadd <4 x half> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i
+  ret <4 x half> %add
+}
+
+define <2 x float> @foo32x2_rot(<2 x float> %a, <2 x float> %b) {
+entry:
+; CHECK-LABEL: foo32x2_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #270
+  %vcadd_rot90_v2.i = tail call <2 x float> @llvm.aarch64.neon.vcadd.rot90.v2f32(<2 x float> %a, <2 x float> %b)
+  %vcadd_rot270_v2.i = tail call <2 x float> @llvm.aarch64.neon.vcadd.rot270.v2f32(<2 x float> %a, <2 x float> %b)
+  %add = fadd <2 x float> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i
+  ret <2 x float> %add
+}
+
+define <8 x half> @foo16x8_rot(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: foo16x8_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, #270
+  %vcaddq_rot90_v2.i = tail call <8 x half> @llvm.aarch64.neon.vcadd.rot90.v8f16(<8 x half> %a, <8 x half> %b)
+  %vcaddq_rot270_v2.i = tail call <8 x half> @llvm.aarch64.neon.vcadd.rot270.v8f16(<8 x half> %a, <8 x half> %b)
+  %add = fadd <8 x half> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i
+  ret <8 x half> %add
+}
+
+define <4 x float> @foo32x4_rot(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: foo32x4_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, #270
+  %vcaddq_rot90_v2.i = tail call <4 x float> @llvm.aarch64.neon.vcadd.rot90.v4f32(<4 x float> %a, <4 x float> %b)
+  %vcaddq_rot270_v2.i = tail call <4 x float> @llvm.aarch64.neon.vcadd.rot270.v4f32(<4 x float> %a, <4 x float> %b)
+  %add = fadd <4 x float> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i
+  ret <4 x float> %add
+}
+
+define <2 x double> @foo64x2_rot(<2 x double> %a, <2 x double> %b) {
+entry:
+; CHECK-LABEL: foo64x2_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, #270
+  %vcaddq_rot90_v2.i = tail call <2 x double> @llvm.aarch64.neon.vcadd.rot90.v2f64(<2 x double> %a, <2 x double> %b)
+  %vcaddq_rot270_v2.i = tail call <2 x double> @llvm.aarch64.neon.vcadd.rot270.v2f64(<2 x double> %a, <2 x double> %b)
+  %add = fadd <2 x double> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i
+  ret <2 x double> %add
+}
+
+declare <4 x half> @llvm.aarch64.neon.vcadd.rot90.v4f16(<4 x half>, <4 x half>)
+declare <4 x half> @llvm.aarch64.neon.vcadd.rot270.v4f16(<4 x half>, <4 x half>)
+declare <2 x float> @llvm.aarch64.neon.vcadd.rot90.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.aarch64.neon.vcadd.rot270.v2f32(<2 x float>, <2 x float>)
+declare <8 x half> @llvm.aarch64.neon.vcadd.rot90.v8f16(<8 x half>, <8 x half>)
+declare <8 x half> @llvm.aarch64.neon.vcadd.rot270.v8f16(<8 x half>, <8 x half>)
+declare <4 x float> @llvm.aarch64.neon.vcadd.rot90.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.aarch64.neon.vcadd.rot270.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vcadd.rot90.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.aarch64.neon.vcadd.rot270.v2f64(<2 x double>, <2 x double>)
