Skip to content

Commit bb4da94

Browse files
committed
[ARM,CDE] Implement predicated Q-register CDE intrinsics
Summary: This patch implements the following CDE intrinsics: T __arm_vcx1q_m(int coproc, T inactive, uint32_t imm, mve_pred16_t p); T __arm_vcx2q_m(int coproc, T inactive, U n, uint32_t imm, mve_pred16_t p); T __arm_vcx3q_m(int coproc, T inactive, U n, V m, uint32_t imm, mve_pred16_t p); T __arm_vcx1qa_m(int coproc, T acc, uint32_t imm, mve_pred16_t p); T __arm_vcx2qa_m(int coproc, T acc, U n, uint32_t imm, mve_pred16_t p); T __arm_vcx3qa_m(int coproc, T acc, U n, V m, uint32_t imm, mve_pred16_t p); The intrinsics are not part of the released ACLE spec, but internally at Arm we have reached consensus to add them to the next ACLE release. Reviewers: simon_tatham, MarkMurrayARM, ostannard, dmgreen Reviewed By: simon_tatham Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D76610
1 parent 8f237f9 commit bb4da94

File tree

5 files changed

+276
-1
lines changed

5 files changed

+276
-1
lines changed

clang/include/clang/Basic/arm_cde.td

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,40 @@ def vcx3qa : FunctionMacro<
189189
"__arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), "
190190
"__arm_vreinterpretq_u8(m), (imm))">;
191191

192+
class CDEIntrinsicMasked<string irname, dag argsReg, dag imm, dag cgArgs>
193+
: CDEIntrinsic<Vector,
194+
!con((args imm_coproc:$cp, Vector:$inactive_or_acc),
195+
argsReg, imm, (args Predicate:$pred)),
196+
!con((CDEIRInt<irname # "_predicated", [Vector,Predicate]>
197+
$cp, $inactive_or_acc), cgArgs, (? $imm, $pred))> {
198+
let params = T.All;
199+
let polymorphicOnly = 1;
200+
}
201+
202+
def vcx1q_m : CDEIntrinsicMasked<"vcx1q", (args), (args imm_12b:$imm), (?)>;
203+
def vcx1qa_m : CDEIntrinsicMasked<"vcx1qa", (args), (args imm_12b:$imm), (?)>;
204+
205+
multiclass VCXPredicated<dag argsReg, dag imm, dag cgArgs,
206+
list<string> macroArgs, string macro> {
207+
def _m_impl : CDEIntrinsicMasked<NAME, argsReg, imm, cgArgs>;
208+
def a_m_impl : CDEIntrinsicMasked<NAME#"a", argsReg, imm, cgArgs>;
209+
210+
def _m: FunctionMacro<
211+
!listconcat(["cp", "inactive"], macroArgs, ["imm", "pred"]),
212+
"__arm_"#NAME#"_m_impl((cp), (inactive), "#macro#" (imm), (pred))">;
213+
def a_m: FunctionMacro<
214+
!listconcat(["cp", "acc"], macroArgs, ["imm", "pred"]),
215+
"__arm_"#NAME#"a_m_impl((cp), (acc), "#macro#" (imm), (pred))">;
216+
}
217+
218+
defm vcx2q :
219+
VCXPredicated<(args v16u8:$n), (args imm_7b:$imm), (? $n), ["n"],
220+
"__arm_vreinterpretq_u8(n),">;
221+
defm vcx3q :
222+
VCXPredicated<(args v16u8:$n, v16u8:$m), (args imm_4b:$imm), (? $n, $m),
223+
["n", "m"], "__arm_vreinterpretq_u8(n), "
224+
"__arm_vreinterpretq_u8(m),">;
225+
192226
// vreinterpretq intrinsics required by the ACLE CDE specification
193227

194228
foreach desttype = [/* no u8 */ s8, u16, s16, u32, s32, u64, s64, f16, f32] in {

clang/test/CodeGen/arm-cde-vec.c

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,75 @@ uint64x2_t test_vcx3q(uint64x2_t n, float32x4_t m) {
102102
int8x16_t test_vcx3qa(int8x16_t acc, uint16x8_t n, float32x4_t m) {
103103
return __arm_vcx3qa(1, acc, n, m, 13);
104104
}
105+
106+
// CHECK-LABEL: @test_vcx1q_m(
107+
// CHECK-NEXT: entry:
108+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
109+
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
110+
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 0, <8 x i16> [[INACTIVE:%.*]], i32 1111, <8 x i1> [[TMP1]])
111+
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
112+
//
113+
uint16x8_t test_vcx1q_m(uint16x8_t inactive, mve_pred16_t p) {
114+
return __arm_vcx1q_m(0, inactive, 1111, p);
115+
}
116+
117+
// CHECK-LABEL: @test_vcx1qa_m(
118+
// CHECK-NEXT: entry:
119+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
120+
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
121+
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 1, <16 x i8> [[ACC:%.*]], i32 1112, <16 x i1> [[TMP1]])
122+
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
123+
//
124+
uint8x16_t test_vcx1qa_m(uint8x16_t acc, mve_pred16_t p) {
125+
return __arm_vcx1qa_m(1, acc, 1112, p);
126+
}
127+
128+
// CHECK-LABEL: @test_vcx2q_m(
129+
// CHECK-NEXT: entry:
130+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8>
131+
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
132+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
133+
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 0, <4 x i32> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], i32 111, <4 x i1> [[TMP2]])
134+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
135+
//
136+
int32x4_t test_vcx2q_m(int32x4_t inactive, float32x4_t n, mve_pred16_t p) {
137+
return __arm_vcx2q_m(0, inactive, n, 111, p);
138+
}
139+
140+
// CHECK-LABEL: @test_vcx2qa_m(
141+
// CHECK-NEXT: entry:
142+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8>
143+
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
144+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
145+
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 0, <4 x float> [[ACC:%.*]], <16 x i8> [[TMP0]], i32 112, <4 x i1> [[TMP2]])
146+
// CHECK-NEXT: ret <4 x float> [[TMP3]]
147+
//
148+
float32x4_t test_vcx2qa_m(float32x4_t acc, float16x8_t n, mve_pred16_t p) {
149+
return __arm_vcx2qa_m(0, acc, n, 112, p);
150+
}
151+
152+
// CHECK-LABEL: @test_vcx3q_m(
153+
// CHECK-NEXT: entry:
154+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8>
155+
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
156+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
157+
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 1, <2 x i64> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[M:%.*]], i32 11, <4 x i1> [[TMP2]])
158+
// CHECK-NEXT: ret <2 x i64> [[TMP3]]
159+
//
160+
int64x2_t test_vcx3q_m(int64x2_t inactive, float32x4_t n, int8x16_t m, mve_pred16_t p) {
161+
return __arm_vcx3q_m(1, inactive, n, m, 11, p);
162+
}
163+
164+
// CHECK-LABEL: @test_vcx3qa_m(
165+
// CHECK-NEXT: entry:
166+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8>
167+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[M:%.*]] to <16 x i8>
168+
// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
169+
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
170+
// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.arm.cde.vcx3qa.predicated.v4f32.v4i1(i32 1, <4 x float> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12, <4 x i1> [[TMP3]])
171+
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <8 x half>
172+
// CHECK-NEXT: ret <8 x half> [[TMP5]]
173+
//
174+
float16x8_t test_vcx3qa_m(float32x4_t inactive, float16x8_t n, uint32x4_t m, mve_pred16_t p) {
175+
return __arm_vcx3qa_m(1, inactive, n, m, 12, p);
176+
}

llvm/include/llvm/IR/IntrinsicsARM.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1332,6 +1332,17 @@ multiclass CDEVCXVecIntrinsics<list<LLVMType> args> {
13321332
!listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */],
13331333
args, [llvm_i32_ty /* imm */]),
13341334
[IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
1335+
1336+
def _predicated : Intrinsic<
1337+
[llvm_anyvector_ty],
1338+
!listconcat([llvm_i32_ty /* coproc */, LLVMMatchType<0> /* inactive */],
1339+
args, [llvm_i32_ty /* imm */, llvm_anyvector_ty /* mask */]),
1340+
[IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
1341+
def a_predicated : Intrinsic<
1342+
[llvm_anyvector_ty],
1343+
!listconcat([llvm_i32_ty /* coproc */, LLVMMatchType<0> /* acc */],
1344+
args, [llvm_i32_ty /* imm */, llvm_anyvector_ty /* mask */]),
1345+
[IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
13351346
}
13361347

13371348
defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>;

llvm/lib/Target/ARM/ARMInstrCDE.td

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,3 +606,61 @@ let Predicates = [HasCDE, HasMVEInt] in {
606606
(v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m,
607607
imm_4b:$imm))>;
608608
}
609+
610+
multiclass VCXPredicatedPat_m<MVEVectorVTInfo VTI> {
611+
def : Pat<(VTI.Vec (int_arm_cde_vcx1q_predicated timm:$coproc,
612+
(VTI.Vec MQPR:$inactive), timm:$imm,
613+
(VTI.Pred VCCR:$pred))),
614+
(VTI.Vec (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm, ARMVCCThen,
615+
(VTI.Pred VCCR:$pred),
616+
(VTI.Vec MQPR:$inactive)))>;
617+
def : Pat<(VTI.Vec (int_arm_cde_vcx1qa_predicated timm:$coproc,
618+
(VTI.Vec MQPR:$acc), timm:$imm,
619+
(VTI.Pred VCCR:$pred))),
620+
(VTI.Vec (CDE_VCX1A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
621+
imm_12b:$imm, ARMVCCThen,
622+
(VTI.Pred VCCR:$pred)))>;
623+
624+
def : Pat<(VTI.Vec (int_arm_cde_vcx2q_predicated timm:$coproc,
625+
(VTI.Vec MQPR:$inactive),
626+
(v16i8 MQPR:$n), timm:$imm,
627+
(VTI.Pred VCCR:$pred))),
628+
(VTI.Vec (CDE_VCX2_vec p_imm:$coproc, (v16i8 MQPR:$n),
629+
imm_7b:$imm, ARMVCCThen,
630+
(VTI.Pred VCCR:$pred),
631+
(VTI.Vec MQPR:$inactive)))>;
632+
def : Pat<(VTI.Vec (int_arm_cde_vcx2qa_predicated timm:$coproc, // selects the predicated accumulating VCX2 intrinsic to CDE_VCX2A_vec under ARMVCCThen
633+
(VTI.Vec MQPR:$acc),
634+
(v16i8 MQPR:$n), timm:$imm,
635+
(VTI.Pred VCCR:$pred))),
636+
(VTI.Vec (CDE_VCX2A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
637+
(v16i8 MQPR:$n), imm_7b:$imm, ARMVCCThen, // imm_7b operand, matching the vcx2q_predicated pattern
638+
(VTI.Pred VCCR:$pred)))>;
639+
640+
def : Pat<(VTI.Vec (int_arm_cde_vcx3q_predicated timm:$coproc,
641+
(VTI.Vec MQPR:$inactive),
642+
(v16i8 MQPR:$n), (v16i8 MQPR:$m),
643+
timm:$imm,
644+
(VTI.Pred VCCR:$pred))),
645+
(VTI.Vec (CDE_VCX3_vec p_imm:$coproc, (v16i8 MQPR:$n),
646+
(v16i8 MQPR:$m),
647+
imm_4b:$imm, ARMVCCThen,
648+
(VTI.Pred VCCR:$pred),
649+
(VTI.Vec MQPR:$inactive)))>;
650+
def : Pat<(VTI.Vec (int_arm_cde_vcx3qa_predicated timm:$coproc,
651+
(VTI.Vec MQPR:$acc),
652+
(v16i8 MQPR:$n), (v16i8 MQPR:$m), timm:$imm,
653+
(VTI.Pred VCCR:$pred))),
654+
(VTI.Vec (CDE_VCX3A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
655+
(v16i8 MQPR:$n), (v16i8 MQPR:$m),
656+
imm_4b:$imm, ARMVCCThen,
657+
(VTI.Pred VCCR:$pred)))>;
658+
}
659+
660+
let Predicates = [HasCDE, HasMVEInt] in
661+
foreach VTI = [ MVE_v16i8, MVE_v8i16, MVE_v4i32, MVE_v2i64 ] in
662+
defm : VCXPredicatedPat_m<VTI>;
663+
664+
let Predicates = [HasCDE, HasMVEFloat] in
665+
foreach VTI = [ MVE_v8f16, MVE_v4f32 ] in
666+
defm : VCXPredicatedPat_m<VTI>;

llvm/test/CodeGen/Thumb2/cde-vec.ll

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
2+
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
33

44
declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg)
55
declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg)
@@ -112,3 +112,103 @@ entry:
112112
%2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13)
113113
ret <16 x i8> %2
114114
}
115+
116+
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
117+
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
118+
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
119+
declare <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 immarg, <8 x i16>, i32 immarg, <8 x i1>)
120+
declare <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 immarg, <16 x i8>, i32 immarg, <16 x i1>)
121+
declare <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 immarg, <4 x i32>, <16 x i8>, i32 immarg, <4 x i1>)
122+
declare <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, i32 immarg, <4 x i1>)
123+
declare <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>)
124+
declare <4 x float> @llvm.arm.cde.vcx3qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>)
125+
126+
define arm_aapcs_vfpcc <8 x i16> @test_vcx1q_m(<8 x i16> %inactive, i16 zeroext %p) {
127+
; CHECK-LABEL: test_vcx1q_m:
128+
; CHECK: @ %bb.0: @ %entry
129+
; CHECK-NEXT: vmsr p0, r0
130+
; CHECK-NEXT: vpst
131+
; CHECK-NEXT: vcx1t p0, q0, #1111
132+
; CHECK-NEXT: bx lr
133+
entry:
134+
%0 = zext i16 %p to i32
135+
%1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
136+
%2 = call <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 0, <8 x i16> %inactive, i32 1111, <8 x i1> %1)
137+
ret <8 x i16> %2
138+
}
139+
140+
define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_m(<16 x i8> %acc, i16 zeroext %p) {
141+
; CHECK-LABEL: test_vcx1qa_m:
142+
; CHECK: @ %bb.0: @ %entry
143+
; CHECK-NEXT: vmsr p0, r0
144+
; CHECK-NEXT: vpst
145+
; CHECK-NEXT: vcx1at p1, q0, #1112
146+
; CHECK-NEXT: bx lr
147+
entry:
148+
%0 = zext i16 %p to i32
149+
%1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
150+
%2 = call <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 1, <16 x i8> %acc, i32 1112, <16 x i1> %1)
151+
ret <16 x i8> %2
152+
}
153+
154+
define arm_aapcs_vfpcc <4 x i32> @test_vcx2q_m(<4 x i32> %inactive, <4 x float> %n, i16 zeroext %p) {
155+
; CHECK-LABEL: test_vcx2q_m:
156+
; CHECK: @ %bb.0: @ %entry
157+
; CHECK-NEXT: vmsr p0, r0
158+
; CHECK-NEXT: vpst
159+
; CHECK-NEXT: vcx2t p0, q0, q1, #111
160+
; CHECK-NEXT: bx lr
161+
entry:
162+
%0 = bitcast <4 x float> %n to <16 x i8>
163+
%1 = zext i16 %p to i32
164+
%2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
165+
%3 = call <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 0, <4 x i32> %inactive, <16 x i8> %0, i32 111, <4 x i1> %2)
166+
ret <4 x i32> %3
167+
}
168+
169+
define arm_aapcs_vfpcc <4 x float> @test_vcx2qa_m(<4 x float> %acc, <8 x half> %n, i16 zeroext %p) {
170+
; CHECK-LABEL: test_vcx2qa_m:
171+
; CHECK: @ %bb.0: @ %entry
172+
; CHECK-NEXT: vmsr p0, r0
173+
; CHECK-NEXT: vpst
174+
; CHECK-NEXT: vcx2at p0, q0, q1, #112
175+
; CHECK-NEXT: bx lr
176+
entry:
177+
%0 = bitcast <8 x half> %n to <16 x i8>
178+
%1 = zext i16 %p to i32
179+
%2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
180+
%3 = call <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 0, <4 x float> %acc, <16 x i8> %0, i32 112, <4 x i1> %2)
181+
ret <4 x float> %3
182+
}
183+
184+
define arm_aapcs_vfpcc <2 x i64> @test_vcx3q_m(<2 x i64> %inactive, <4 x float> %n, <16 x i8> %m, i16 zeroext %p) {
185+
; CHECK-LABEL: test_vcx3q_m:
186+
; CHECK: @ %bb.0: @ %entry
187+
; CHECK-NEXT: vmsr p0, r0
188+
; CHECK-NEXT: vpst
189+
; CHECK-NEXT: vcx3t p0, q0, q1, q2, #11
190+
; CHECK-NEXT: bx lr
191+
entry:
192+
%0 = bitcast <4 x float> %n to <16 x i8>
193+
%1 = zext i16 %p to i32
194+
%2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
195+
%3 = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 0, <2 x i64> %inactive, <16 x i8> %0, <16 x i8> %m, i32 11, <4 x i1> %2)
196+
ret <2 x i64> %3
197+
}
198+
199+
define arm_aapcs_vfpcc <8 x half> @test_vcx3qa_m(<4 x float> %inactive, <8 x half> %n, <4 x i32> %m, i16 zeroext %p) {
200+
; CHECK-LABEL: test_vcx3qa_m:
201+
; CHECK: @ %bb.0: @ %entry
202+
; CHECK-NEXT: vmsr p0, r0
203+
; CHECK-NEXT: vpst
204+
; CHECK-NEXT: vcx3at p0, q0, q1, q2, #12
205+
; CHECK-NEXT: bx lr
206+
entry:
207+
%0 = bitcast <8 x half> %n to <16 x i8>
208+
%1 = bitcast <4 x i32> %m to <16 x i8>
209+
%2 = zext i16 %p to i32
210+
%3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
211+
%4 = call <4 x float> @llvm.arm.cde.vcx3qa.predicated.v4f32.v4i1(i32 0, <4 x float> %inactive, <16 x i8> %0, <16 x i8> %1, i32 12, <4 x i1> %3)
212+
%5 = bitcast <4 x float> %4 to <8 x half>
213+
ret <8 x half> %5
214+
}

0 commit comments

Comments
 (0)