-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[NVPTX] Add max/minimumnum to ISel #155804
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Add direct support for the llvm maximumnum and minimumnum intrinsics, rather than lowering them to a sequence of compare + select instructions. The maximumnum and minimumnum intrinsics map directly to PTX max/min instructions. In future, the LLVM maxnum/minnum intrinsics might need some fix-ups for sNaN handling added, but currently, both llvm.maxnum and llvm.maximumnum will map directly to PTX "max" instructions.
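@llvm/pr-subscribers-backend-nvptx Author: Lewis Crawford (LewisCrawford) Changes: Add direct support for the LLVM `maximumnum` and `minimumnum` intrinsics, rather than lowering them to a sequence of compare + select instructions. The `maximumnum` and `minimumnum` intrinsics map directly to PTX `max`/`min` instructions. In future, the LLVM `maxnum`/`minnum` intrinsics might need some fix-ups for sNaN handling added, but currently, both `llvm.maxnum` and `llvm.maximumnum` will map directly to PTX `max` instructions. Patch is 20.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155804.diff 3 Files Affected: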
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 997c33f1f6a76..2025072efa0b9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -539,6 +539,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
+ case ISD::FMAXIMUMNUM:
+ case ISD::FMINIMUMNUM:
IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
break;
case ISD::FEXP2:
@@ -985,7 +987,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
- for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
+ for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);
setFP16OperationAction(Op, MVT::f16, Legal, Promote);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4d6f7b3d96601..21d2ee368cb70 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -914,6 +914,8 @@ defm MIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>;
defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>;
defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>;
+defm MINIMUMNUM : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminimumnum>;
+defm MAXIMUMNUM : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaximumnum>;
def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp,
[SDNPCommutative]>;
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index e9635e9393984..3cbb6b680fc05 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -42,6 +42,14 @@ declare half @llvm.maximum.f16(half, half) #0
declare float @llvm.maximum.f32(float, float) #0
declare double @llvm.maximum.f64(double, double) #0
declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>) #0
+declare half @llvm.minimumnum.f16(half, half) #0
+declare float @llvm.minimumnum.f32(float, float) #0
+declare double @llvm.minimumnum.f64(double, double) #0
+declare <2 x half> @llvm.minimumnum.v2f16(<2 x half>, <2 x half>) #0
+declare half @llvm.maximumnum.f16(half, half) #0
+declare float @llvm.maximumnum.f32(float, float) #0
+declare double @llvm.maximumnum.f64(double, double) #0
+declare <2 x half> @llvm.maximumnum.v2f16(<2 x half>, <2 x half>) #0
declare float @llvm.fma.f32(float, float, float) #0
declare double @llvm.fma.f64(double, double, double) #0
@@ -1486,6 +1494,410 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
ret <2 x half> %x
}
+; ---- minimumnum ----
+
+define half @minimumnum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: minimumnum_half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
+; CHECK-NOF16-NEXT: min.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimumnum_half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0];
+; CHECK-F16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1];
+; CHECK-F16-NEXT: min.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimumnum_half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
+; CHECK-SM80-NOF16-NEXT: min.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call half @llvm.minimumnum.f16(half %a, half %b)
+ ret half %x
+}
+
+define float @minimumnum_float(float %a, float %b) {
+; CHECK-LABEL: minimumnum_float(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_param_1];
+; CHECK-NEXT: min.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %x = call float @llvm.minimumnum.f32(float %a, float %b)
+ ret float %x
+}
+
+define float @minimumnum_float_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: minimumnum_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_ftz_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_ftz_param_1];
+; CHECK-NEXT: min.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %x = call float @llvm.minimumnum.f32(float %a, float %b)
+ ret float %x
+}
+
+define double @minimumnum_double(double %a, double %b) {
+; CHECK-LABEL: minimumnum_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [minimumnum_double_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [minimumnum_double_param_1];
+; CHECK-NEXT: min.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
+ %x = call double @llvm.minimumnum.f64(double %a, double %b)
+ ret double %x
+}
+
+; TODO Improve the "Expand" path for minimumnum vectors on targets where
+; f16 is not supported. Ideally it should use two f32 minimumnums first instead of
+; fully expanding the minimumnum instruction into compare/select instructions.
+define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: minimumnum_v2half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<13>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<17>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1];
+; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
+; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
+; CHECK-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4;
+; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
+; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
+; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
+; CHECK-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9;
+; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
+; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimumnum_v2half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [minimumnum_v2half_param_0];
+; CHECK-F16-NEXT: ld.param.b32 %r2, [minimumnum_v2half_param_1];
+; CHECK-F16-NEXT: min.f16x2 %r3, %r1, %r2;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimumnum_v2half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
+; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
+; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
+; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %x
+}
+
+; ---- maximumnum ----
+
+define half @maximumnum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: maximumnum_half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
+; CHECK-NOF16-NEXT: max.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximumnum_half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0];
+; CHECK-F16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1];
+; CHECK-F16-NEXT: max.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximumnum_half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
+; CHECK-SM80-NOF16-NEXT: max.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call half @llvm.maximumnum.f16(half %a, half %b)
+ ret half %x
+}
+
+define float @maximumnum_float(float %a, float %b) {
+; CHECK-LABEL: maximumnum_float(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_param_1];
+; CHECK-NEXT: max.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %x = call float @llvm.maximumnum.f32(float %a, float %b)
+ ret float %x
+}
+
+define float @maximumnum_float_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: maximumnum_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_ftz_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_ftz_param_1];
+; CHECK-NEXT: max.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %x = call float @llvm.maximumnum.f32(float %a, float %b)
+ ret float %x
+}
+
+define double @maximumnum_double(double %a, double %b) {
+; CHECK-LABEL: maximumnum_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [maximumnum_double_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [maximumnum_double_param_1];
+; CHECK-NEXT: max.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
+ %x = call double @llvm.maximumnum.f64(double %a, double %b)
+ ret double %x
+}
+
+; TODO Improve the "Expand" path for maximumnum vectors on targets where
+; f16 is not supported. Ideally it should use two f32 maximumnums first instead of
+; fully expanding the maximumnum instruction into compare/select instructions.
+define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: maximumnum_v2half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<13>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<17>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1];
+; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
+; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
+; CHECK-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4;
+; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
+; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
+; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
+; CHECK-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9;
+; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
+; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximumnum_v2half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [maximumnum_v2half_param_0];
+; CHECK-F16-NEXT: ld.param.b32 %r2, [maximumnum_v2half_param_1];
+; CHECK-F16-NEXT: max.f16x2 %r3, %r1, %r2;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximumnum_v2half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
+; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs1...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One minor tweak but otherwise LGTM
Add direct support for the LLVM `maximumnum` and `minimumnum` intrinsics, rather than lowering them to a sequence of compare + select instructions.
The `maximumnum` and `minimumnum` intrinsics map directly to PTX `max`/`min` instructions.
In future, the LLVM `maxnum`/`minnum` intrinsics might need some fix-ups for sNaN handling added, but currently, both `llvm.maxnum` and `llvm.maximumnum` will map directly to PTX `max` instructions.