diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 997c33f1f6a76..83b58f74ae3f9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -539,6 +539,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, case ISD::FMINNUM_IEEE: case ISD::FMAXIMUM: case ISD::FMINIMUM: + case ISD::FMAXIMUMNUM: + case ISD::FMINIMUMNUM: IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; break; case ISD::FEXP2: @@ -985,7 +987,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, if (getOperationAction(ISD::FABS, MVT::bf16) == Promote) AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32); - for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { + for (const auto &Op : + {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) { setOperationAction(Op, MVT::f32, Legal); setOperationAction(Op, MVT::f64, Legal); setFP16OperationAction(Op, MVT::f16, Legal, Promote); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 4d6f7b3d96601..f7de6d1a9e1f4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -294,7 +294,7 @@ multiclass ADD_SUB_INT_CARRY { // // Also defines ftz (flush subnormal inputs and results to sign-preserving // zero) variants for fp32 functions. 
-multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> { +multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDPatternOperator OpNode> { defvar nan_str = !if(NaN, ".NaN", ""); if !not(NaN) then { def _f64_rr : @@ -910,8 +910,15 @@ defm FADD : F3_fma_component<"add", fadd>; defm FSUB : F3_fma_component<"sub", fsub>; defm FMUL : F3_fma_component<"mul", fmul>; -defm MIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>; -defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>; +def fminnum_or_fminimumnum : PatFrags<(ops node:$a, node:$b), + [(fminnum node:$a, node:$b), + (fminimumnum node:$a, node:$b)]>; +def fmaxnum_or_fmaximumnum : PatFrags<(ops node:$a, node:$b), + [(fmaxnum node:$a, node:$b), + (fmaximumnum node:$a, node:$b)]>; + +defm MIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum_or_fminimumnum>; +defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum_or_fmaximumnum>; defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>; defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>; diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll index e9635e9393984..3cbb6b680fc05 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -42,6 +42,14 @@ declare half @llvm.maximum.f16(half, half) #0 declare float @llvm.maximum.f32(float, float) #0 declare double @llvm.maximum.f64(double, double) #0 declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>) #0 +declare half @llvm.minimumnum.f16(half, half) #0 +declare float @llvm.minimumnum.f32(float, float) #0 +declare double @llvm.minimumnum.f64(double, double) #0 +declare <2 x half> @llvm.minimumnum.v2f16(<2 x half>, <2 x half>) #0 +declare half @llvm.maximumnum.f16(half, half) #0 +declare float @llvm.maximumnum.f32(float, float) #0 +declare double @llvm.maximumnum.f64(double, double) #0 +declare <2 x half> @llvm.maximumnum.v2f16(<2 x half>, <2 x half>) #0 declare float @llvm.fma.f32(float, float, float) #0 declare double @llvm.fma.f64(double, double, double) #0 @@ -1486,6 
+1494,410 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) { ret <2 x half> %x } +; ---- minimumnum ---- + +define half @minimumnum_half(half %a, half %b) { +; CHECK-NOF16-LABEL: minimumnum_half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0]; +; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: minimumnum_half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0]; +; CHECK-F16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1]; +; CHECK-F16-NEXT: min.f16 %rs3, %rs1, %rs2; +; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: minimumnum_half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-SM80-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call half @llvm.minimumnum.f16(half %a, half %b) + ret half %x +} + +define float @minimumnum_float(float %a, float %b) { +; CHECK-LABEL: minimumnum_float( +; CHECK: { +; CHECK-NEXT: 
.reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_param_1]; +; CHECK-NEXT: min.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.minimumnum.f32(float %a, float %b) + ret float %x +} + +define float @minimumnum_float_ftz(float %a, float %b) #1 { +; CHECK-LABEL: minimumnum_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_ftz_param_1]; +; CHECK-NEXT: min.ftz.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.minimumnum.f32(float %a, float %b) + ret float %x +} + +define double @minimumnum_double(double %a, double %b) { +; CHECK-LABEL: minimumnum_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [minimumnum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [minimumnum_double_param_1]; +; CHECK-NEXT: min.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %x = call double @llvm.minimumnum.f64(double %a, double %b) + ret double %x +} + +; TODO Improve the "Expand" path for minimumnum vectors on targets where +; f16 is not supported. Ideally it should use two f32 minimumnums first instead of +; fully expanding the minimumnum instruction into compare/select instructions. 
+define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) { +; CHECK-NOF16-LABEL: minimumnum_v2half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs15, 
%rs12, %rs14, %p11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: minimumnum_v2half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [minimumnum_v2half_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [minimumnum_v2half_param_1]; +; CHECK-F16-NEXT: min.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: minimumnum_v2half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; 
CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %x +} + +; ---- maximumnum ---- + +define half @maximumnum_half(half %a, half %b) { +; CHECK-NOF16-LABEL: maximumnum_half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0]; +; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: maximumnum_half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<4>; +; CHECK-F16-EMPTY: +; 
CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0]; +; CHECK-F16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1]; +; CHECK-F16-NEXT: max.f16 %rs3, %rs1, %rs2; +; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: maximumnum_half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-SM80-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call half @llvm.maximumnum.f16(half %a, half %b) + ret half %x +} + +define float @maximumnum_float(float %a, float %b) { +; CHECK-LABEL: maximumnum_float( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_param_1]; +; CHECK-NEXT: max.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.maximumnum.f32(float %a, float %b) + ret float %x +} + +define float @maximumnum_float_ftz(float %a, float %b) #1 { +; CHECK-LABEL: maximumnum_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_ftz_param_1]; +; CHECK-NEXT: max.ftz.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.maximumnum.f32(float %a, float %b) + ret float %x +} + 
+define double @maximumnum_double(double %a, double %b) { +; CHECK-LABEL: maximumnum_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [maximumnum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [maximumnum_double_param_1]; +; CHECK-NEXT: max.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %x = call double @llvm.maximumnum.f64(double %a, double %b) + ret double %x +} + +; TODO Improve the "Expand" path for maximumnum vectors on targets where +; f16 is not supported. Ideally it should use two f32 maximumnums first instead of +; fully expanding the maximumnum instruction into compare/select instructions. +define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) { +; CHECK-NOF16-LABEL: maximumnum_v2half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-NOF16-NEXT: 
selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: maximumnum_v2half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [maximumnum_v2half_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [maximumnum_v2half_param_1]; +; CHECK-F16-NEXT: max.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: maximumnum_v2half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; 
CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %x +} + ; ---- fma ---- define float @fma_float(float %a, float %b, float %c) {