diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cee593def653c..2b3e912d03f61 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10086,6 +10086,50 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
     return Combined;
 
+  // fold (xor (smin(x, C), C)) -> select (x < C), xor(x, C), 0
+  // fold (xor (smax(x, C), C)) -> select (x > C), xor(x, C), 0
+  // fold (xor (umin(x, C), C)) -> select (x < C), xor(x, C), 0
+  // fold (xor (umax(x, C), C)) -> select (x > C), xor(x, C), 0
+  SDValue Op0;
+  if (sd_match(N0, m_OneUse(m_AnyOf(m_SMin(m_Value(Op0), m_Specific(N1)),
+                                    m_SMax(m_Value(Op0), m_Specific(N1)),
+                                    m_UMin(m_Value(Op0), m_Specific(N1)),
+                                    m_UMax(m_Value(Op0), m_Specific(N1)))))) {
+
+    if (isa<ConstantSDNode>(N1) ||
+        ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
+      // For vectors, only optimize when the constant is zero or all-ones to
+      // avoid generating more instructions
+      if (VT.isVector()) {
+        ConstantSDNode *N1C = isConstOrConstSplat(N1);
+        if (!N1C || (!N1C->isZero() && !N1C->isAllOnes()))
+          return SDValue();
+      }
+
+      EVT CCVT = getSetCCResultType(VT);
+      ISD::CondCode CC;
+      switch (N0.getOpcode()) {
+      case ISD::SMIN:
+        CC = ISD::SETLT;
+        break;
+      case ISD::SMAX:
+        CC = ISD::SETGT;
+        break;
+      case ISD::UMIN:
+        CC = ISD::SETULT;
+        break;
+      case ISD::UMAX:
+        CC = ISD::SETUGT;
+        break;
+      }
+      SDValue FN1 = DAG.getFreeze(N1);
+      SDValue Cmp = DAG.getSetCC(DL, CCVT, Op0, FN1, CC);
+      SDValue XorXC = DAG.getNode(ISD::XOR, DL, VT, Op0, FN1);
+      SDValue Zero = DAG.getConstant(0, DL, VT);
+      return DAG.getSelect(DL, VT, Cmp, XorXC, Zero);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/xor-smin-smax.ll b/llvm/test/CodeGen/AArch64/xor-smin-smax.ll
new file mode 100644
index 0000000000000..904397a23afd1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/xor-smin-smax.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+; Test for DAGCombiner optimization: fold (xor (smin(x, C), C)) -> select (x < C), xor (x, C), 0
+
+define i64 @test_smin_neg_one(i64 %a) {
+; CHECK-LABEL: test_smin_neg_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmn x0, #1
+; CHECK-NEXT:    csinv x0, xzr, x0, ge
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 -1)
+  %retval.0 = xor i64 %1, -1
+  ret i64 %retval.0
+}
+
+define i64 @test_smin_zero(i64 %a) {
+; CHECK-LABEL: test_smin_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, x0, asr #63
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 0)
+  %retval.0 = xor i64 %1, 0
+  ret i64 %retval.0
+}
+
+define i64 @test_smin_constant(i64 %a) {
+; CHECK-LABEL: test_smin_constant:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x0, #0x8
+; CHECK-NEXT:    cmp x0, #8
+; CHECK-NEXT:    csel x0, x8, xzr, lt
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 8)
+  %retval.0 = xor i64 %1, 8
+  ret i64 %retval.0
+}
+
+; Test for DAGCombiner optimization: fold (xor (smax(x, C), C)) -> select (x > C), xor (x, C), 0
+
+define i64 @test_smax_neg_one(i64 %a) {
+; CHECK-LABEL: test_smax_neg_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn x8, x0
+; CHECK-NEXT:    bic x0, x8, x0, asr #63
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.smax.i64(i64 %a, i64 -1)
+  %retval.0 = xor i64 %1, -1
+  ret i64 %retval.0
+}
+
+define i64 @test_smax_zero(i64 %a) {
+; CHECK-LABEL: test_smax_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic x0, x0, x0, asr #63
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.smax.i64(i64 %a, i64 0)
+  %retval.0 = xor i64 %1, 0
+  ret i64 %retval.0
+}
+
+define i64 @test_smax_constant(i64 %a) {
+; CHECK-LABEL: test_smax_constant:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x0, #0x8
+; CHECK-NEXT:    cmp x0, #8
+; CHECK-NEXT:    csel x0, x8, xzr, gt
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.smax.i64(i64 %a, i64 8)
+  %retval.0 = xor i64 %1, 8
+  ret i64 %retval.0
+}
+
+define i64 @test_umin_neg_one(i64 %a) {
+; CHECK-LABEL: test_umin_neg_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn x0, x0
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.umin.i64(i64 %a, i64 -1)
+  %retval.0 = xor i64 %1, -1
+  ret i64 %retval.0
+}
+
+define i64 @test_umin_zero(i64 %a) {
+; CHECK-LABEL: test_umin_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.umin.i64(i64 %a, i64 0)
+  %retval.0 = xor i64 %1, 0
+  ret i64 %retval.0
+}
+
+define i64 @test_umin_constant(i64 %a) {
+; CHECK-LABEL: test_umin_constant:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x0, #0x8
+; CHECK-NEXT:    cmp x0, #8
+; CHECK-NEXT:    csel x0, x8, xzr, lo
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.umin.i64(i64 %a, i64 8)
+  %retval.0 = xor i64 %1, 8
+  ret i64 %retval.0
+}
+
+define i64 @test_umax_neg_one(i64 %a) {
+; CHECK-LABEL: test_umax_neg_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.umax.i64(i64 %a, i64 -1)
+  %retval.0 = xor i64 %1, -1
+  ret i64 %retval.0
+}
+
+define i64 @test_umax_zero(i64 %a) {
+; CHECK-LABEL: test_umax_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.umax.i64(i64 %a, i64 0)
+  %retval.0 = xor i64 %1, 0
+  ret i64 %retval.0
+}
+
+define i64 @test_umax_constant(i64 %a) {
+; CHECK-LABEL: test_umax_constant:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x0, #0x8
+; CHECK-NEXT:    cmp x0, #8
+; CHECK-NEXT:    csel x0, x8, xzr, hi
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.umax.i64(i64 %a, i64 8)
+  %retval.0 = xor i64 %1, 8
+  ret i64 %retval.0
+}
+
+; Test vector cases
+
+define <4 x i32> @test_smin_vector_neg_one(<4 x i32> %a) {
+; CHECK-LABEL: test_smin_vector_neg_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
+; CHECK-NEXT:    cmgt v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+  %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_smin_vector_zero(<4 x i32> %a) {
+; CHECK-LABEL: test_smin_vector_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  %retval.0 = xor <4 x i32> %1, <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_smin_vector_constant(<4 x i32> %a) {
+; CHECK-LABEL: test_smin_vector_constant:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #8
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+  %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_smax_vector_neg_one(<4 x i32> %a) {
+; CHECK-LABEL: test_smax_vector_neg_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v1.4s, v0.4s, #0
+; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+  %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_smax_vector_zero(<4 x i32> %a) {
+; CHECK-LABEL: test_smax_vector_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  %retval.0 = xor <4 x i32> %1, <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_smax_vector_constant(<4 x i32> %a) {
+; CHECK-LABEL: test_smax_vector_constant:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #8
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+  %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umin_vector_neg_one(<4 x i32> %a) {
+; CHECK-LABEL: test_umin_vector_neg_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+  %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umin_vector_zero(<4 x i32> %a) {
+; CHECK-LABEL: test_umin_vector_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  %retval.0 = xor <4 x i32> %1, <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umin_vector_constant(<4 x i32> %a) {
+; CHECK-LABEL: test_umin_vector_constant:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #8
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+  %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umax_vector_neg_one(<4 x i32> %a) {
+; CHECK-LABEL: test_umax_vector_neg_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+  %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umax_vector_zero(<4 x i32> %a) {
+; CHECK-LABEL: test_umax_vector_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  %retval.0 = xor <4 x i32> %1, <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umax_vector_constant(<4 x i32> %a) {
+; CHECK-LABEL: test_umax_vector_constant:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #8
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %1 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+  %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %retval.0
+}
+
+declare i64 @llvm.smin.i64(i64, i64)
+declare i64 @llvm.smax.i64(i64, i64)
+declare i64 @llvm.umin.i64(i64, i64)
+declare i64 @llvm.umax.i64(i64, i64)
+declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)