-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[PowerPC] Optimize not equal compares against zero vectors #150422
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-powerpc Author: None (Himadhith) ChangesThis patch is for special cases involving 0 vectors. During the comparison of vector operands, current code generation checks with This means that for the special case, instead of using As a result the negation is avoided since the only condition where this will be false is for 0 as it is an Full diff: https://github.com/llvm/llvm-project/pull/150422.diff 2 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 415164fc9e2cb..9d4289d241cd1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -4569,7 +4569,14 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
if (!IsStrict && LHS.getValueType().isVector()) {
if (Subtarget->hasSPE())
return false;
-
+ // Check if RHS or LHS vector operands are 0 and change SETNE to either
+ // SETUGT or SETULT.
+ if (CC == ISD::SETNE) {
+ if (ISD::isBuildVectorAllZeros(RHS.getNode()))
+ CC = ISD::SETUGT;
+ else if (ISD::isBuildVectorAllZeros(LHS.getNode()))
+ CC = ISD::SETULT;
+ }
EVT VecVT = LHS.getValueType();
bool Swap, Negate;
unsigned int VCmpInst =
diff --git a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll b/llvm/test/CodeGen/PowerPC/check-zero-vector.ll
index 59173e22edf26..27fe863f01438 100644
--- a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/check-zero-vector.ll
@@ -19,23 +19,21 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) {
; POWERPC_64LE: .LBB0_6: # %vector.body
; POWERPC_64LE-NEXT: #
; POWERPC_64LE-NEXT: lxv [[R1:[0-9]+]], -64(4)
-; POWERPC_64LE-NEXT: vcmpequh [[R2:[0-9]+]], [[R2]], [[R3:[0-9]+]]
-; POWERPC_64LE-NEXT: xxlnor [[R1]], [[R1]], [[R1]]
-; POWERPC_64LE-NEXT: vmrghh [[R4:[0-9]+]], [[R2]], [[R2]]
-; POWERPC_64LE-NEXT: vmrglh [[R2]], [[R2]], [[R2]]
-; POWERPC_64LE-NEXT: xxland [[R5:[0-9]+]], [[R5]], [[R6:[0-9]+]]
-; POWERPC_64LE-NEXT: xxland [[R1]], [[R1]], [[R6]]
-; POWERPC_64LE-NEXT: vadduwm [[R7:[0-9]+]], [[R7]], [[R4]]
+; POWERPC_64LE-NEXT: vcmpgtuh [[R2:[0-9]+]], [[R2]], [[R3:[0-9]+]]
+; POWERPC_64LE-NEXT: vmrglh [[R4:[0-9]+]], [[R2]], [[R2]]
+; POWERPC_64LE-NEXT: vmrghh [[R2]], [[R2]], [[R2]]
+; POWERPC_64LE-NEXT: xxland [[R1]], [[R1]], [[R5:[0-9]+]]
+; POWERPC_64LE-NEXT: xxland [[R6:[0-9]+]], [[R6]], [[R5]]
+; POWERPC_64LE-NEXT: vadduwm [[R7:[0-9]+]], [[R7]], [[R2]]
; POWERPC_64LE: .LBB0_10: # %vec.epilog.vector.body
; POWERPC_64LE-NEXT: #
; POWERPC_64LE-NEXT: lxv [[R8:[0-9]+]], 0(4)
; POWERPC_64LE-NEXT: addi 4, 4, 16
-; POWERPC_64LE-NEXT: vcmpequh [[R9:[0-9]+]], [[R9]], [[R10:[0-9]+]]
-; POWERPC_64LE-NEXT: xxlnor [[R8]], [[R8]], [[R8]]
+; POWERPC_64LE-NEXT: vcmpgtuh [[R9:[0-9]+]], [[R9]], [[R10:[0-9]+]]
; POWERPC_64LE-NEXT: vmrglh [[R11:[0-9]+]], [[R9]], [[R9]]
; POWERPC_64LE-NEXT: vmrghh [[R9]], [[R9]], [[R9]]
-; POWERPC_64LE-NEXT: xxland [[R12:[0-9]+]], [[R12]], [[R6]]
-; POWERPC_64LE-NEXT: xxland [[R8]], [[R8]], [[R6]]
+; POWERPC_64LE-NEXT: xxland [[R12:[0-9]+]], [[R12]], [[R5]]
+; POWERPC_64LE-NEXT: xxland [[R8]], [[R8]], [[R5]]
; POWERPC_64LE-NEXT: vadduwm [[R7]], [[R7]], [[R9]]
; POWERPC_64LE-NEXT: vadduwm [[R3]], [[R3]], [[R11]]
; POWERPC_64LE-NEXT: bdnz .LBB0_10
@@ -45,19 +43,17 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) {
; POWERPC_64: L..BB0_6: # %vector.body
; POWERPC_64-NEXT: #
; POWERPC_64-NEXT: lxv [[R1:[0-9]+]], -64(4)
-; POWERPC_64-NEXT: vcmpequh [[R2:[0-9]+]], [[R2]], [[R3:[0-9]+]]
-; POWERPC_64-NEXT: xxlnor [[R1]], [[R1]], [[R1]]
-; POWERPC_64-NEXT: vmrglh [[R4:[0-9]+]], [[R2]], [[R2]]
-; POWERPC_64-NEXT: vmrghh [[R2]], [[R2]], [[R2]]
+; POWERPC_64-NEXT: vcmpgtuh [[R2:[0-9]+]], [[R18:[0-9]+]], [[R3:[0-9]+]]
+; POWERPC_64-NEXT: vmrghh [[R4:[0-9]+]], [[R2]], [[R2]]
+; POWERPC_64-NEXT: vmrglh [[R2]], [[R2]], [[R2]]
; POWERPC_64-NEXT: xxland [[R5:[0-9]+]], [[R5]], [[R6:[0-9]+]]
; POWERPC_64-NEXT: xxland [[R1]], [[R1]], [[R6]]
-; POWERPC_64-NEXT: vadduwm [[R7:[0-9]+]], [[R7]], [[R4]]
+; POWERPC_64-NEXT: vadduwm [[R7:[0-9]+]], [[R7]], [[R2]]
; POWERPC_64: L..BB0_10: # %vec.epilog.vector.body
; POWERPC_64-NEXT: #
; POWERPC_64-NEXT: lxv [[R8:[0-9]+]], 0(4)
; POWERPC_64-NEXT: addi 4, 4, 16
-; POWERPC_64-NEXT: vcmpequh [[R9:[0-9]+]], [[R9]], [[R10:[0-9]+]]
-; POWERPC_64-NEXT: xxlnor [[R8]], [[R8]], [[R8]]
+; POWERPC_64-NEXT: vcmpgtuh [[R9:[0-9]+]], [[R19:[0-9]+]], [[R10:[0-9]+]]
; POWERPC_64-NEXT: vmrghh [[R11:[0-9]+]], [[R9]], [[R9]]
; POWERPC_64-NEXT: vmrglh [[R9]], [[R9]], [[R9]]
; POWERPC_64-NEXT: xxland [[R12:[0-9]+]], [[R12]], [[R6]]
@@ -70,28 +66,26 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) {
; POWERPC_32-LABEL: test_Greater_than:
; POWERPC_32: L..BB0_7: # %vector.body
; POWERPC_32-NEXT: #
-; POWERPC_32-NEXT: lxv [[R1:[0-9]+]], 0(10)
-; POWERPC_32-NEXT: addic [[R13:[0-9]+]], [[R13]], 64
-; POWERPC_32-NEXT: addze [[R14:[0-9]+]], [[R14]]
-; POWERPC_32-NEXT: xor [[R15:[0-9]+]], [[R13]], [[R16:[0-9]+]]
-; POWERPC_32-NEXT: or. [[R15]], [[R15]], [[R14]]
-; POWERPC_32-NEXT: vcmpequh [[R2:[0-9]+]], [[R2]], [[R3:[0-9]+]]
-; POWERPC_32-NEXT: xxlnor [[R1]], [[R1]], [[R1]]
+; POWERPC_32-NEXT: lxv [[R1:[0-9]+]], 0([[R13:[0-9]+]])
+; POWERPC_32-NEXT: addic [[R14:[0-9]+]], [[R14]], 64
+; POWERPC_32-NEXT: addze [[R7:[0-9]+]], [[R7]]
+; POWERPC_32-NEXT: xor [[R15:[0-9]+]], [[R14]], [[R16:[0-9]+]]
+; POWERPC_32-NEXT: or. [[R15]], [[R15]], [[R7]]
+; POWERPC_32-NEXT: vcmpgtuh [[R2:[0-9]+]], [[R2]], [[R3:[0-9]+]]
; POWERPC_32-NEXT: vmrglh [[R4:[0-9]+]], [[R2]], [[R2]]
; POWERPC_32-NEXT: vmrghh [[R2]], [[R2]], [[R2]]
; POWERPC_32-NEXT: xxland [[R5:[0-9]+]], [[R5]], [[R6:[0-9]+]]
; POWERPC_32-NEXT: xxland [[R1]], [[R1]], [[R6]]
-; POWERPC_32-NEXT: vadduwm [[R7:[0-9]+]], [[R7]], [[R4]]
+; POWERPC_32-NEXT: vadduwm [[R7]], [[R7]], [[R4]]
; POWERPC_32: L..BB0_11: # %vec.epilog.vector.body
; POWERPC_32-NEXT: #
-; POWERPC_32-NEXT: slwi [[R14]], [[R13]], 1
-; POWERPC_32-NEXT: addic [[R13]], [[R13]], 8
+; POWERPC_32-NEXT: slwi [[R7]], [[R14]], 1
+; POWERPC_32-NEXT: addic [[R14]], [[R14]], 8
; POWERPC_32-NEXT: addze [[R17:[0-9]+]], [[R17]]
-; POWERPC_32-NEXT: lxvx [[R8:[0-9]+]], [[R18:[0-9]+]], [[R14]]
-; POWERPC_32-NEXT: xor [[R14]], [[R13]], [[R16]]
-; POWERPC_32-NEXT: or. [[R14]], [[R14]], [[R17]]
-; POWERPC_32-NEXT: vcmpequh [[R9:[0-9]+]], [[R9]], [[R3]]
-; POWERPC_32-NEXT: xxlnor [[R8]], [[R8]], [[R8]]
+; POWERPC_32-NEXT: lxvx [[R8:[0-9]+]], [[R18:[0-9]+]], [[R7]]
+; POWERPC_32-NEXT: xor [[R7]], [[R14]], [[R16]]
+; POWERPC_32-NEXT: or. [[R7]], [[R7]], [[R17]]
+; POWERPC_32-NEXT: vcmpgtuh [[R9:[0-9]+]], [[R9]], [[R3]]
; POWERPC_32-NEXT: vmrghh [[R11:[0-9]+]], [[R9]], [[R9]]
; POWERPC_32-NEXT: vmrglh [[R9]], [[R9]], [[R9]]
; POWERPC_32-NEXT: xxland [[R12:[0-9]+]], [[R12]], [[R6]]
|
NFC PR related to this patch: [NFC][PowerPC] Add test case for lockdown of vector compare greater than support for Zero vector comparisons. |
84e4923
to
24a2fb1
Compare
NFC PR related to this patch: [NFC][PowerPC] Cleaning up test file and removing redundant front-end test |
@@ -1031,7 +1031,7 @@ define <4 x float> @hoo3_fmf(<4 x float> %a) #1 { | |||
; CHECK-P7-NEXT: vmaddfp 5, 5, 0, 3 | |||
; CHECK-P7-NEXT: vmaddfp 3, 5, 4, 3 | |||
; CHECK-P7-NEXT: vxor 4, 4, 4 | |||
; CHECK-P7-NEXT: vcmpeqfp 2, 2, 4 | |||
; CHECK-P7-NEXT: vcmpgefp 2, 4, 2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are these equivalent instructions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, Vector Compare Equal Floating-Point VC-form (vcmpeqfp)
is replaced with Vector Compare Greater Than or Equal Floating-PointVC-form (vcmpgefp)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This look a bit suspicious to me. This patch is supposed to replace not equal
compares (vcmpeqfp+vnot) against zero vectors with either a greater then
or less then
instruction. However, here it is replacing a vcmpeqfp
directly with vcmpgefp
. Maybe this is due to a missing check for a 1 use condition for SETNE?
a312c23
to
b07da25
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with nit.
b07da25
to
3bfec50
Compare
Gentle ping @amy-kwan @RolandF77 @AditiRM @lei137 |
c9800cd
to
2d6ecf5
Compare
…sons vector compare greater than support for Zero vector comparisons
// Optimise 'Not equal to zero-vector' comparisons using 'Greater than or | ||
// less than' operators. Example: Consider k to be any non-zero positive | ||
// value. | ||
// for k != 0, change SETNE to SETUGT (k > 0) | ||
// for 0 != k, change SETNE to SETULT (0 < k) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Optimise 'Not equal to zero-vector' comparisons using 'Greater than or | |
// less than' operators. Example: Consider k to be any non-zero positive | |
// value. | |
// for k != 0, change SETNE to SETUGT (k > 0) | |
// for 0 != k, change SETNE to SETULT (0 < k) | |
// Optimize 'Not equal to zero-vector' comparisons to 'Greater than or | |
// less than' operators. | |
// Example: Consider k to be any non-zero positive value | |
// * for k != 0, change SETNE to SETUGT (k > 0) | |
// * for 0 != k, change SETNE to SETULT (0 < k) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
@@ -1,26 +1,24 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | |||
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \ | |||
; RUN: < %s | FileCheck %s --check-prefix=POWERPC_64LE | |||
; RUN: < %s | FileCheck %s --check-prefix=POWERPC_64LE |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: continuation of a RUN line should always be indented. Same comment for all the run line changes below.
; RUN: < %s | FileCheck %s --check-prefix=POWERPC_64LE | |
; RUN: < %s | FileCheck %s --check-prefix=POWERPC_64LE |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will do it once the NFC patch is merged.
|
||
define i32 @test_Greater_than(ptr %colauths) { | ||
; This testcase is for the special case of zero-vector comparisons. | ||
; Currently the generated code does a comparison (vcmpequh) and then a negation (xxlnor). | ||
; This pattern is expected to be optimized in a future patch. | ||
; Optimised version using vcmpgtuh. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
; Optimised version using vcmpgtuh. | |
; Optimize zero-vector `vcmpequh` compares followed by negate to `vcmpgtuh`. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
@@ -1,23 +1,7 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | |||
; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ | |||
; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s | |||
define dso_local <16 x i8> @ConvertExtractedMaskBitsToVect(<16 x i8> noundef %0) local_unnamed_addr #0 { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove local_unnamed_addr #0
. Since I don't see any attributes defined here, it's likely not needed for this behaviour?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes that makes sense, removing local_unnamed_addr #0
.
@@ -11967,9 +11958,8 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) { | |||
; PWR7-NEXT: addis 3, 2, .LCPI100_0@toc@ha | |||
; PWR7-NEXT: addi 3, 3, .LCPI100_0@toc@l | |||
; PWR7-NEXT: xxland 34, 34, 0 | |||
; PWR7-NEXT: vcmpequw 2, 2, 3 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the relationships here would be easier to see if these tests generates the full register naming so it's obvious 2 & 34 are the same registers. Please do an NFC update for these tests to add the options -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr
to the run lines.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we please update the title of this PR to be more accurate?
I think this PR is not just replacing the vector compare equal with gt compare instruction.
Maybe some thing like this is better:
[PowerPC] Optimize not equal compares against zero vectors
This patch is for special cases involving 0 vectors. During the comparison of vector operands, current code generation checks with
vcmpequh (vector compare equal unsigned halfword)
followed by a negationxxlnor (VSX Vector Logical NOR XX3-form)
.This means that for the special case, instead of using
vcmpequh
and then negating the result, we can directly usevcmpgtuh (vector compare greater than unsigned halfword)
.As a result the negation is avoided since the only condition where this will be false is for 0 as it is an
unsigned halfword
.