Skip to content

Commit 4a6ae60

Browse files
committed
Merging r339260:
------------------------------------------------------------------------ r339260 | syzaara | 2018-08-08 08:20:43 -0700 (Wed, 08 Aug 2018) | 13 lines [PowerPC] Improve codegen for vector loads using scalar_to_vector This patch aims to improve the codegen for vector loads involving the scalar_to_vector (load X) sequence. Initially, ld->mv instructions were used for scalar_to_vector (load X), so this patch allows scalar_to_vector (load X) to utilize: LXSD and LXSDX for i64 and f64 LXSIWAX for i32 (sign extension to i64) LXSIWZX for i32 and f32 Committing on behalf of Amy Kwan. Differential Revision: https://reviews.llvm.org/D48950 ------------------------------------------------------------------------ llvm-svn: 347957
1 parent d6ffc0c commit 4a6ae60

15 files changed

+1529
-242
lines changed

llvm/lib/Target/PowerPC/P9InstrResources.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,7 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
592592
XXPERM,
593593
XXPERMR,
594594
XXSLDWI,
595+
XXSLDWIs,
595596
XXSPLTIB,
596597
XXSPLTW,
597598
XXSPLTWs,

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8454,17 +8454,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
84548454
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
84558455
int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
84568456

8457-
// If the source for the shuffle is a scalar_to_vector that came from a
8458-
// 32-bit load, it will have used LXVWSX so we don't need to splat again.
8459-
if (Subtarget.hasP9Vector() &&
8460-
((isLittleEndian && SplatIdx == 3) ||
8461-
(!isLittleEndian && SplatIdx == 0))) {
8462-
SDValue Src = V1.getOperand(0);
8463-
if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8464-
Src.getOperand(0).getOpcode() == ISD::LOAD &&
8465-
Src.getOperand(0).hasOneUse())
8466-
return V1;
8467-
}
84688457
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
84698458
SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
84708459
DAG.getConstant(SplatIdx, dl, MVT::i32));

llvm/lib/Target/PowerPC/PPCInstrVSX.td

Lines changed: 82 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -877,6 +877,12 @@ let Uses = [RM] in {
877877
"xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm,
878878
[(set v4i32:$XT, (PPCvecshl v4i32:$XA, v4i32:$XB,
879879
imm32SExt16:$SHW))]>;
880+
881+
let isCodeGenOnly = 1 in
882+
def XXSLDWIs : XX3Form_2s<60, 2,
883+
(outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$SHW),
884+
"xxsldwi $XT, $XA, $XA, $SHW", IIC_VecPerm, []>;
885+
880886
def XXSPLTW : XX2Form_2<60, 164,
881887
(outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
882888
"xxspltw $XT, $XB, $UIM", IIC_VecPerm,
@@ -886,6 +892,7 @@ let Uses = [RM] in {
886892
def XXSPLTWs : XX2Form_2<60, 164,
887893
(outs vsrc:$XT), (ins vfrc:$XB, u2imm:$UIM),
888894
"xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
895+
889896
} // hasSideEffects
890897
} // UseVSXReg = 1
891898

@@ -1466,8 +1473,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
14661473
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
14671474
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
14681475
}
1469-
def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)),
1470-
(v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>;
14711476

14721477
// Instructions for converting float to i64 feeding a store.
14731478
let Predicates = [NoP9Vector] in {
@@ -3050,13 +3055,47 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
30503055
(STXVX $rS, xoaddr:$dst)>;
30513056
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
30523057
(STXVX $rS, xoaddr:$dst)>;
3053-
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
3054-
(v4i32 (LXVWSX xoaddr:$src))>;
3055-
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
3056-
(v4f32 (LXVWSX xoaddr:$src))>;
3057-
def : Pat<(v4f32 (scalar_to_vector
3058-
(f32 (fpround (f64 (extloadf32 xoaddr:$src)))))),
3059-
(v4f32 (LXVWSX xoaddr:$src))>;
3058+
3059+
let AddedComplexity = 400 in {
3060+
// LIWAX - This instruction is used for sign extending i32 -> i64.
3061+
// LIWZX - This instruction will be emitted for i32, f32, and when
3062+
// zero-extending i32 to i64 (zext i32 -> i64).
3063+
let Predicates = [IsLittleEndian] in {
3064+
3065+
def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
3066+
(v2i64 (XXPERMDIs
3067+
(COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC), 2))>;
3068+
3069+
def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
3070+
(v2i64 (XXPERMDIs
3071+
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
3072+
3073+
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
3074+
(v4i32 (XXPERMDIs
3075+
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
3076+
3077+
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
3078+
(v4f32 (XXPERMDIs
3079+
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
3080+
}
3081+
3082+
let Predicates = [IsBigEndian] in {
3083+
def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
3084+
(v2i64 (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC))>;
3085+
3086+
def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
3087+
(v2i64 (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC))>;
3088+
3089+
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
3090+
(v4i32 (XXSLDWIs
3091+
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
3092+
3093+
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
3094+
(v4f32 (XXSLDWIs
3095+
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
3096+
}
3097+
3098+
}
30603099

30613100
// Build vectors from i8 loads
30623101
def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)),
@@ -3218,6 +3257,39 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
32183257
def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))),
32193258
(f32 (DFLOADf32 ixaddr:$src))>;
32203259

3260+
3261+
let AddedComplexity = 400 in {
3262+
// The following pseudoinstructions are used to ensure the utilization
3263+
// of all 64 VSX registers.
3264+
let Predicates = [IsLittleEndian, HasP9Vector] in {
3265+
def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))),
3266+
(v2i64 (XXPERMDIs
3267+
(COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>;
3268+
def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))),
3269+
(v2i64 (XXPERMDIs
3270+
(COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>;
3271+
3272+
def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))),
3273+
(v2f64 (XXPERMDIs
3274+
(COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>;
3275+
def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))),
3276+
(v2f64 (XXPERMDIs
3277+
(COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>;
3278+
}
3279+
3280+
let Predicates = [IsBigEndian, HasP9Vector] in {
3281+
def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))),
3282+
(v2i64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>;
3283+
def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))),
3284+
(v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>;
3285+
3286+
def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))),
3287+
(v2f64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>;
3288+
def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))),
3289+
(v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>;
3290+
}
3291+
}
3292+
32213293
let Predicates = [IsBigEndian, HasP9Vector] in {
32223294

32233295
// (Un)Signed DWord vector extract -> QP
@@ -3932,3 +4004,4 @@ let AddedComplexity = 400 in {
39324004
(v4i32 (VEXTSH2W $A))>;
39334005
}
39344006
}
4007+

llvm/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,46 @@
11
; RUN: llc < %s -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \
2-
; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-P8
2+
; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \
3+
; RUN: | FileCheck %s --check-prefix=CHECK-P8
34
; RUN: llc < %s -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
4-
; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-P9
5+
; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \
6+
; RUN: | FileCheck %s --check-prefix=CHECK-P9
57

68
@a = external local_unnamed_addr global <4 x i32>, align 16
79
@pb = external local_unnamed_addr global float*, align 8
810

911
define void @testExpandPostRAPseudo(i32* nocapture readonly %ptr) {
10-
; CHECK-P8-LABEL: testExpandPostRAPseudo:
11-
; CHECK-P8: lxsiwax 34, 0, 3
12-
; CHECK-P8-NEXT: xxspltw 34, 34, 1
13-
; CHECK-P8-NEXT: stvx 2, 0, 4
14-
; CHECK-P8: #APP
15-
; CHECK-P8-NEXT: #Clobber Rigisters
16-
; CHECK-P8-NEXT: #NO_APP
17-
; CHECK-P8-NEXT: lis 4, 1024
18-
; CHECK-P8-NEXT: lfiwax 0, 0, 3
19-
; CHECK-P8: stfsx 0, 3, 4
20-
; CHECK-P8-NEXT: blr
21-
22-
; CHECK-P9-LABEL: testExpandPostRAPseudo:
23-
; CHECK-P9: lxvwsx 0, 0, 3
24-
; CHECK-P9: stxvx 0, 0, 4
25-
; CHECK-P9: #APP
26-
; CHECK-P9-NEXT: #Clobber Rigisters
27-
; CHECK-P9-NEXT: #NO_APP
28-
; CHECK-P9-NEXT: lis 4, 1024
29-
; CHECK-P9-NEXT: lfiwax 0, 0, 3
30-
; CHECK-P9: stfsx 0, 3, 4
31-
; CHECK-P9-NEXT: blr
32-
12+
; CHECK-P8-LABEL: testExpandPostRAPseudo:
13+
; CHECK-P8: # %bb.0: # %entry
14+
; CHECK-P8: lfiwzx f0, 0, r3
15+
; CHECK-P8: ld r4, .LC0@toc@l(r4)
16+
; CHECK-P8: xxpermdi vs0, f0, f0, 2
17+
; CHECK-P8: xxspltw v2, vs0, 3
18+
; CHECK-P8: stvx v2, 0, r4
19+
; CHECK-P8: lis r4, 1024
20+
; CHECK-P8: lfiwax f0, 0, r3
21+
; CHECK-P8: addis r3, r2, .LC1@toc@ha
22+
; CHECK-P8: ld r3, .LC1@toc@l(r3)
23+
; CHECK-P8: xscvsxdsp f0, f0
24+
; CHECK-P8: ld r3, 0(r3)
25+
; CHECK-P8: stfsx f0, r3, r4
26+
; CHECK-P8: blr
27+
;
28+
; CHECK-P9-LABEL: testExpandPostRAPseudo:
29+
; CHECK-P9: # %bb.0: # %entry
30+
; CHECK-P9: lfiwzx f0, 0, r3
31+
; CHECK-P9: addis r4, r2, .LC0@toc@ha
32+
; CHECK-P9: ld r4, .LC0@toc@l(r4)
33+
; CHECK-P9: xxpermdi vs0, f0, f0, 2
34+
; CHECK-P9: xxspltw vs0, vs0, 3
35+
; CHECK-P9: stxvx vs0, 0, r4
36+
; CHECK-P9: lis r4, 1024
37+
; CHECK-P9: lfiwax f0, 0, r3
38+
; CHECK-P9: addis r3, r2, .LC1@toc@ha
39+
; CHECK-P9: ld r3, .LC1@toc@l(r3)
40+
; CHECK-P9: xscvsxdsp f0, f0
41+
; CHECK-P9: ld r3, 0(r3)
42+
; CHECK-P9: stfsx f0, r3, r4
43+
; CHECK-P9: blr
3344
entry:
3445
%0 = load i32, i32* %ptr, align 4
3546
%splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0

llvm/test/CodeGen/PowerPC/build-vector-tests.ll

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,8 @@
109109
;vector int spltRegVali(int val) { //
110110
; return (vector int) val; //
111111
;} //
112-
;// P8: lxsiwax, xxspltw //
113-
;// P9: lxvwsx //
112+
;// P8: (LE): lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw //
113+
;// P9: (LE): lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw //
114114
;vector int spltMemVali(int *ptr) { //
115115
; return (vector int)*ptr; //
116116
;} //
@@ -284,8 +284,8 @@
284284
;vector unsigned int spltRegValui(unsigned int val) { //
285285
; return (vector unsigned int) val; //
286286
;} //
287-
;// P8: lxsiwax, xxspltw //
288-
;// P9: lxvwsx //
287+
;// P8: (LE): lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw //
288+
;// P9: (LE): lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw //
289289
;vector unsigned int spltMemValui(unsigned int *ptr) { //
290290
; return (vector unsigned int)*ptr; //
291291
;} //
@@ -1202,15 +1202,21 @@ entry:
12021202
; P9LE-LABEL: spltMemVali
12031203
; P8BE-LABEL: spltMemVali
12041204
; P8LE-LABEL: spltMemVali
1205-
; P9BE: lxvwsx v2, 0, r3
1205+
; P9BE: lfiwzx f0, 0, r3
1206+
; P9BE: xxsldwi vs0, f0, f0, 1
1207+
; P9BE: xxspltw v2, vs0, 0
12061208
; P9BE: blr
1207-
; P9LE: lxvwsx v2, 0, r3
1209+
; P9LE: lfiwzx f0, 0, r3
1210+
; P9LE: xxpermdi vs0, f0, f0, 2
1211+
; P9LE: xxspltw v2, vs0, 3
12081212
; P9LE: blr
1209-
; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3
1210-
; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1
1213+
; P8BE: lfiwzx f0, 0, r3
1214+
; P8BE: xxsldwi vs0, f0, f0, 1
1215+
; P8BE: xxspltw v2, vs0, 0
12111216
; P8BE: blr
1212-
; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3
1213-
; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1
1217+
; P8LE: lfiwzx f0, 0, r3
1218+
; P8LE: xxpermdi vs0, f0, f0, 2
1219+
; P8LE: xxspltw v2, vs0, 3
12141220
; P8LE: blr
12151221
}
12161222

@@ -2338,15 +2344,21 @@ entry:
23382344
; P9LE-LABEL: spltMemValui
23392345
; P8BE-LABEL: spltMemValui
23402346
; P8LE-LABEL: spltMemValui
2341-
; P9BE: lxvwsx v2, 0, r3
2347+
; P9BE: lfiwzx f0, 0, r3
2348+
; P9BE: xxsldwi vs0, f0, f0, 1
2349+
; P9BE: xxspltw v2, vs0, 0
23422350
; P9BE: blr
2343-
; P9LE: lxvwsx v2, 0, r3
2351+
; P9LE: lfiwzx f0, 0, r3
2352+
; P9LE: xxpermdi vs0, f0, f0, 2
2353+
; P9LE: xxspltw v2, vs0, 3
23442354
; P9LE: blr
2345-
; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3
2346-
; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1
2355+
; P8BE: lfiwzx f0, 0, r3
2356+
; P8BE: xxsldwi vs0, f0, f0, 1
2357+
; P8BE: xxspltw v2, vs0, 0
23472358
; P8BE: blr
2348-
; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3
2349-
; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1
2359+
; P8LE: lfiwzx f0, 0, r3
2360+
; P8LE: xxpermdi vs0, f0, f0, 2
2361+
; P8LE: xxspltw v2, vs0, 3
23502362
; P8LE: blr
23512363
}
23522364

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,27 @@
1-
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck \
1+
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s \
2+
; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck --check-prefix=CHECK-LE \
23
; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s
3-
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck \
4+
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s \
5+
; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck \
46
; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s
57

68
define <16 x i8> @test(i32* %s, i32* %t) {
9+
; CHECK-LE-LABEL: test:
10+
; CHECK-LE: # %bb.0: # %entry
11+
; CHECK-LE-NEXT: lfiwzx f0, 0, r3
12+
; CHECK-LE-NEXT: xxpermdi vs0, f0, f0, 2
13+
; CHECK-LE-NEXT: xxspltw v2, vs0, 3
14+
; CHECK-LE-NEXT: blr
15+
16+
; CHECK-LABEL: test:
17+
; CHECK: # %bb.0: # %entry
18+
; CHECK-NEXT: lfiwzx f0, 0, r3
19+
; CHECK-NEXT: xxsldwi vs0, f0, f0, 1
20+
; CHECK-NEXT: xxspltw v2, vs0, 0
21+
; CHECK-NEXT: blr
722
entry:
823
%0 = bitcast i32* %s to <4 x i8>*
924
%1 = load <4 x i8>, <4 x i8>* %0, align 4
1025
%2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1126
ret <16 x i8> %2
12-
; CHECK-LABEL: test
13-
; CHECK: lxsiwax 34, 0, 3
14-
; CHECK: xxspltw 34, 34, 1
1527
}

0 commit comments

Comments
 (0)