Commit f10dc56

Ard Biesheuvel authored and herbertx committed
crypto: arm64 - revert NEON yield for fast AEAD implementations
As it turns out, checking the TIF_NEED_RESCHED flag after each iteration results in a significant performance regression (~10%) when running fast algorithms (i.e., ones that use special instructions and operate in the < 4 cycles per byte range) on in-order cores with comparatively slow memory accesses such as the Cortex-A53.

Given the speed of these ciphers, and the fact that the page based nature of the AEAD scatterwalk API guarantees that the core NEON transform is never invoked with more than a single page's worth of input, we can estimate the worst case duration of any resulting scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages, processing a page's worth of input at 4 cycles per byte results in a delay of ~250 us, which is a reasonable upper bound.

So let's remove the yield checks from the fused AES-CCM and AES-GCM routines entirely.

This reverts commit 7b67ae4 and partially reverts commit 7c50136.

Fixes: 7c50136 ("crypto: arm64/aes-ghash - yield NEON after every ...")
Fixes: 7b67ae4 ("crypto: arm64/aes-ccm - yield NEON after every ...")
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
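The ~250 us figure follows directly from the assumptions stated in the message. A minimal sketch of the arithmetic in C, using the commit's assumed page size, throughput, and clock rate as constants (none of these are measured here):

#include <stdio.h>

int main(void)
{
	/* Assumptions taken from the commit message: 64k pages, ~4 cycles
	 * per byte for the fast ciphers, 1 GHz Cortex-A53 clock. */
	const double page_bytes      = 64 * 1024;
	const double cycles_per_byte = 4;
	const double cpu_hz          = 1e9;

	double cycles      = page_bytes * cycles_per_byte; /* 262144 cycles */
	double blackout_us = cycles / cpu_hz * 1e6;        /* ~262 us */

	printf("worst-case scheduling blackout: ~%.0f us\n", blackout_us);
	return 0;
}

262144 cycles at 1 GHz comes to ~262 us, consistent with the ~250 us upper bound quoted above.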
1 parent 46d8c4b commit f10dc56

File tree

arch/arm64/crypto/aes-ce-ccm-core.S
arch/arm64/crypto/ghash-ce-core.S

2 files changed: +80, -146 lines

arch/arm64/crypto/aes-ce-ccm-core.S
55 additions, 95 deletions
@@ -19,33 +19,24 @@
 	 * u32 *macp, u8 const rk[], u32 rounds);
 	 */
 ENTRY(ce_aes_ccm_auth_data)
-	frame_push	7
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-
-	ldr	w25, [x22]			/* leftover from prev round? */
+	ldr	w8, [x3]			/* leftover from prev round? */
 	ld1	{v0.16b}, [x0]			/* load mac */
-	cbz	w25, 1f
-	sub	w25, w25, #16
+	cbz	w8, 1f
+	sub	w8, w8, #16
 	eor	v1.16b, v1.16b, v1.16b
-0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
-	subs	w21, w21, #1
-	add	w25, w25, #1
+0:	ldrb	w7, [x1], #1			/* get 1 byte of input */
+	subs	w2, w2, #1
+	add	w8, w8, #1
 	ins	v1.b[0], w7
 	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
 	beq	8f				/* out of input? */
-	cbnz	w25, 0b
+	cbnz	w8, 0b
 	eor	v0.16b, v0.16b, v1.16b
-1:	ld1	{v3.4s}, [x23]			/* load first round key */
-	prfm	pldl1strm, [x20]
-	cmp	w24, #12			/* which key size? */
-	add	x6, x23, #16
-	sub	w7, w24, #2			/* modified # of rounds */
+1:	ld1	{v3.4s}, [x4]			/* load first round key */
+	prfm	pldl1strm, [x1]
+	cmp	w5, #12				/* which key size? */
+	add	x6, x4, #16
+	sub	w7, w5, #2			/* modified # of rounds */
 	bmi	2f
 	bne	5f
 	mov	v5.16b, v3.16b
@@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data)
 	ld1	{v5.4s}, [x6], #16		/* load next round key */
 	bpl	3b
 	aese	v0.16b, v4.16b
-	subs	w21, w21, #16			/* last data? */
+	subs	w2, w2, #16			/* last data? */
 	eor	v0.16b, v0.16b, v5.16b		/* final round */
 	bmi	6f
-	ld1	{v1.16b}, [x20], #16		/* load next input block */
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
 	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
-	beq	6f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x19]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x19]			/* reload mac */
-	endif_yield_neon
-
-	b	1b
-6:	st1	{v0.16b}, [x19]			/* store mac */
+	bne	1b
+6:	st1	{v0.16b}, [x0]			/* store mac */
 	beq	10f
-	adds	w21, w21, #16
+	adds	w2, w2, #16
 	beq	10f
-	mov	w25, w21
-7:	ldrb	w7, [x20], #1
+	mov	w8, w2
+7:	ldrb	w7, [x1], #1
 	umov	w6, v0.b[0]
 	eor	w6, w6, w7
-	strb	w6, [x19], #1
-	subs	w21, w21, #1
+	strb	w6, [x0], #1
+	subs	w2, w2, #1
 	beq	10f
 	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
 	b	7b
-8:	mov	w7, w25
-	add	w25, w25, #16
+8:	mov	w7, w8
+	add	w8, w8, #16
9:	ext	v1.16b, v1.16b, v1.16b, #1
 	adds	w7, w7, #1
 	bne	9b
 	eor	v0.16b, v0.16b, v1.16b
-	st1	{v0.16b}, [x19]
-10:	str	w25, [x22]
-
-	frame_pop
+	st1	{v0.16b}, [x0]
+10:	str	w8, [x3]
 	ret
 ENDPROC(ce_aes_ccm_auth_data)
 
@@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final)
 ENDPROC(ce_aes_ccm_final)
 
 	.macro	aes_ccm_do_crypt,enc
-	frame_push	8
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	mov	x25, x6
-
-	ldr	x26, [x25, #8]			/* load lower ctr */
-	ld1	{v0.16b}, [x24]			/* load mac */
-CPU_LE(	rev	x26, x26 )			/* keep swabbed ctr in reg */
+	ldr	x8, [x6, #8]			/* load lower ctr */
+	ld1	{v0.16b}, [x5]			/* load mac */
+CPU_LE(	rev	x8, x8 )			/* keep swabbed ctr in reg */
 0:	/* outer loop */
-	ld1	{v1.8b}, [x25]			/* load upper ctr */
-	prfm	pldl1strm, [x20]
-	add	x26, x26, #1
-	rev	x9, x26
-	cmp	w23, #12			/* which key size? */
-	sub	w7, w23, #2			/* get modified # of rounds */
+	ld1	{v1.8b}, [x6]			/* load upper ctr */
+	prfm	pldl1strm, [x1]
+	add	x8, x8, #1
+	rev	x9, x8
+	cmp	w4, #12				/* which key size? */
+	sub	w7, w4, #2			/* get modified # of rounds */
 	ins	v1.d[1], x9			/* no carry in lower ctr */
-	ld1	{v3.4s}, [x22]			/* load first round key */
-	add	x10, x22, #16
+	ld1	{v3.4s}, [x3]			/* load first round key */
+	add	x10, x3, #16
 	bmi	1f
 	bne	4f
 	mov	v5.16b, v3.16b
@@ -194,9 +165,9 @@ CPU_LE(	rev	x26, x26 )		/* keep swabbed ctr in reg */
 	bpl	2b
 	aese	v0.16b, v4.16b
 	aese	v1.16b, v4.16b
-	subs	w21, w21, #16
-	bmi	7f				/* partial block? */
-	ld1	{v2.16b}, [x20], #16		/* load next input block */
+	subs	w2, w2, #16
+	bmi	6f				/* partial block? */
+	ld1	{v2.16b}, [x1], #16		/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
 	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
@@ -205,29 +176,18 @@ CPU_LE(	rev	x26, x26 )		/* keep swabbed ctr in reg */
 	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
 	.endif
 	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
-	st1	{v1.16b}, [x19], #16		/* write output block */
-	beq	5f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x24]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x24]			/* reload mac */
-	endif_yield_neon
-
-	b	0b
-5:
-CPU_LE(	rev	x26, x26 )
-	st1	{v0.16b}, [x24]			/* store mac */
-	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */
-
-6:	frame_pop
-	ret
-
-7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+	st1	{v1.16b}, [x0], #16		/* write output block */
+	bne	0b
+CPU_LE(	rev	x8, x8 )
+	st1	{v0.16b}, [x5]			/* store mac */
+	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
+5:	ret
+
+6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-	st1	{v0.16b}, [x24]			/* store mac */
-	add	w21, w21, #16			/* process partial tail block */
-8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
+	st1	{v0.16b}, [x5]			/* store mac */
+	add	w2, w2, #16			/* process partial tail block */
+7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
 	umov	w6, v1.b[0]			/* get top crypted ctr byte */
 	umov	w7, v0.b[0]			/* get top mac byte */
 	.if	\enc == 1
@@ -237,13 +197,13 @@ CPU_LE(	rev	x26, x26 )
 	eor	w9, w9, w6
 	eor	w7, w7, w9
 	.endif
-	strb	w9, [x19], #1			/* store out byte */
-	strb	w7, [x24], #1			/* store mac byte */
-	subs	w21, w21, #1
-	beq	6b
+	strb	w9, [x0], #1			/* store out byte */
+	strb	w7, [x5], #1			/* store mac byte */
+	subs	w2, w2, #1
+	beq	5b
 	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
 	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
-	b	8b
+	b	7b
 	.endm
 
 /*

arch/arm64/crypto/ghash-ce-core.S
25 additions, 51 deletions
@@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8)
 	.endm
 
 	.macro	pmull_gcm_do_crypt, enc
-	frame_push	10
+	ld1	{SHASH.2d}, [x4]
+	ld1	{XL.2d}, [x1]
+	ldr	x8, [x5, #8]			// load lower counter
 
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	mov	x25, x6
-	mov	x26, x7
-	.if	\enc == 1
-	ldr	x27, [sp, #96]			// first stacked arg
-	.endif
-
-	ldr	x28, [x24, #8]			// load lower counter
-CPU_LE(	rev	x28, x28 )
-
-0:	mov	x0, x25
-	load_round_keys	w26, x0
-	ld1	{SHASH.2d}, [x23]
-	ld1	{XL.2d}, [x20]
+	load_round_keys	w7, x6
 
 	movi	MASK.16b, #0xe1
 	ext	SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE(	rev	x8, x8 )
 	shl	MASK.2d, MASK.2d, #57
 	eor	SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	.if	\enc == 1
-	ld1	{KS.16b}, [x27]
+	ldr	x10, [sp]
+	ld1	{KS.16b}, [x10]
 	.endif
 
-1:	ld1	{CTR.8b}, [x24]			// load upper counter
-	ld1	{INP.16b}, [x22], #16
-	rev	x9, x28
-	add	x28, x28, #1
-	sub	w19, w19, #1
+0:	ld1	{CTR.8b}, [x5]			// load upper counter
+	ld1	{INP.16b}, [x3], #16
+	rev	x9, x8
+	add	x8, x8, #1
+	sub	w0, w0, #1
 	ins	CTR.d[1], x9			// set lower counter
 
 	.if	\enc == 1
 	eor	INP.16b, INP.16b, KS.16b	// encrypt input
-	st1	{INP.16b}, [x21], #16
+	st1	{INP.16b}, [x2], #16
 	.endif
 
 	rev64	T1.16b, INP.16b
 
-	cmp	w26, #12
-	b.ge	4f				// AES-192/256?
+	cmp	w7, #12
+	b.ge	2f				// AES-192/256?
 
-2:	enc_round	CTR, v21
+1:	enc_round	CTR, v21
 
 	ext	T2.16b, XL.16b, XL.16b, #8
 	ext	IN1.16b, T1.16b, T1.16b, #8
@@ -425,39 +411,27 @@ CPU_LE(	rev	x28, x28 )
 
 	.if	\enc == 0
 	eor	INP.16b, INP.16b, KS.16b
-	st1	{INP.16b}, [x21], #16
+	st1	{INP.16b}, [x2], #16
 	.endif
 
-	cbz	w19, 3f
+	cbnz	w0, 0b
 
-	if_will_cond_yield_neon
-	st1	{XL.2d}, [x20]
-	.if	\enc == 1
-	st1	{KS.16b}, [x27]
-	.endif
-	do_cond_yield_neon
-	b	0b
-	endif_yield_neon
+CPU_LE(	rev	x8, x8 )
+	st1	{XL.2d}, [x1]
+	str	x8, [x5, #8]			// store lower counter
 
-	b	1b
-
-3:	st1	{XL.2d}, [x20]
 	.if	\enc == 1
-	st1	{KS.16b}, [x27]
+	st1	{KS.16b}, [x10]
 	.endif
 
-CPU_LE(	rev	x28, x28 )
-	str	x28, [x24, #8]			// store lower counter
-
-	frame_pop
 	ret
 
-4:	b.eq	5f				// AES-192?
+2:	b.eq	3f				// AES-192?
 	enc_round	CTR, v17
 	enc_round	CTR, v18
-5:	enc_round	CTR, v19
+3:	enc_round	CTR, v19
 	enc_round	CTR, v20
-	b	2b
+	b	1b
 	.endm
 
 /*
