Skip to content

Commit 7b67ae4

Browse files
Ard Biesheuvel authored and herbertx (Herbert Xu) committed
crypto: arm64/aes-ccm - yield NEON after every block of input
Avoid excessive scheduling delays under a preemptible kernel by yielding the NEON after every block of input.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
1 parent d82f37a commit 7b67ae4

File tree

1 file changed

+95
-55
lines changed

1 file changed

+95
-55
lines changed

arch/arm64/crypto/aes-ce-ccm-core.S

Lines changed: 95 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,33 @@
1919
* u32 *macp, u8 const rk[], u32 rounds);
2020
*/
2121
ENTRY(ce_aes_ccm_auth_data)
22-
ldr w8, [x3] /* leftover from prev round? */
22+
frame_push 7
23+
24+
mov x19, x0
25+
mov x20, x1
26+
mov x21, x2
27+
mov x22, x3
28+
mov x23, x4
29+
mov x24, x5
30+
31+
ldr w25, [x22] /* leftover from prev round? */
2332
ld1 {v0.16b}, [x0] /* load mac */
24-
cbz w8, 1f
25-
sub w8, w8, #16
33+
cbz w25, 1f
34+
sub w25, w25, #16
2635
eor v1.16b, v1.16b, v1.16b
27-
0: ldrb w7, [x1], #1 /* get 1 byte of input */
28-
subs w2, w2, #1
29-
add w8, w8, #1
36+
0: ldrb w7, [x20], #1 /* get 1 byte of input */
37+
subs w21, w21, #1
38+
add w25, w25, #1
3039
ins v1.b[0], w7
3140
ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */
3241
beq 8f /* out of input? */
33-
cbnz w8, 0b
42+
cbnz w25, 0b
3443
eor v0.16b, v0.16b, v1.16b
35-
1: ld1 {v3.4s}, [x4] /* load first round key */
36-
prfm pldl1strm, [x1]
37-
cmp w5, #12 /* which key size? */
38-
add x6, x4, #16
39-
sub w7, w5, #2 /* modified # of rounds */
44+
1: ld1 {v3.4s}, [x23] /* load first round key */
45+
prfm pldl1strm, [x20]
46+
cmp w24, #12 /* which key size? */
47+
add x6, x23, #16
48+
sub w7, w24, #2 /* modified # of rounds */
4049
bmi 2f
4150
bne 5f
4251
mov v5.16b, v3.16b
@@ -55,33 +64,43 @@ ENTRY(ce_aes_ccm_auth_data)
5564
ld1 {v5.4s}, [x6], #16 /* load next round key */
5665
bpl 3b
5766
aese v0.16b, v4.16b
58-
subs w2, w2, #16 /* last data? */
67+
subs w21, w21, #16 /* last data? */
5968
eor v0.16b, v0.16b, v5.16b /* final round */
6069
bmi 6f
61-
ld1 {v1.16b}, [x1], #16 /* load next input block */
70+
ld1 {v1.16b}, [x20], #16 /* load next input block */
6271
eor v0.16b, v0.16b, v1.16b /* xor with mac */
63-
bne 1b
64-
6: st1 {v0.16b}, [x0] /* store mac */
72+
beq 6f
73+
74+
if_will_cond_yield_neon
75+
st1 {v0.16b}, [x19] /* store mac */
76+
do_cond_yield_neon
77+
ld1 {v0.16b}, [x19] /* reload mac */
78+
endif_yield_neon
79+
80+
b 1b
81+
6: st1 {v0.16b}, [x19] /* store mac */
6582
beq 10f
66-
adds w2, w2, #16
83+
adds w21, w21, #16
6784
beq 10f
68-
mov w8, w2
69-
7: ldrb w7, [x1], #1
85+
mov w25, w21
86+
7: ldrb w7, [x20], #1
7087
umov w6, v0.b[0]
7188
eor w6, w6, w7
72-
strb w6, [x0], #1
73-
subs w2, w2, #1
89+
strb w6, [x19], #1
90+
subs w21, w21, #1
7491
beq 10f
7592
ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
7693
b 7b
77-
8: mov w7, w8
78-
add w8, w8, #16
94+
8: mov w7, w25
95+
add w25, w25, #16
7996
9: ext v1.16b, v1.16b, v1.16b, #1
8097
adds w7, w7, #1
8198
bne 9b
8299
eor v0.16b, v0.16b, v1.16b
83-
st1 {v0.16b}, [x0]
84-
10: str w8, [x3]
100+
st1 {v0.16b}, [x19]
101+
10: str w25, [x22]
102+
103+
frame_pop
85104
ret
86105
ENDPROC(ce_aes_ccm_auth_data)
87106

@@ -126,19 +145,29 @@ ENTRY(ce_aes_ccm_final)
126145
ENDPROC(ce_aes_ccm_final)
127146

128147
.macro aes_ccm_do_crypt,enc
129-
ldr x8, [x6, #8] /* load lower ctr */
130-
ld1 {v0.16b}, [x5] /* load mac */
131-
CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
148+
frame_push 8
149+
150+
mov x19, x0
151+
mov x20, x1
152+
mov x21, x2
153+
mov x22, x3
154+
mov x23, x4
155+
mov x24, x5
156+
mov x25, x6
157+
158+
ldr x26, [x25, #8] /* load lower ctr */
159+
ld1 {v0.16b}, [x24] /* load mac */
160+
CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */
132161
0: /* outer loop */
133-
ld1 {v1.8b}, [x6] /* load upper ctr */
134-
prfm pldl1strm, [x1]
135-
add x8, x8, #1
136-
rev x9, x8
137-
cmp w4, #12 /* which key size? */
138-
sub w7, w4, #2 /* get modified # of rounds */
162+
ld1 {v1.8b}, [x25] /* load upper ctr */
163+
prfm pldl1strm, [x20]
164+
add x26, x26, #1
165+
rev x9, x26
166+
cmp w23, #12 /* which key size? */
167+
sub w7, w23, #2 /* get modified # of rounds */
139168
ins v1.d[1], x9 /* no carry in lower ctr */
140-
ld1 {v3.4s}, [x3] /* load first round key */
141-
add x10, x3, #16
169+
ld1 {v3.4s}, [x22] /* load first round key */
170+
add x10, x22, #16
142171
bmi 1f
143172
bne 4f
144173
mov v5.16b, v3.16b
@@ -165,9 +194,9 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
165194
bpl 2b
166195
aese v0.16b, v4.16b
167196
aese v1.16b, v4.16b
168-
subs w2, w2, #16
169-
bmi 6f /* partial block? */
170-
ld1 {v2.16b}, [x1], #16 /* load next input block */
197+
subs w21, w21, #16
198+
bmi 7f /* partial block? */
199+
ld1 {v2.16b}, [x20], #16 /* load next input block */
171200
.if \enc == 1
172201
eor v2.16b, v2.16b, v5.16b /* final round enc+mac */
173202
eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */
@@ -176,18 +205,29 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
176205
eor v1.16b, v2.16b, v5.16b /* final round enc */
177206
.endif
178207
eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */
179-
st1 {v1.16b}, [x0], #16 /* write output block */
180-
bne 0b
181-
CPU_LE( rev x8, x8 )
182-
st1 {v0.16b}, [x5] /* store mac */
183-
str x8, [x6, #8] /* store lsb end of ctr (BE) */
184-
5: ret
185-
186-
6: eor v0.16b, v0.16b, v5.16b /* final round mac */
208+
st1 {v1.16b}, [x19], #16 /* write output block */
209+
beq 5f
210+
211+
if_will_cond_yield_neon
212+
st1 {v0.16b}, [x24] /* store mac */
213+
do_cond_yield_neon
214+
ld1 {v0.16b}, [x24] /* reload mac */
215+
endif_yield_neon
216+
217+
b 0b
218+
5:
219+
CPU_LE( rev x26, x26 )
220+
st1 {v0.16b}, [x24] /* store mac */
221+
str x26, [x25, #8] /* store lsb end of ctr (BE) */
222+
223+
6: frame_pop
224+
ret
225+
226+
7: eor v0.16b, v0.16b, v5.16b /* final round mac */
187227
eor v1.16b, v1.16b, v5.16b /* final round enc */
188-
st1 {v0.16b}, [x5] /* store mac */
189-
add w2, w2, #16 /* process partial tail block */
190-
7: ldrb w9, [x1], #1 /* get 1 byte of input */
228+
st1 {v0.16b}, [x24] /* store mac */
229+
add w21, w21, #16 /* process partial tail block */
230+
8: ldrb w9, [x20], #1 /* get 1 byte of input */
191231
umov w6, v1.b[0] /* get top crypted ctr byte */
192232
umov w7, v0.b[0] /* get top mac byte */
193233
.if \enc == 1
@@ -197,13 +237,13 @@ CPU_LE( rev x8, x8 )
197237
eor w9, w9, w6
198238
eor w7, w7, w9
199239
.endif
200-
strb w9, [x0], #1 /* store out byte */
201-
strb w7, [x5], #1 /* store mac byte */
202-
subs w2, w2, #1
203-
beq 5b
240+
strb w9, [x19], #1 /* store out byte */
241+
strb w7, [x24], #1 /* store mac byte */
242+
subs w21, w21, #1
243+
beq 6b
204244
ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */
205245
ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */
206-
b 7b
246+
b 8b
207247
.endm
208248

209249
/*

0 commit comments

Comments (0)