Commit f2ca1cb

Ard Biesheuvel authored and herbertx (Herbert Xu) committed
crypto: arm64/chacha - optimize for arbitrary length inputs
Update the 4-way NEON ChaCha routine so it can handle input of any length >64 bytes in its entirety, rather than having to call into the 1-way routine and/or memcpy()s via temp buffers to handle the tail of a ChaCha invocation that is not a multiple of 256 bytes.

On inputs that are a multiple of 256 bytes (and thus in tcrypt benchmarks), performance drops by around 1% on Cortex-A57, while performance for inputs drawn randomly from the range [64, 1024) increases by around 30%.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
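As an illustration of the new calling convention (a sketch inferred from the diff below; the 300-byte request and the explicit call sequence are illustrative, not taken from the patch):

	/* A 300-byte request now takes two calls and no temp buffer. */
	chacha_4block_xor_neon(state, dst, src, nrounds, 256);
	state[12] += 4;		/* counter advances by 4 blocks per call */
	chacha_4block_xor_neon(state, dst + 256, src + 256, nrounds, 44);
	/* The second call still computes four blocks of keystream, but it
	 * loads, XORs and stores only the remaining 44 bytes. */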
1 parent ee5bbc9 commit f2ca1cb

2 files changed: 184 additions and 37 deletions

arch/arm64/crypto/chacha-neon-core.S

Lines changed: 170 additions & 13 deletions
@@ -19,6 +19,8 @@
  */

 #include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>

 	.text
 	.align		6
@@ -36,7 +38,7 @@
  */
 chacha_permute:

-	adr		x10, ROT8
+	adr_l		x10, ROT8
 	ld1		{v12.4s}, [x10]

 .Ldoubleround:
@@ -169,6 +171,12 @@ ENTRY(chacha_4block_xor_neon)
 	// x1: 4 data blocks output, o
 	// x2: 4 data blocks input, i
 	// w3: nrounds
+	// x4: byte count
+
+	adr_l		x10, .Lpermute
+	and		x5, x4, #63
+	add		x10, x10, x5
+	add		x11, x10, #64

 	//
 	// This function encrypts four consecutive ChaCha blocks by loading
@@ -178,15 +186,15 @@ ENTRY(chacha_4block_xor_neon)
 	// matrix by interleaving 32- and then 64-bit words, which allows us to
 	// do XOR in NEON registers.
 	//
-	adr		x9, CTRINC		// ... and ROT8
+	adr_l		x9, CTRINC		// ... and ROT8
 	ld1		{v30.4s-v31.4s}, [x9]

 	// x0..15[0-3] = s0..3[0..3]
-	mov		x4, x0
-	ld4r		{ v0.4s- v3.4s}, [x4], #16
-	ld4r		{ v4.4s- v7.4s}, [x4], #16
-	ld4r		{ v8.4s-v11.4s}, [x4], #16
-	ld4r		{v12.4s-v15.4s}, [x4]
+	add		x8, x0, #16
+	ld4r		{ v0.4s- v3.4s}, [x0]
+	ld4r		{ v4.4s- v7.4s}, [x8], #16
+	ld4r		{ v8.4s-v11.4s}, [x8], #16
+	ld4r		{v12.4s-v15.4s}, [x8]

 	// x12 += counter values 0-3
 	add		v12.4s, v12.4s, v30.4s
@@ -430,24 +438,47 @@ ENTRY(chacha_4block_xor_neon)
 	zip1		v30.4s, v14.4s, v15.4s
 	zip2		v31.4s, v14.4s, v15.4s

+	mov		x3, #64
+	subs		x5, x4, #64
+	add		x6, x5, x2
+	csel		x3, x3, xzr, ge
+	csel		x2, x2, x6, ge
+
 	// interleave 64-bit words in state n, n+2
 	zip1		v0.2d, v16.2d, v18.2d
 	zip2		v4.2d, v16.2d, v18.2d
 	zip1		v8.2d, v17.2d, v19.2d
 	zip2		v12.2d, v17.2d, v19.2d
-	ld1		{v16.16b-v19.16b}, [x2], #64
+	ld1		{v16.16b-v19.16b}, [x2], x3
+
+	subs		x6, x4, #128
+	ccmp		x3, xzr, #4, lt
+	add		x7, x6, x2
+	csel		x3, x3, xzr, eq
+	csel		x2, x2, x7, eq

 	zip1		v1.2d, v20.2d, v22.2d
 	zip2		v5.2d, v20.2d, v22.2d
 	zip1		v9.2d, v21.2d, v23.2d
 	zip2		v13.2d, v21.2d, v23.2d
-	ld1		{v20.16b-v23.16b}, [x2], #64
+	ld1		{v20.16b-v23.16b}, [x2], x3
+
+	subs		x7, x4, #192
+	ccmp		x3, xzr, #4, lt
+	add		x8, x7, x2
+	csel		x3, x3, xzr, eq
+	csel		x2, x2, x8, eq

 	zip1		v2.2d, v24.2d, v26.2d
 	zip2		v6.2d, v24.2d, v26.2d
 	zip1		v10.2d, v25.2d, v27.2d
 	zip2		v14.2d, v25.2d, v27.2d
-	ld1		{v24.16b-v27.16b}, [x2], #64
+	ld1		{v24.16b-v27.16b}, [x2], x3
+
+	subs		x8, x4, #256
+	ccmp		x3, xzr, #4, lt
+	add		x9, x8, x2
+	csel		x2, x2, x9, eq

 	zip1		v3.2d, v28.2d, v30.2d
 	zip2		v7.2d, v28.2d, v30.2d
@@ -456,29 +487,155 @@ ENTRY(chacha_4block_xor_neon)
 	ld1		{v28.16b-v31.16b}, [x2]

 	// xor with corresponding input, write to output
+	tbnz		x5, #63, 0f
 	eor		v16.16b, v16.16b, v0.16b
 	eor		v17.16b, v17.16b, v1.16b
 	eor		v18.16b, v18.16b, v2.16b
 	eor		v19.16b, v19.16b, v3.16b
+	st1		{v16.16b-v19.16b}, [x1], #64
+
+	tbnz		x6, #63, 1f
 	eor		v20.16b, v20.16b, v4.16b
 	eor		v21.16b, v21.16b, v5.16b
-	st1		{v16.16b-v19.16b}, [x1], #64
 	eor		v22.16b, v22.16b, v6.16b
 	eor		v23.16b, v23.16b, v7.16b
+	st1		{v20.16b-v23.16b}, [x1], #64
+
+	tbnz		x7, #63, 2f
 	eor		v24.16b, v24.16b, v8.16b
 	eor		v25.16b, v25.16b, v9.16b
-	st1		{v20.16b-v23.16b}, [x1], #64
 	eor		v26.16b, v26.16b, v10.16b
 	eor		v27.16b, v27.16b, v11.16b
-	eor		v28.16b, v28.16b, v12.16b
 	st1		{v24.16b-v27.16b}, [x1], #64
+
+	tbnz		x8, #63, 3f
+	eor		v28.16b, v28.16b, v12.16b
 	eor		v29.16b, v29.16b, v13.16b
 	eor		v30.16b, v30.16b, v14.16b
 	eor		v31.16b, v31.16b, v15.16b
 	st1		{v28.16b-v31.16b}, [x1]

 	ret
+
+	// fewer than 64 bytes of in/output
+0:	ld1		{v8.16b}, [x10]
+	ld1		{v9.16b}, [x11]
+	movi		v10.16b, #16
+	sub		x2, x1, #64
+	add		x1, x1, x5
+	ld1		{v16.16b-v19.16b}, [x2]
+	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
+
+	eor		v20.16b, v20.16b, v4.16b
+	eor		v21.16b, v21.16b, v5.16b
+	eor		v22.16b, v22.16b, v6.16b
+	eor		v23.16b, v23.16b, v7.16b
+	st1		{v20.16b-v23.16b}, [x1]
+	ret
+
+	// fewer than 128 bytes of in/output
+1:	ld1		{v8.16b}, [x10]
+	ld1		{v9.16b}, [x11]
+	movi		v10.16b, #16
+	add		x1, x1, x6
+	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
+
+	eor		v20.16b, v20.16b, v0.16b
+	eor		v21.16b, v21.16b, v1.16b
+	eor		v22.16b, v22.16b, v2.16b
+	eor		v23.16b, v23.16b, v3.16b
+	st1		{v20.16b-v23.16b}, [x1]
+	ret
+
+	// fewer than 192 bytes of in/output
+2:	ld1		{v4.16b}, [x10]
+	ld1		{v5.16b}, [x11]
+	movi		v6.16b, #16
+	add		x1, x1, x7
+	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b
+
+	eor		v24.16b, v24.16b, v0.16b
+	eor		v25.16b, v25.16b, v1.16b
+	eor		v26.16b, v26.16b, v2.16b
+	eor		v27.16b, v27.16b, v3.16b
+	st1		{v24.16b-v27.16b}, [x1]
+	ret
+
+	// fewer than 256 bytes of in/output
+3:	ld1		{v4.16b}, [x10]
+	ld1		{v5.16b}, [x11]
+	movi		v6.16b, #16
+	add		x1, x1, x8
+	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x1]
+	ret
 ENDPROC(chacha_4block_xor_neon)

+	.section	".rodata", "a", %progbits
+	.align		L1_CACHE_SHIFT
+.Lpermute:
+	.set		.Li, 0
+	.rept		192
+	.byte		(.Li - 64)
+	.set		.Li, .Li + 1
+	.endr
+
 CTRINC:	.word		0, 1, 2, 3
 ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
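The .Lpermute table added above is what drives the tbl/tbx tail handling. A small C model of its contents (for illustration only, not kernel code): the 192 bytes hold (i - 64) truncated to a byte, i.e. 0xc0..0xff followed by 0x00..0x7f. The prologue points x10 at offset (byte count & 63) and x11 at that offset plus 64, and 16-byte windows read from there serve as permute indices; on AArch64, tbl yields 0 for any out-of-range index while tbx leaves the destination byte untouched, which is what lets a partial final block be shifted into place without a bounce buffer.

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint8_t permute[192];
		int i, tail = 44;	/* example tail length, chosen for illustration */

		/* mirrors: .set .Li, 0 / .rept 192 / .byte (.Li - 64) / .endr */
		for (i = 0; i < 192; i++)
			permute[i] = (uint8_t)(i - 64);

		/* first tbl index: out of range (>= 64) until the tail begins */
		printf("tbl window starts at 0x%02x\n", permute[tail & 63]);
		/* first tbx index: in range, selects previously written bytes */
		printf("tbx window starts at 0x%02x\n", permute[(tail & 63) + 64]);
		return 0;
	}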

arch/arm64/crypto/chacha-neon-glue.c

Lines changed: 14 additions & 24 deletions
@@ -32,41 +32,29 @@
 asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
				      int nrounds);
 asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
-				       int nrounds);
+				       int nrounds, int bytes);
 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);

 static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
-			  unsigned int bytes, int nrounds)
+			  int bytes, int nrounds)
 {
 	u8 buf[CHACHA_BLOCK_SIZE];

-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		kernel_neon_begin();
-		chacha_4block_xor_neon(state, dst, src, nrounds);
-		kernel_neon_end();
+	if (bytes < CHACHA_BLOCK_SIZE) {
+		memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, buf, buf, nrounds);
+		memcpy(dst, buf, bytes);
+		return;
+	}
+
+	while (bytes > 0) {
+		chacha_4block_xor_neon(state, dst, src, nrounds,
+				       min(bytes, CHACHA_BLOCK_SIZE * 4));
 		bytes -= CHACHA_BLOCK_SIZE * 4;
 		src += CHACHA_BLOCK_SIZE * 4;
 		dst += CHACHA_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
-
-	if (!bytes)
-		return;
-
-	kernel_neon_begin();
-	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha_block_xor_neon(state, dst, src, nrounds);
-		bytes -= CHACHA_BLOCK_SIZE;
-		src += CHACHA_BLOCK_SIZE;
-		dst += CHACHA_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha_block_xor_neon(state, buf, buf, nrounds);
-		memcpy(dst, buf, bytes);
-	}
-	kernel_neon_end();
 }

 static int chacha_neon_stream_xor(struct skcipher_request *req,
@@ -86,8 +74,10 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
 		if (nbytes < walk.total)
 			nbytes = round_down(nbytes, walk.stride);

+		kernel_neon_begin();
 		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
			      nbytes, ctx->nrounds);
+		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
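For reference, chacha_doneon() as it reads with this patch applied (reconstructed from the hunks above; the comments are added annotations, not part of the patch):

	static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
				  int bytes, int nrounds)
	{
		u8 buf[CHACHA_BLOCK_SIZE];

		/* requests smaller than one block still use the 1-way
		 * routine and a bounce buffer */
		if (bytes < CHACHA_BLOCK_SIZE) {
			memcpy(buf, src, bytes);
			chacha_block_xor_neon(state, buf, buf, nrounds);
			memcpy(dst, buf, bytes);
			return;
		}

		/* everything else goes through the 4-way routine, up to 256
		 * bytes per call; the final call may pass any residual count */
		while (bytes > 0) {
			chacha_4block_xor_neon(state, dst, src, nrounds,
					       min(bytes, CHACHA_BLOCK_SIZE * 4));
			bytes -= CHACHA_BLOCK_SIZE * 4;
			src += CHACHA_BLOCK_SIZE * 4;
			dst += CHACHA_BLOCK_SIZE * 4;
			state[12] += 4;
		}
	}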
