Skip to content

Commit 2e5d2f3

Browse files
Ard Biesheuvel authored and herbertx committed
crypto: arm64/aes-blk - improve XTS mask handling
The Crypto Extension instantiation of the aes-modes.S collection of skciphers uses only 15 NEON registers for the round key array, whereas the pure NEON flavor uses 16 NEON registers for the AES S-box. This means we have a spare register available that we can use to hold the XTS mask vector, removing the need to reload it at every iteration of the inner loop. Since the pure NEON version does not permit this optimization, tweak the macros so we can factor out this functionality. Also, replace the literal load with a short sequence to compose the mask vector. On Cortex-A53, this results in a ~4% speedup. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
1 parent dd597fb commit 2e5d2f3

File tree

3 files changed

+32
-19
lines changed

3 files changed

+32
-19
lines changed

arch/arm64/crypto/aes-ce.S

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717

1818
.arch armv8-a+crypto
1919

20+
xtsmask .req v16
21+
22+
.macro xts_reload_mask, tmp
23+
.endm
24+
2025
/* preload all round keys */
2126
.macro load_round_keys, rounds, rk
2227
cmp \rounds, #12

arch/arm64/crypto/aes-modes.S

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -340,17 +340,19 @@ AES_ENDPROC(aes_ctr_encrypt)
340340
* int blocks, u8 const rk2[], u8 iv[], int first)
341341
*/
342342

343-
.macro next_tweak, out, in, const, tmp
343+
.macro next_tweak, out, in, tmp
344344
sshr \tmp\().2d, \in\().2d, #63
345-
and \tmp\().16b, \tmp\().16b, \const\().16b
345+
and \tmp\().16b, \tmp\().16b, xtsmask.16b
346346
add \out\().2d, \in\().2d, \in\().2d
347347
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
348348
eor \out\().16b, \out\().16b, \tmp\().16b
349349
.endm
350350

351-
.Lxts_mul_x:
352-
CPU_LE( .quad 1, 0x87 )
353-
CPU_BE( .quad 0x87, 1 )
351+
.macro xts_load_mask, tmp
352+
movi xtsmask.2s, #0x1
353+
movi \tmp\().2s, #0x87
354+
uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
355+
.endm
354356

355357
AES_ENTRY(aes_xts_encrypt)
356358
stp x29, x30, [sp, #-16]!
@@ -362,24 +364,24 @@ AES_ENTRY(aes_xts_encrypt)
362364
enc_prepare w3, x5, x8
363365
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
364366
enc_switch_key w3, x2, x8
365-
ldr q7, .Lxts_mul_x
367+
xts_load_mask v8
366368
b .LxtsencNx
367369

368370
.Lxtsencnotfirst:
369371
enc_prepare w3, x2, x8
370372
.LxtsencloopNx:
371-
ldr q7, .Lxts_mul_x
372-
next_tweak v4, v4, v7, v8
373+
xts_reload_mask v8
374+
next_tweak v4, v4, v8
373375
.LxtsencNx:
374376
subs w4, w4, #4
375377
bmi .Lxtsenc1x
376378
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
377-
next_tweak v5, v4, v7, v8
379+
next_tweak v5, v4, v8
378380
eor v0.16b, v0.16b, v4.16b
379-
next_tweak v6, v5, v7, v8
381+
next_tweak v6, v5, v8
380382
eor v1.16b, v1.16b, v5.16b
381383
eor v2.16b, v2.16b, v6.16b
382-
next_tweak v7, v6, v7, v8
384+
next_tweak v7, v6, v8
383385
eor v3.16b, v3.16b, v7.16b
384386
bl aes_encrypt_block4x
385387
eor v3.16b, v3.16b, v7.16b
@@ -401,7 +403,7 @@ AES_ENTRY(aes_xts_encrypt)
401403
st1 {v0.16b}, [x0], #16
402404
subs w4, w4, #1
403405
beq .Lxtsencout
404-
next_tweak v4, v4, v7, v8
406+
next_tweak v4, v4, v8
405407
b .Lxtsencloop
406408
.Lxtsencout:
407409
st1 {v4.16b}, [x6]
@@ -420,24 +422,24 @@ AES_ENTRY(aes_xts_decrypt)
420422
enc_prepare w3, x5, x8
421423
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
422424
dec_prepare w3, x2, x8
423-
ldr q7, .Lxts_mul_x
425+
xts_load_mask v8
424426
b .LxtsdecNx
425427

426428
.Lxtsdecnotfirst:
427429
dec_prepare w3, x2, x8
428430
.LxtsdecloopNx:
429-
ldr q7, .Lxts_mul_x
430-
next_tweak v4, v4, v7, v8
431+
xts_reload_mask v8
432+
next_tweak v4, v4, v8
431433
.LxtsdecNx:
432434
subs w4, w4, #4
433435
bmi .Lxtsdec1x
434436
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
435-
next_tweak v5, v4, v7, v8
437+
next_tweak v5, v4, v8
436438
eor v0.16b, v0.16b, v4.16b
437-
next_tweak v6, v5, v7, v8
439+
next_tweak v6, v5, v8
438440
eor v1.16b, v1.16b, v5.16b
439441
eor v2.16b, v2.16b, v6.16b
440-
next_tweak v7, v6, v7, v8
442+
next_tweak v7, v6, v8
441443
eor v3.16b, v3.16b, v7.16b
442444
bl aes_decrypt_block4x
443445
eor v3.16b, v3.16b, v7.16b
@@ -459,7 +461,7 @@ AES_ENTRY(aes_xts_decrypt)
459461
st1 {v0.16b}, [x0], #16
460462
subs w4, w4, #1
461463
beq .Lxtsdecout
462-
next_tweak v4, v4, v7, v8
464+
next_tweak v4, v4, v8
463465
b .Lxtsdecloop
464466
.Lxtsdecout:
465467
st1 {v4.16b}, [x6]

arch/arm64/crypto/aes-neon.S

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
#define AES_ENTRY(func) ENTRY(neon_ ## func)
1515
#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
1616

17+
xtsmask .req v7
18+
19+
.macro xts_reload_mask, tmp
20+
xts_load_mask \tmp
21+
.endm
22+
1723
/* multiply by polynomial 'x' in GF(2^8) */
1824
.macro mul_by_x, out, in, temp, const
1925
sshr \temp, \in, #7

0 commit comments

Comments (0)