Skip to content

Commit b913a64

Browse files
Ard Biesheuvel authored and ctmarinas committed
arm64/crypto: improve performance of GHASH algorithm
This patch modifies the GHASH secure hash implementation to switch to a faster, polynomial multiplication based reduction instead of one that uses shifts and rotates. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
1 parent 6aa8b20 commit b913a64

File tree

2 files changed

+40
-56
lines changed

2 files changed

+40
-56
lines changed

arch/arm64/crypto/ghash-ce-core.S

Lines changed: 38 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,6 @@
33
*
44
* Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
55
*
6-
* Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
7-
*
8-
* Copyright (c) 2009 Intel Corp.
9-
* Author: Huang Ying <ying.huang@intel.com>
10-
* Vinodh Gopal
11-
* Erdinc Ozturk
12-
* Deniz Karakoyunlu
13-
*
146
* This program is free software; you can redistribute it and/or modify it
157
* under the terms of the GNU General Public License version 2 as published
168
* by the Free Software Foundation.
@@ -19,13 +11,15 @@
1911
#include <linux/linkage.h>
2012
#include <asm/assembler.h>
2113

22-
DATA .req v0
23-
SHASH .req v1
24-
IN1 .req v2
14+
SHASH .req v0
15+
SHASH2 .req v1
2516
T1 .req v2
2617
T2 .req v3
27-
T3 .req v4
28-
VZR .req v5
18+
MASK .req v4
19+
XL .req v5
20+
XM .req v6
21+
XH .req v7
22+
IN1 .req v7
2923

3024
.text
3125
.arch armv8-a+crypto
@@ -35,61 +29,51 @@
3529
* struct ghash_key const *k, const char *head)
3630
*/
3731
ENTRY(pmull_ghash_update)
38-
ld1 {DATA.16b}, [x1]
3932
ld1 {SHASH.16b}, [x3]
40-
eor VZR.16b, VZR.16b, VZR.16b
33+
ld1 {XL.16b}, [x1]
34+
movi MASK.16b, #0xe1
35+
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
36+
shl MASK.2d, MASK.2d, #57
37+
eor SHASH2.16b, SHASH2.16b, SHASH.16b
4138

4239
/* do the head block first, if supplied */
4340
cbz x4, 0f
44-
ld1 {IN1.2d}, [x4]
41+
ld1 {T1.2d}, [x4]
4542
b 1f
4643

47-
0: ld1 {IN1.2d}, [x2], #16
44+
0: ld1 {T1.2d}, [x2], #16
4845
sub w0, w0, #1
49-
1: ext IN1.16b, IN1.16b, IN1.16b, #8
50-
CPU_LE( rev64 IN1.16b, IN1.16b )
51-
eor DATA.16b, DATA.16b, IN1.16b
5246

53-
/* multiply DATA by SHASH in GF(2^128) */
54-
ext T2.16b, DATA.16b, DATA.16b, #8
55-
ext T3.16b, SHASH.16b, SHASH.16b, #8
56-
eor T2.16b, T2.16b, DATA.16b
57-
eor T3.16b, T3.16b, SHASH.16b
47+
1: /* multiply XL by SHASH in GF(2^128) */
48+
CPU_LE( rev64 T1.16b, T1.16b )
5849

59-
pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1
60-
pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0
61-
pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0)
62-
eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0)
63-
eor T2.16b, T2.16b, DATA.16b
50+
ext T2.16b, XL.16b, XL.16b, #8
51+
ext IN1.16b, T1.16b, T1.16b, #8
52+
eor T1.16b, T1.16b, T2.16b
53+
eor XL.16b, XL.16b, IN1.16b
6454

65-
ext T3.16b, VZR.16b, T2.16b, #8
66-
ext T2.16b, T2.16b, VZR.16b, #8
67-
eor DATA.16b, DATA.16b, T3.16b
68-
eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of
69-
// carry-less multiplication
55+
pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
56+
eor T1.16b, T1.16b, XL.16b
57+
pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
58+
pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
7059

71-
/* first phase of the reduction */
72-
shl T3.2d, DATA.2d, #1
73-
eor T3.16b, T3.16b, DATA.16b
74-
shl T3.2d, T3.2d, #5
75-
eor T3.16b, T3.16b, DATA.16b
76-
shl T3.2d, T3.2d, #57
77-
ext T2.16b, VZR.16b, T3.16b, #8
78-
ext T3.16b, T3.16b, VZR.16b, #8
79-
eor DATA.16b, DATA.16b, T2.16b
80-
eor T1.16b, T1.16b, T3.16b
60+
ext T1.16b, XL.16b, XH.16b, #8
61+
eor T2.16b, XL.16b, XH.16b
62+
eor XM.16b, XM.16b, T1.16b
63+
eor XM.16b, XM.16b, T2.16b
64+
pmull T2.1q, XL.1d, MASK.1d
8165

82-
/* second phase of the reduction */
83-
ushr T2.2d, DATA.2d, #5
84-
eor T2.16b, T2.16b, DATA.16b
85-
ushr T2.2d, T2.2d, #1
86-
eor T2.16b, T2.16b, DATA.16b
87-
ushr T2.2d, T2.2d, #1
88-
eor T1.16b, T1.16b, T2.16b
89-
eor DATA.16b, DATA.16b, T1.16b
66+
mov XH.d[0], XM.d[1]
67+
mov XM.d[1], XL.d[0]
68+
69+
eor XL.16b, XM.16b, T2.16b
70+
ext T2.16b, XL.16b, XL.16b, #8
71+
pmull XL.1q, XL.1d, MASK.1d
72+
eor T2.16b, T2.16b, XH.16b
73+
eor XL.16b, XL.16b, T2.16b
9074

9175
cbnz w0, 0b
9276

93-
st1 {DATA.16b}, [x1]
77+
st1 {XL.16b}, [x1]
9478
ret
9579
ENDPROC(pmull_ghash_update)

arch/arm64/crypto/ghash-ce-glue.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
6767
blocks = len / GHASH_BLOCK_SIZE;
6868
len %= GHASH_BLOCK_SIZE;
6969

70-
kernel_neon_begin_partial(6);
70+
kernel_neon_begin_partial(8);
7171
pmull_ghash_update(blocks, ctx->digest, src, key,
7272
partial ? ctx->buf : NULL);
7373
kernel_neon_end();
@@ -89,7 +89,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
8989

9090
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
9191

92-
kernel_neon_begin_partial(6);
92+
kernel_neon_begin_partial(8);
9393
pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
9494
kernel_neon_end();
9595
}

0 commit comments

Comments
 (0)