Skip to content

Commit 83dee2c

Browse files
Ard Biesheuvelherbertx
authored andcommitted
crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize
The way the KECCAK transform is currently coded involves many references into the state array using indexes that are calculated at runtime using simple but non-trivial arithmetic. This forces the compiler to treat the state matrix as an array in memory rather than keep it in registers, which results in poor performance. So instead, let's rephrase the algorithm using fixed array indexes only. This helps the compiler keep the state matrix in registers, resulting in the following speedup (SHA3-256 performance in cycles per byte): before after speedup Intel Core i7 @ 2.0 GHz (2.9 turbo) 100.6 35.7 2.8x Cortex-A57 @ 2.0 GHz (64-bit mode) 101.6 12.7 8.0x Cortex-A53 @ 1.0 GHz 224.4 15.8 14.2x Cortex-A57 @ 2.0 GHz (32-bit mode) 201.8 63.0 3.2x Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
1 parent c013cee commit 83dee2c

File tree

1 file changed

+96
-38
lines changed

1 file changed

+96
-38
lines changed

crypto/sha3_generic.c

Lines changed: 96 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
66
*
77
* SHA-3 code by Jeff Garzik <jeff@garzik.org>
8+
* Ard Biesheuvel <ard.biesheuvel@linaro.org>
89
*
910
* This program is free software; you can redistribute it and/or modify it
1011
* under the terms of the GNU General Public License as published by the Free
@@ -22,8 +23,6 @@
2223

2324
#define KECCAK_ROUNDS 24
2425

25-
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
26-
2726
static const u64 keccakf_rndc[24] = {
2827
0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
2928
0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
@@ -35,53 +34,112 @@ static const u64 keccakf_rndc[24] = {
3534
0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
3635
};
3736

38-
static const int keccakf_rotc[24] = {
39-
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
40-
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
41-
};
42-
43-
static const int keccakf_piln[24] = {
44-
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
45-
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
46-
};
47-
4837
/* update the state with given number of rounds */
4938

50-
static void keccakf(u64 st[25])
39+
static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
5140
{
52-
int i, j, round;
53-
u64 t, bc[5];
41+
u64 t[5], tt, bc[5];
42+
int round;
5443

5544
for (round = 0; round < KECCAK_ROUNDS; round++) {
5645

5746
/* Theta */
58-
for (i = 0; i < 5; i++)
59-
bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15]
60-
^ st[i + 20];
61-
62-
for (i = 0; i < 5; i++) {
63-
t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
64-
for (j = 0; j < 25; j += 5)
65-
st[j + i] ^= t;
66-
}
47+
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
48+
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
49+
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
50+
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
51+
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
52+
53+
t[0] = bc[4] ^ rol64(bc[1], 1);
54+
t[1] = bc[0] ^ rol64(bc[2], 1);
55+
t[2] = bc[1] ^ rol64(bc[3], 1);
56+
t[3] = bc[2] ^ rol64(bc[4], 1);
57+
t[4] = bc[3] ^ rol64(bc[0], 1);
58+
59+
st[0] ^= t[0];
6760

6861
/* Rho Pi */
69-
t = st[1];
70-
for (i = 0; i < 24; i++) {
71-
j = keccakf_piln[i];
72-
bc[0] = st[j];
73-
st[j] = ROTL64(t, keccakf_rotc[i]);
74-
t = bc[0];
75-
}
62+
tt = st[1];
63+
st[ 1] = rol64(st[ 6] ^ t[1], 44);
64+
st[ 6] = rol64(st[ 9] ^ t[4], 20);
65+
st[ 9] = rol64(st[22] ^ t[2], 61);
66+
st[22] = rol64(st[14] ^ t[4], 39);
67+
st[14] = rol64(st[20] ^ t[0], 18);
68+
st[20] = rol64(st[ 2] ^ t[2], 62);
69+
st[ 2] = rol64(st[12] ^ t[2], 43);
70+
st[12] = rol64(st[13] ^ t[3], 25);
71+
st[13] = rol64(st[19] ^ t[4], 8);
72+
st[19] = rol64(st[23] ^ t[3], 56);
73+
st[23] = rol64(st[15] ^ t[0], 41);
74+
st[15] = rol64(st[ 4] ^ t[4], 27);
75+
st[ 4] = rol64(st[24] ^ t[4], 14);
76+
st[24] = rol64(st[21] ^ t[1], 2);
77+
st[21] = rol64(st[ 8] ^ t[3], 55);
78+
st[ 8] = rol64(st[16] ^ t[1], 45);
79+
st[16] = rol64(st[ 5] ^ t[0], 36);
80+
st[ 5] = rol64(st[ 3] ^ t[3], 28);
81+
st[ 3] = rol64(st[18] ^ t[3], 21);
82+
st[18] = rol64(st[17] ^ t[2], 15);
83+
st[17] = rol64(st[11] ^ t[1], 10);
84+
st[11] = rol64(st[ 7] ^ t[2], 6);
85+
st[ 7] = rol64(st[10] ^ t[0], 3);
86+
st[10] = rol64( tt ^ t[1], 1);
7687

7788
/* Chi */
78-
for (j = 0; j < 25; j += 5) {
79-
for (i = 0; i < 5; i++)
80-
bc[i] = st[j + i];
81-
for (i = 0; i < 5; i++)
82-
st[j + i] ^= (~bc[(i + 1) % 5]) &
83-
bc[(i + 2) % 5];
84-
}
89+
bc[ 0] = ~st[ 1] & st[ 2];
90+
bc[ 1] = ~st[ 2] & st[ 3];
91+
bc[ 2] = ~st[ 3] & st[ 4];
92+
bc[ 3] = ~st[ 4] & st[ 0];
93+
bc[ 4] = ~st[ 0] & st[ 1];
94+
st[ 0] ^= bc[ 0];
95+
st[ 1] ^= bc[ 1];
96+
st[ 2] ^= bc[ 2];
97+
st[ 3] ^= bc[ 3];
98+
st[ 4] ^= bc[ 4];
99+
100+
bc[ 0] = ~st[ 6] & st[ 7];
101+
bc[ 1] = ~st[ 7] & st[ 8];
102+
bc[ 2] = ~st[ 8] & st[ 9];
103+
bc[ 3] = ~st[ 9] & st[ 5];
104+
bc[ 4] = ~st[ 5] & st[ 6];
105+
st[ 5] ^= bc[ 0];
106+
st[ 6] ^= bc[ 1];
107+
st[ 7] ^= bc[ 2];
108+
st[ 8] ^= bc[ 3];
109+
st[ 9] ^= bc[ 4];
110+
111+
bc[ 0] = ~st[11] & st[12];
112+
bc[ 1] = ~st[12] & st[13];
113+
bc[ 2] = ~st[13] & st[14];
114+
bc[ 3] = ~st[14] & st[10];
115+
bc[ 4] = ~st[10] & st[11];
116+
st[10] ^= bc[ 0];
117+
st[11] ^= bc[ 1];
118+
st[12] ^= bc[ 2];
119+
st[13] ^= bc[ 3];
120+
st[14] ^= bc[ 4];
121+
122+
bc[ 0] = ~st[16] & st[17];
123+
bc[ 1] = ~st[17] & st[18];
124+
bc[ 2] = ~st[18] & st[19];
125+
bc[ 3] = ~st[19] & st[15];
126+
bc[ 4] = ~st[15] & st[16];
127+
st[15] ^= bc[ 0];
128+
st[16] ^= bc[ 1];
129+
st[17] ^= bc[ 2];
130+
st[18] ^= bc[ 3];
131+
st[19] ^= bc[ 4];
132+
133+
bc[ 0] = ~st[21] & st[22];
134+
bc[ 1] = ~st[22] & st[23];
135+
bc[ 2] = ~st[23] & st[24];
136+
bc[ 3] = ~st[24] & st[20];
137+
bc[ 4] = ~st[20] & st[21];
138+
st[20] ^= bc[ 0];
139+
st[21] ^= bc[ 1];
140+
st[22] ^= bc[ 2];
141+
st[23] ^= bc[ 3];
142+
st[24] ^= bc[ 4];
85143

86144
/* Iota */
87145
st[0] ^= keccakf_rndc[round];

0 commit comments

Comments
 (0)