19
19
* u32 * macp , u8 const rk [], u32 rounds) ;
20
20
* /
21
21
ENTRY(ce_aes_ccm_auth_data)
22
- ldr w8 , [ x3 ] / * leftover from prev round? * /
22
+ frame_push 7
23
+
24
+ mov x19 , x0
25
+ mov x20 , x1
26
+ mov x21 , x2
27
+ mov x22 , x3
28
+ mov x23 , x4
29
+ mov x24 , x5
30
+
31
+ ldr w25 , [ x22 ] / * leftover from prev round? * /
23
32
ld1 {v0.16b} , [ x0 ] / * load mac * /
24
- cbz w8 , 1f
25
- sub w8 , w8 , # 16
33
+ cbz w25 , 1f
34
+ sub w25 , w25 , # 16
26
35
eor v1.16b , v1.16b , v1.16b
27
- 0 : ldrb w7 , [ x1 ], # 1 / * get 1 byte of input * /
28
- subs w2 , w2 , # 1
29
- add w8 , w8 , # 1
36
+ 0 : ldrb w7 , [ x20 ], # 1 / * get 1 byte of input * /
37
+ subs w21 , w21 , # 1
38
+ add w25 , w25 , # 1
30
39
ins v1.b [ 0 ], w7
31
40
ext v1.16b , v1.16b , v1.16b , # 1 / * rotate in the input bytes * /
32
41
beq 8f / * out of input? * /
33
- cbnz w8 , 0b
42
+ cbnz w25 , 0b
34
43
eor v0.16b , v0.16b , v1.16b
35
- 1 : ld1 {v3.4s} , [ x4 ] / * load first round key * /
36
- prfm pldl1strm , [ x1 ]
37
- cmp w5 , # 12 /* which key size? * /
38
- add x6 , x4 , # 16
39
- sub w7 , w5 , # 2 / * modified # of rounds * /
44
+ 1 : ld1 {v3.4s} , [ x23 ] / * load first round key * /
45
+ prfm pldl1strm , [ x20 ]
46
+ cmp w24 , # 12 / * which key size? * /
47
+ add x6 , x23 , # 16
48
+ sub w7 , w24 , # 2 / * modified # of rounds * /
40
49
bmi 2f
41
50
bne 5f
42
51
mov v5.16b , v3.16b
@@ -55,33 +64,43 @@ ENTRY(ce_aes_ccm_auth_data)
55
64
ld1 {v5.4s} , [ x6 ], # 16 / * load next round key * /
56
65
bpl 3b
57
66
aese v0.16b , v4.16b
58
- subs w2 , w2 , # 16 / * last data? * /
67
+ subs w21 , w21 , # 16 / * last data? * /
59
68
eor v0.16b , v0.16b , v5.16b / * final round * /
60
69
bmi 6f
61
- ld1 {v1.16b} , [ x1 ], # 16 / * load next input block * /
70
+ ld1 {v1.16b} , [ x20 ], # 16 / * load next input block * /
62
71
eor v0.16b , v0.16b , v1.16b / * xor with mac * /
63
- bne 1b
64
- 6 : st1 {v0.16b} , [ x0 ] / * store mac * /
72
+ beq 6f
73
+
74
+ if_will_cond_yield_neon
75
+ st1 {v0.16b} , [ x19 ] / * store mac * /
76
+ do_cond_yield_neon
77
+ ld1 {v0.16b} , [ x19 ] / * reload mac * /
78
+ endif_yield_neon
79
+
80
+ b 1b
81
+ 6 : st1 {v0.16b} , [ x19 ] / * store mac * /
65
82
beq 10f
66
- adds w2 , w2 , # 16
83
+ adds w21 , w21 , # 16
67
84
beq 10f
68
- mov w8 , w2
69
- 7 : ldrb w7 , [ x1 ], # 1
85
+ mov w25 , w21
86
+ 7 : ldrb w7 , [ x20 ], # 1
70
87
umov w6 , v0.b [ 0 ]
71
88
eor w6 , w6 , w7
72
- strb w6 , [ x0 ], # 1
73
- subs w2 , w2 , # 1
89
+ strb w6 , [ x19 ], # 1
90
+ subs w21 , w21 , # 1
74
91
beq 10f
75
92
ext v0.16b , v0.16b , v0.16b , # 1 / * rotate out the mac bytes * /
76
93
b 7b
77
- 8 : mov w7 , w8
78
- add w8 , w8 , # 16
94
+ 8 : mov w7 , w25
95
+ add w25 , w25 , # 16
79
96
9 : ext v1.16b , v1.16b , v1.16b , # 1
80
97
adds w7 , w7 , # 1
81
98
bne 9b
82
99
eor v0.16b , v0.16b , v1.16b
83
- st1 {v0.16b} , [ x0 ]
84
- 10 : str w8 , [ x3 ]
100
+ st1 {v0.16b} , [ x19 ]
101
+ 10 : str w25 , [ x22 ]
102
+
103
+ frame_pop
85
104
ret
86
105
ENDPROC(ce_aes_ccm_auth_data)
87
106
@@ -126,19 +145,29 @@ ENTRY(ce_aes_ccm_final)
126
145
ENDPROC(ce_aes_ccm_final)
127
146
128
147
.macro aes_ccm_do_crypt , enc
129
- ldr x8 , [ x6 , # 8 ] / * load lower ctr * /
130
- ld1 {v0.16b} , [ x5 ] / * load mac * /
131
- CPU_LE( rev x8 , x8 ) / * keep swabbed ctr in reg * /
148
+ frame_push 8
149
+
150
+ mov x19 , x0
151
+ mov x20 , x1
152
+ mov x21 , x2
153
+ mov x22 , x3
154
+ mov x23 , x4
155
+ mov x24 , x5
156
+ mov x25 , x6
157
+
158
+ ldr x26 , [ x25 , # 8 ] / * load lower ctr * /
159
+ ld1 {v0.16b} , [ x24 ] / * load mac * /
160
+ CPU_LE( rev x26 , x26 ) / * keep swabbed ctr in reg * /
132
161
0 : / * outer loop * /
133
- ld1 {v1.8b} , [ x6 ] / * load upper ctr * /
134
- prfm pldl1strm , [ x1 ]
135
- add x8 , x8 , # 1
136
- rev x9 , x8
137
- cmp w4 , # 12 /* which key size? * /
138
- sub w7 , w4 , # 2 / * get modified # of rounds * /
162
+ ld1 {v1.8b} , [ x25 ] / * load upper ctr * /
163
+ prfm pldl1strm , [ x20 ]
164
+ add x26 , x26 , # 1
165
+ rev x9 , x26
166
+ cmp w23 , # 12 / * which key size? * /
167
+ sub w7 , w23 , # 2 / * get modified # of rounds * /
139
168
ins v1.d [ 1 ], x9 / * no carry in lower ctr * /
140
- ld1 {v3.4s} , [ x3 ] / * load first round key * /
141
- add x10 , x3 , # 16
169
+ ld1 {v3.4s} , [ x22 ] / * load first round key * /
170
+ add x10 , x22 , # 16
142
171
bmi 1f
143
172
bne 4f
144
173
mov v5.16b , v3.16b
@@ -165,9 +194,9 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
165
194
bpl 2b
166
195
aese v0.16b , v4.16b
167
196
aese v1.16b , v4.16b
168
- subs w2 , w2 , # 16
169
- bmi 6f / * partial block? * /
170
- ld1 {v2.16b} , [ x1 ], # 16 / * load next input block * /
197
+ subs w21 , w21 , # 16
198
+ bmi 7f / * partial block? * /
199
+ ld1 {v2.16b} , [ x20 ], # 16 / * load next input block * /
171
200
.if \enc == 1
172
201
eor v2.16b , v2.16b , v5.16b / * final round enc + mac * /
173
202
eor v1.16b , v1.16b , v2.16b / * xor with crypted ctr * /
@@ -176,18 +205,29 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
176
205
eor v1.16b , v2.16b , v5.16b / * final round enc * /
177
206
.endif
178
207
eor v0.16b , v0.16b , v2.16b / * xor mac with pt ^ rk [ last ] * /
179
- st1 {v1.16b} , [ x0 ], # 16 / * write output block * /
180
- bne 0b
181
- CPU_LE( rev x8 , x8 )
182
- st1 {v0.16b} , [ x5 ] / * store mac * /
183
- str x8 , [ x6 , # 8 ] / * store lsb end of ctr (BE) * /
184
- 5 : ret
185
-
186
- 6 : eor v0.16b , v0.16b , v5.16b / * final round mac * /
208
+ st1 {v1.16b} , [ x19 ], # 16 / * write output block * /
209
+ beq 5f
210
+
211
+ if_will_cond_yield_neon
212
+ st1 {v0.16b} , [ x24 ] / * store mac * /
213
+ do_cond_yield_neon
214
+ ld1 {v0.16b} , [ x24 ] / * reload mac * /
215
+ endif_yield_neon
216
+
217
+ b 0b
218
+ 5 :
219
+ CPU_LE( rev x26 , x26 )
220
+ st1 {v0.16b} , [ x24 ] / * store mac * /
221
+ str x26 , [ x25 , # 8 ] / * store lsb end of ctr (BE) * /
222
+
223
+ 6 : frame_pop
224
+ ret
225
+
226
+ 7 : eor v0.16b , v0.16b , v5.16b / * final round mac * /
187
227
eor v1.16b , v1.16b , v5.16b / * final round enc * /
188
- st1 {v0.16b} , [ x5 ] / * store mac * /
189
- add w2 , w2 , # 16 / * process partial tail block * /
190
- 7 : ldrb w9 , [ x1 ], # 1 / * get 1 byte of input * /
228
+ st1 {v0.16b} , [ x24 ] / * store mac * /
229
+ add w21 , w21 , # 16 / * process partial tail block * /
230
+ 8 : ldrb w9 , [ x20 ], # 1 / * get 1 byte of input * /
191
231
umov w6 , v1.b [ 0 ] / * get top crypted ctr byte * /
192
232
umov w7 , v0.b [ 0 ] / * get top mac byte * /
193
233
.if \enc == 1
@@ -197,13 +237,13 @@ CPU_LE( rev x8, x8 )
197
237
eor w9 , w9 , w6
198
238
eor w7 , w7 , w9
199
239
.endif
200
- strb w9 , [ x0 ], # 1 / * store out byte * /
201
- strb w7 , [ x5 ], # 1 / * store mac byte * /
202
- subs w2 , w2 , # 1
203
- beq 5b
240
+ strb w9 , [ x19 ], # 1 / * store out byte * /
241
+ strb w7 , [ x24 ], # 1 / * store mac byte * /
242
+ subs w21 , w21 , # 1
243
+ beq 6b
204
244
ext v0.16b , v0.16b , v0.16b , # 1 / * shift out mac byte * /
205
245
ext v1.16b , v1.16b , v1.16b , # 1 / * shift out ctr byte * /
206
- b 7b
246
+ b 8b
207
247
.endm
208
248
209
249
/ *
0 commit comments