-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharith_arm64.s
374 lines (365 loc) · 6.47 KB
/
arith_arm64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// func addVV(z, x, y []Word) (c Word)
//
// addVV computes z[i] = x[i] + y[i] + carry for i in [0, len(z)), word by
// word from the lowest address upward, and returns the final carry (0 or 1)
// in c. Only len(z) is read; x and y are presumably at least that long
// (the code does not check) — confirm against the callers.
//
// Register roles:
//   R0         number of 4-word iterations (len(z) >> 2)
//   R1, R2, R3 running pointers into x, y and z
//   R4         number of single-word iterations (len(z) & 3); reused as a
//              data register once the 1X loop is finished
//   R5-R11     data words
//   C flag     carry propagated from word to word; it must stay live from
//              the ADDS below until the final ADC, so nothing in between may
//              set the flags except the ADCS chain itself
TEXT ·addVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
MOVD x_base+24(FP), R1
MOVD y_base+48(FP), R2
MOVD z_base+0(FP), R3
// compute unrolled loop lengths
// R4 = len & 3 (leftover words, handled first), R0 = len >> 2 (groups of 4)
AND $3, R0, R4
LSR $2, R0
// Adding zero cannot carry out, so this leaves C = 0 without changing R0.
ADDS ZR, R0 // clear carry
loop1:
CBZ R4, loop1done
loop1cont:
// unroll 1X
// Post-increment loads/stores: use the pointer, then advance it by 8 bytes.
MOVD.P 8(R1), R5
MOVD.P 8(R2), R6
// R5 = R5 + R6 + C, carry out goes back into C
ADCS R6, R5
MOVD.P R5, 8(R3)
// SUB (without the S suffix) and CBNZ do not modify the flags, so the carry
// survives the loop bookkeeping.
SUB $1, R4
CBNZ R4, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
// The first LDP of each pair advances the pointer by 32 bytes after loading
// words 0 and 1; the second LDP then reaches back 16 bytes for words 2 and 3.
LDP.P 32(R1), (R4, R5)
LDP -16(R1), (R6, R7)
LDP.P 32(R2), (R8, R9)
LDP -16(R2), (R10, R11)
// Four chained add-with-carry steps, lowest word first.
ADCS R8, R4
ADCS R9, R5
ADCS R10, R6
ADCS R11, R7
// Same addressing trick for the stores: words 0,1 with post-increment,
// then words 2,3 at -16 from the advanced pointer.
STP.P (R4, R5), 32(R3)
STP (R6, R7), -16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// R1 = 0 + 0 + C, i.e. the carry flag materialised as 0 or 1.
ADC ZR, ZR, R1 // save & convert add carry
MOVD R1, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
//
// subVV computes z[i] = x[i] - y[i] - borrow for i in [0, len(z)), word by
// word from the lowest address upward, and returns the final borrow (0 or 1)
// in c. Only len(z) is read; x and y are presumably at least that long
// (the code does not check) — confirm against the callers.
//
// Register roles:
//   R0         number of 4-word iterations (len(z) >> 2)
//   R1, R2, R3 running pointers into x, y and z
//   R4         number of single-word iterations (len(z) & 3); reused as a
//              data register once the 1X loop is finished
//   R5-R11     data words
//   C flag     inverted borrow: on arm64 a subtraction sets C = 1 when there
//              is NO borrow. It must stay live from the SUBS below until the
//              final SBC.
TEXT ·subVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
MOVD x_base+24(FP), R1
MOVD y_base+48(FP), R2
MOVD z_base+0(FP), R3
// compute unrolled loop lengths
// R4 = len & 3 (leftover words, handled first), R0 = len >> 2 (groups of 4)
AND $3, R0, R4
LSR $2, R0
// Subtracting zero never borrows, so this leaves the flags in the
// "no borrow pending" state (C = 1) without changing R0.
SUBS ZR, R0 // clear carry
loop1:
CBZ R4, loop1done
loop1cont:
// unroll 1X
// Post-increment loads/stores: use the pointer, then advance it by 8 bytes.
MOVD.P 8(R1), R5
MOVD.P 8(R2), R6
// R5 = R5 - R6 - borrow, borrow out goes back into the flags
SBCS R6, R5
MOVD.P R5, 8(R3)
// SUB (without the S suffix) and CBNZ do not modify the flags, so the borrow
// survives the loop bookkeeping.
SUB $1, R4
CBNZ R4, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
// The first LDP of each pair advances the pointer by 32 bytes after loading
// words 0 and 1; the second LDP then reaches back 16 bytes for words 2 and 3.
LDP.P 32(R1), (R4, R5)
LDP -16(R1), (R6, R7)
LDP.P 32(R2), (R8, R9)
LDP -16(R2), (R10, R11)
// Four chained subtract-with-borrow steps, lowest word first (x word minus y word).
SBCS R8, R4
SBCS R9, R5
SBCS R10, R6
SBCS R11, R7
// Words 0,1 are stored with post-increment, words 2,3 at -16 from the
// advanced pointer.
STP.P (R4, R5), 32(R3)
STP (R6, R7), -16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// R1 = R1 - R1 - borrow: 0 when no borrow is pending, -1 (all ones) otherwise.
SBC R1, R1 // save carry
// R1 = 0 - R1, turning 0/-1 into the 0/1 value returned to the caller.
SUB R1, ZR, R1 // convert sub carry
MOVD R1, c+72(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
//
// lshVU shifts the len(z)-word vector x left by s bits into z and returns in
// c the bits shifted out of the top word. When len(z) == 0 it returns c = 0
// without touching memory. The vectors are walked from the highest address
// down to the lowest, each output word being
//   z[i] = x[i]<<s | x[i-1]>>(64-s)
// with z[0] = x[0]<<s.
//
// NOTE(review): the complementary count 64-s is used as a register shift
// amount, so the routine appears to assume 0 < s < 64; s == 0 would ask for a
// shift by 64. Confirm that callers never pass s == 0.
//
// Register roles:
//   R0     remaining word count, later the number of 4-word iterations
//   R1     s
//   R2, R3 running pointers into x and z (start one past the end, move down)
//   R4     already-shifted high part (word<<s) waiting for the low bits of
//          the next lower word
//   R5     64 - s
//   R6     carry-out first, then the number of single-word iterations, then
//          data inside the 4X loop
//   R7-R10 data words
TEXT ·lshVU(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
CBZ R0, ret0
MOVD s+48(FP), R1
MOVD x_base+24(FP), R2
MOVD z_base+0(FP), R3
// run loop backward
// Point R2/R3 one word past the end of x/z (base + len*8).
ADD R0<<3, R2, R2
ADD R0<<3, R3, R3
// shift first word into carry
// Pre-decrement load: step R2 back 8 bytes, then load the top word of x.
MOVD.W -8(R2), R4
MOVD $64, R5
SUB R1, R5
// R6 = top word >> (64-s): the bits that fall off the top, returned as c.
LSR R5, R4, R6
// R4 = top word << s: pending high part of the top output word.
LSL R1, R4
MOVD R6, c+56(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
// R6 = remaining & 3 (handled first), R0 = remaining >> 2 (groups of 4)
AND $3, R0, R6
LSR $2, R0
loop1:
CBZ R6, loop1done
loop1cont:
// unroll 1X
// Load the next lower word, merge its top bits into the pending word,
// store that, and keep the new word<<s as the next pending value.
MOVD.W -8(R2), R7
LSR R5, R7, R8
ORR R4, R8
LSL R1, R7, R4
MOVD.W R8, -8(R3)
SUB $1, R6
CBNZ R6, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
// Step R2 back 32 bytes and load four words. In address order they land in
// R9, R8, R7, R6, so R6 is the highest word and is processed first.
LDP.W -32(R2), (R9, R8)
LDP 16(R2), (R7, R6)
// Each group of three instructions is one 1X step; the finished output
// words end up, highest to lowest, in R10, R6, R7, R8.
LSR R5, R6, R10
ORR R4, R10
LSL R1, R6, R4
LSR R5, R7, R6
ORR R4, R6
LSL R1, R7, R4
LSR R5, R8, R7
ORR R4, R7
LSL R1, R8, R4
LSR R5, R9, R8
ORR R4, R8
LSL R1, R9, R4
// Step R3 back 32 bytes and store the four results in address order
// R8, R7, R6, R10.
STP.W (R8, R7), -32(R3)
STP (R6, R10), 16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// store final shifted bits
// The lowest output word has no lower neighbour: it is just x[0]<<s.
MOVD.W R4, -8(R3)
RET
ret0:
// Empty vector: nothing to shift, carry-out is zero.
MOVD ZR, c+56(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
//
// rshVU shifts the len(z)-word vector x right by s bits into z and returns in
// c the bits shifted out of the bottom word, left-aligned (x[0]<<(64-s)).
// When len(z) == 0 it returns c = 0 without touching memory. The vectors are
// walked from the lowest address upward, each output word being
//   z[i] = x[i]>>s | x[i+1]<<(64-s)
// with the last word z[len-1] = x[len-1]>>s.
//
// NOTE(review): the complementary count 64-s is used as a register shift
// amount, so the routine appears to assume 0 < s < 64; s == 0 would ask for a
// shift by 64. Confirm that callers never pass s == 0.
//
// Register roles:
//   R0     remaining word count, later the number of 4-word iterations
//   R1     s
//   R2, R3 running pointers into x and z (move upward)
//   R4     already-shifted low part (word>>s) waiting for the high bits
//          supplied by the next higher word
//   R5     64 - s
//   R6     carry-out first, then the number of single-word iterations, then
//          data inside the 4X loop
//   R7-R10 data words
TEXT ·rshVU(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
CBZ R0, ret0
MOVD s+48(FP), R1
MOVD x_base+24(FP), R2
MOVD z_base+0(FP), R3
// shift first word into carry
// Post-increment load: read x[0], then advance R2 by 8 bytes.
MOVD.P 8(R2), R4
MOVD $64, R5
SUB R1, R5
// R6 = x[0] << (64-s): the bits that fall off the bottom, returned as c.
LSL R5, R4, R6
// R4 = x[0] >> s: pending low part of the first output word.
LSR R1, R4
MOVD R6, c+56(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
// R6 = remaining & 3 (handled first), R0 = remaining >> 2 (groups of 4)
AND $3, R0, R6
LSR $2, R0
loop1:
CBZ R6, loop1done
loop1cont:
// unroll 1X
// Load the next higher word, merge its low bits (moved to the top) into the
// pending word, store that, and keep the new word>>s as the next pending value.
MOVD.P 8(R2), R7
LSL R5, R7, R8
ORR R4, R8
LSR R1, R7, R4
MOVD.P R8, 8(R3)
SUB $1, R6
CBNZ R6, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
// Load four words in address order into R6, R7, R8, R9; the first LDP
// advances R2 by 32 bytes, the second reaches back 16 bytes.
LDP.P 32(R2), (R6, R7)
LDP -16(R2), (R8, R9)
// Each group of three instructions is one 1X step; the finished output
// words end up, lowest to highest, in R10, R6, R7, R8.
LSL R5, R6, R10
ORR R4, R10
LSR R1, R6, R4
LSL R5, R7, R6
ORR R4, R6
LSR R1, R7, R4
LSL R5, R8, R7
ORR R4, R7
LSR R1, R8, R4
LSL R5, R9, R8
ORR R4, R8
LSR R1, R9, R4
// Store the four results in address order; the first STP advances R3 by 32.
STP.P (R10, R6), 32(R3)
STP (R7, R8), -16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// store final shifted bits
// The highest output word has no higher neighbour: it is just x[len-1]>>s.
MOVD.P R4, 8(R3)
RET
ret0:
// Empty vector: nothing to shift, carry-out is zero.
MOVD ZR, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// mulAddVWW computes z = x*m + a over len(z) words, lowest word first, and
// returns the word that overflows past the top of z in c. For each word the
// 128-bit product x[i]*m is formed; its low half plus the incoming carry word
// is stored to z[i], and its high half plus the carry bit becomes the carry
// word for the next position. With len(z) == 0 both loops are skipped and
// c = a. Only len(z) is read; x is presumably at least that long — confirm
// against the callers.
//
// Register roles:
//   R0       m
//   R1       running carry word (starts as a, ends as c)
//   R2       len(z), then the number of 8-word iterations
//   R3, R4   running pointers into x and z
//   R5       number of single-word iterations (len(z) & 7); data in loop8
//   R6-R12   data words / low product halves
//   R13, R14 high product halves, used alternately in loop8
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD m+48(FP), R0
MOVD a+56(FP), R1
MOVD z_len+8(FP), R2
MOVD x_base+24(FP), R3
MOVD z_base+0(FP), R4
// compute unrolled loop lengths
// R5 = len & 7 (leftover words, handled first), R2 = len >> 3 (groups of 8)
AND $7, R2, R5
LSR $3, R2
loop1:
CBZ R5, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R3), R6
// multiply
// R7 = high 64 bits of x[i]*m; must be taken before MUL overwrites R6.
UMULH R0, R6, R7
// R6 = low 64 bits of x[i]*m
MUL R0, R6
// low half + incoming carry word; the carry bit goes to C
ADDS R1, R6
// next carry word = high half + carry bit
ADC ZR, R7, R1
MOVD.P R6, 8(R4)
SUB $1, R5
CBNZ R5, loop1cont
loop1done:
loop8:
CBZ R2, loop8done
loop8cont:
// unroll 8X
// Load eight words of x into R5..R12; the first LDP advances R3 by 64
// bytes, the others address backwards from the advanced pointer.
LDP.P 64(R3), (R5, R6)
LDP -48(R3), (R7, R8)
LDP -32(R3), (R9, R10)
LDP -16(R3), (R11, R12)
// multiply
// One carry chain across all eight words: each low half is added to the
// previous word's high half plus the carry bit. UMULH and MUL do not touch
// the flags, so they can sit between the ADCS steps. R13 and R14 alternate
// as the holder of the most recent high half.
UMULH R0, R5, R13
MUL R0, R5
ADDS R1, R5
UMULH R0, R6, R14
MUL R0, R6
ADCS R13, R6
UMULH R0, R7, R13
MUL R0, R7
ADCS R14, R7
UMULH R0, R8, R14
MUL R0, R8
ADCS R13, R8
UMULH R0, R9, R13
MUL R0, R9
ADCS R14, R9
UMULH R0, R10, R14
MUL R0, R10
ADCS R13, R10
UMULH R0, R11, R13
MUL R0, R11
ADCS R14, R11
UMULH R0, R12, R14
MUL R0, R12
ADCS R13, R12
// carry word for the next group = last high half + final carry bit
ADC ZR, R14, R1
// Store the eight result words; the first STP advances R4 by 64 bytes.
STP.P (R5, R6), 64(R4)
STP (R7, R8), -48(R4)
STP (R9, R10), -32(R4)
STP (R11, R12), -16(R4)
SUB $1, R2
CBNZ R2, loop8cont
loop8done:
MOVD R1, c+64(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// addMulVVWW computes z = x + y*m + a over len(z) words, lowest word first,
// and returns the word that overflows past the top of z in c. For each word
// the 128-bit product y[i]*m is formed; its low half plus the incoming carry
// word plus x[i] is stored to z[i], and its high half plus the carry bits
// from both additions becomes the carry word for the next position. With
// len(z) == 0 both loops are skipped and c = a. Only len(z) is read; x and y
// are presumably at least that long — confirm against the callers.
//
// Register roles:
//   R0               m
//   R1               running carry word (starts as a, ends as c)
//   R2               len(z), then the number of 8-word iterations
//   R3, R4, R5       running pointers into x, y and z
//   R6               number of single-word iterations (len(z) & 7); data in loop8
//   R6-R13           words of x in loop8
//   R14-R17, R19-R22 words of y / results in loop8 (R18 is not used,
//                    presumably because it is the platform-reserved register)
//   R23, R24         high product halves, used alternately in loop8
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVD m+72(FP), R0
MOVD a+80(FP), R1
MOVD z_len+8(FP), R2
MOVD x_base+24(FP), R3
MOVD y_base+48(FP), R4
MOVD z_base+0(FP), R5
// compute unrolled loop lengths
// R6 = len & 7 (leftover words, handled first), R2 = len >> 3 (groups of 8)
AND $7, R2, R6
LSR $3, R2
loop1:
CBZ R6, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R3), R7
MOVD.P 8(R4), R8
// multiply
// R9:R8 = y[i]*m (UMULH first, because MUL overwrites R8)
UMULH R0, R8, R9
MUL R0, R8
// low half + incoming carry word, then fold the carry bit into the high half
ADDS R1, R8
ADC ZR, R9, R1
// add
// add x[i] and fold that carry bit into the carry word as well
ADDS R7, R8
ADC ZR, R1
MOVD.P R8, 8(R5)
SUB $1, R6
CBNZ R6, loop1cont
loop1done:
loop8:
CBZ R2, loop8done
loop8cont:
// unroll 8X
// Load eight words of x into R6..R13 and eight words of y into
// R14..R17, R19..R22. The first LDP of each group advances its pointer by
// 64 bytes; the others address backwards from the advanced pointer.
LDP.P 64(R3), (R6, R7)
LDP -48(R3), (R8, R9)
LDP -32(R3), (R10, R11)
LDP -16(R3), (R12, R13)
LDP.P 64(R4), (R14, R15)
LDP -48(R4), (R16, R17)
LDP -32(R4), (R19, R20)
LDP -16(R4), (R21, R22)
// multiply
// First carry chain: y*m + carry word. Each low half is added to the
// previous word's high half plus the carry bit; UMULH and MUL do not touch
// the flags. R23 and R24 alternate as the holder of the latest high half.
UMULH R0, R14, R23
MUL R0, R14
ADDS R1, R14
UMULH R0, R15, R24
MUL R0, R15
ADCS R23, R15
UMULH R0, R16, R23
MUL R0, R16
ADCS R24, R16
UMULH R0, R17, R24
MUL R0, R17
ADCS R23, R17
UMULH R0, R19, R23
MUL R0, R19
ADCS R24, R19
UMULH R0, R20, R24
MUL R0, R20
ADCS R23, R20
UMULH R0, R21, R23
MUL R0, R21
ADCS R24, R21
UMULH R0, R22, R24
MUL R0, R22
ADCS R23, R22
// carry word so far = last high half + final carry bit of the first chain
ADC ZR, R24, R1
// add
// Second carry chain: add the eight words of x to the products, then fold
// the final carry bit into the carry word.
ADDS R6, R14
ADCS R7, R15
ADCS R8, R16
ADCS R9, R17
ADCS R10, R19
ADCS R11, R20
ADCS R12, R21
ADCS R13, R22
ADC ZR, R1
// Store the eight result words; the first STP advances R5 by 64 bytes.
STP.P (R14, R15), 64(R5)
STP (R16, R17), -48(R5)
STP (R19, R20), -32(R5)
STP (R21, R22), -16(R5)
SUB $1, R2
CBNZ R2, loop8cont
loop8done:
MOVD R1, c+88(FP)
RET