// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
#define REGCTXT R29
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
// R4 = a_base
// R5 = b_base
// R6 = size
JMP equalbody<>(SB)
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0
// R4 = a_base
// R5 = b_base
MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure
JMP equalbody<>(SB)
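
// equalbody is the common body of memequal and memequal_varlen. It
// compares size (R6) bytes at a_base (R4) and b_base (R5) and returns
// the result in R4: 1 if equal, 0 if not.
//
// Sizes below 64 bytes are compared with scalar loads (see tail).
// Larger sizes use 256-bit LASX vectors when available, falling back
// to 128-bit LSX vectors, and finally to a 64-byte scalar loop.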
// input:
//	R4 = a_base
//	R5 = b_base
//	R6 = size
TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0
	// a_base == b_base
	BEQ	R4, R5, equal

	// 0 bytes
	BEQ	R6, equal

	MOVV	$64, R7
	BGE	R6, R7, lasx

	// size < 64 bytes
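	// The remainder is compared with scalar loads, stepping down
	// through 16-, 8-, 4-, 2- and 1-byte chunks.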
tail:
	MOVV	$16, R7
	BLT	R6, R7, lt_16
generic16_loop:
	ADDV	$-16, R6
	MOVV	0(R4), R8
	MOVV	8(R4), R9
	MOVV	0(R5), R10
	MOVV	8(R5), R11
	BNE	R8, R10, not_equal
	BNE	R9, R11, not_equal
	BEQ	R6, equal
	ADDV	$16, R4
	ADDV	$16, R5
	BGE	R6, R7, generic16_loop

	// size < 16 bytes
lt_16:
	MOVV	$8, R7
	BLT	R6, R7, lt_8
	ADDV	$-8, R6
	MOVV	0(R4), R8
	MOVV	0(R5), R9
	BNE	R8, R9, not_equal
	BEQ	R6, equal
	ADDV	$8, R4
	ADDV	$8, R5

	// size < 8 bytes
lt_8:
	MOVV	$4, R7
	BLT	R6, R7, lt_4
	ADDV	$-4, R6
	MOVW	0(R4), R8
	MOVW	0(R5), R9
	BNE	R8, R9, not_equal
	BEQ	R6, equal
	ADDV	$4, R4
	ADDV	$4, R5

	// size < 4 bytes
lt_4:
	MOVV	$2, R7
	BLT	R6, R7, lt_2
	ADDV	$-2, R6
	MOVH	0(R4), R8
	MOVH	0(R5), R9
	BNE	R8, R9, not_equal
	BEQ	R6, equal
	ADDV	$2, R4
	ADDV	$2, R5

	// size < 2 bytes
lt_2:
	MOVB	0(R4), R8
	MOVB	0(R5), R9
	BNE	R8, R9, not_equal
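
	// The result is returned in R4: 1 for equal, 0 (R0, the zero
	// register) for not equal.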
equal:
	MOVV	$1, R4
	RET

not_equal:
	MOVV	R0, R4
	RET

	// Implemented using 256-bit SIMD instructions
lasx:
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
	BEQ	R7, lsx
lasx256:
	MOVV	$256, R7
	BLT	R6, R7, lasx64
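	// Main LASX loop: 256 bytes per iteration. Each XVSEQV sets the
	// lanes where the two vectors match to all ones and differing
	// lanes to zero; the results are AND-reduced and XVSETALLNEV
	// sets FCC0 only if every lane is nonzero, so BFPF branches to
	// not_equal as soon as any chunk differs.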
lasx256_loop:
	ADDV	$-256, R6
	XVMOVQ	0(R4), X0
	XVMOVQ	32(R4), X1
	XVMOVQ	64(R4), X2
	XVMOVQ	96(R4), X3
	XVMOVQ	128(R4), X4
	XVMOVQ	160(R4), X5
	XVMOVQ	192(R4), X6
	XVMOVQ	224(R4), X7
	XVMOVQ	0(R5), X8
	XVMOVQ	32(R5), X9
	XVMOVQ	64(R5), X10
	XVMOVQ	96(R5), X11
	XVMOVQ	128(R5), X12
	XVMOVQ	160(R5), X13
	XVMOVQ	192(R5), X14
	XVMOVQ	224(R5), X15
	XVSEQV	X0, X8, X0
	XVSEQV	X1, X9, X1
	XVSEQV	X2, X10, X2
	XVSEQV	X3, X11, X3
	XVSEQV	X4, X12, X4
	XVSEQV	X5, X13, X5
	XVSEQV	X6, X14, X6
	XVSEQV	X7, X15, X7
	XVANDV	X0, X1, X0
	XVANDV	X2, X3, X2
	XVANDV	X4, X5, X4
	XVANDV	X6, X7, X6
	XVANDV	X0, X2, X0
	XVANDV	X4, X6, X4
	XVANDV	X0, X4, X0
	XVSETALLNEV	X0, FCC0
	BFPF	not_equal
	BEQ	R6, equal
	ADDV	$256, R4
	ADDV	$256, R5
	BGE	R6, R7, lasx256_loop

lasx64:
	MOVV	$64, R7
	BLT	R6, R7, tail
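	// Compare what remains 64 bytes at a time using two 32-byte
	// vectors per operand; anything below 64 bytes is finished in
	// the scalar tail.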
lasx64_loop:
	ADDV	$-64, R6
	XVMOVQ	0(R4), X0
	XVMOVQ	32(R4), X1
	XVMOVQ	0(R5), X2
	XVMOVQ	32(R5), X3
	XVSEQV	X0, X2, X0
	XVSEQV	X1, X3, X1
	XVANDV	X0, X1, X0
	XVSETALLNEV	X0, FCC0
	BFPF	not_equal
	BEQ	R6, equal
	ADDV	$64, R4
	ADDV	$64, R5
	BGE	R6, R7, lasx64_loop
	JMP	tail

	// Implemented using 128-bit SIMD instructions
lsx:
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
	BEQ	R7, generic64_loop
lsx128:
	MOVV	$128, R7
	BLT	R6, R7, lsx32
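	// Main LSX loop: the same compare-and-reduce idiom as the LASX
	// path, but with 128-bit vector registers, handling 128 bytes
	// per iteration.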
lsx128_loop:
	ADDV	$-128, R6
	VMOVQ	0(R4), V0
	VMOVQ	16(R4), V1
	VMOVQ	32(R4), V2
	VMOVQ	48(R4), V3
	VMOVQ	64(R4), V4
	VMOVQ	80(R4), V5
	VMOVQ	96(R4), V6
	VMOVQ	112(R4), V7
	VMOVQ	0(R5), V8
	VMOVQ	16(R5), V9
	VMOVQ	32(R5), V10
	VMOVQ	48(R5), V11
	VMOVQ	64(R5), V12
	VMOVQ	80(R5), V13
	VMOVQ	96(R5), V14
	VMOVQ	112(R5), V15
	VSEQV	V0, V8, V0
	VSEQV	V1, V9, V1
	VSEQV	V2, V10, V2
	VSEQV	V3, V11, V3
	VSEQV	V4, V12, V4
	VSEQV	V5, V13, V5
	VSEQV	V6, V14, V6
	VSEQV	V7, V15, V7
	VANDV	V0, V1, V0
	VANDV	V2, V3, V2
	VANDV	V4, V5, V4
	VANDV	V6, V7, V6
	VANDV	V0, V2, V0
	VANDV	V4, V6, V4
	VANDV	V0, V4, V0
	VSETALLNEV	V0, FCC0
	BFPF	not_equal
	BEQ	R6, equal
	ADDV	$128, R4
	ADDV	$128, R5
	BGE	R6, R7, lsx128_loop

lsx32:
	MOVV	$32, R7
	BLT	R6, R7, tail
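	// Compare what remains 32 bytes at a time using two 16-byte
	// vectors per operand; anything below 32 bytes is finished in
	// the scalar tail.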
lsx32_loop:
	ADDV	$-32, R6
	VMOVQ	0(R4), V0
	VMOVQ	16(R4), V1
	VMOVQ	0(R5), V2
	VMOVQ	16(R5), V3
	VSEQV	V0, V2, V0
	VSEQV	V1, V3, V1
	VANDV	V0, V1, V0
	VSETALLNEV	V0, FCC0
	BFPF	not_equal
	BEQ	R6, equal
	ADDV	$32, R4
	ADDV	$32, R5
	BGE	R6, R7, lsx32_loop
	JMP	tail

	// Implemented using general instructions
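	// Fallback when neither LASX nor LSX is available: compare 64
	// bytes per iteration using eight 8-byte loads per operand,
	// checked in two groups of four.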
generic64_loop:
	ADDV	$-64, R6
	MOVV	0(R4), R7
	MOVV	8(R4), R8
	MOVV	16(R4), R9
	MOVV	24(R4), R10
	MOVV	0(R5), R15
	MOVV	8(R5), R16
	MOVV	16(R5), R17
	MOVV	24(R5), R18
	BNE	R7, R15, not_equal
	BNE	R8, R16, not_equal
	BNE	R9, R17, not_equal
	BNE	R10, R18, not_equal
	MOVV	32(R4), R11
	MOVV	40(R4), R12
	MOVV	48(R4), R13
	MOVV	56(R4), R14
	MOVV	32(R5), R19
	MOVV	40(R5), R20
	MOVV	48(R5), R21
	MOVV	56(R5), R23
	BNE	R11, R19, not_equal
	BNE	R12, R20, not_equal
	BNE	R13, R21, not_equal
	BNE	R14, R23, not_equal
	BEQ	R6, equal
	ADDV	$64, R4
	ADDV	$64, R5
	MOVV	$64, R7
	BGE	R6, R7, generic64_loop
	JMP	tail