-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount_arm64.s
94 lines (91 loc) · 2.09 KB
/
count_arm64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func Count(b []byte, c byte) int
//
// Count reports how many bytes of b are equal to c. It does no counting
// itself: it moves c into the register CountString expects and then
// branches to CountString, which produces the result.
// input:
// R0: b ptr
// R1: b len
// R2: b cap (not used; overwritten below)
// R3: c byte to search
// return:
// R0: result
TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
// CountString takes the byte to search in R2, so copy it out of R3.
// This discards the slice capacity, which counting does not need.
MOVD R3, R2
// R0 (ptr) and R1 (len) are already where CountString wants them.
// This is a plain branch, not a call, so CountString's RET returns
// directly to Count's caller.
B ·CountString<ABIInternal>(SB)
// func CountString(s string, c byte) int
//
// CountString reports how many bytes of s are equal to c.
//
// Strategy:
//   - length 0: return 0 immediately.
//   - length < 32: compare one byte at a time (tail).
//   - length >= 32: compare one byte at a time until the pointer is
//     32-byte aligned (head), then process whole 32-byte chunks with
//     vector instructions (chunk_loop), then finish any leftover bytes
//     one at a time (tail).
// input:
// R0: s ptr
// R1: s len
// R2: c byte to search (due to ABIInternal upper bits can contain junk)
// return:
// R0: result
// Registers written: R0, R1, R3, R5, R6, R9, R11, V0-V8 and the condition flags.
TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
// R11 = count of byte to search
MOVD $0, R11
// short path to handle 0-byte case
CBZ R1, done
CMP $0x20, R1
// jump directly to head if length >= 32 (unsigned compare)
BHS head
tail:
// Work with tail shorter than 32 bytes.
// R1 is non-zero on every entry to this label: the loop decrements R1
// before testing it, so entering with R1 == 0 would not terminate correctly.
// Load one byte (zero-extended) into R5 and post-increment R0 by 1.
MOVBU.P 1(R0), R5
SUB $1, R1, R1
// Compare against only the low byte of R2 (zero-extended), because the
// upper bits of R2 may hold junk.
CMP R2.UXTB, R5
// R11++ if the bytes were equal
CINC EQ, R11, R11
CBNZ R1, tail
done:
// Return the accumulated count.
MOVD R11, R0
RET
PCALIGN $16
head:
// R9 = R0 & 31; the Z flag is set when R0 is already 32-byte aligned.
ANDS $0x1f, R0, R9
BEQ chunk
// Work with not 32-byte aligned head
// R3 = (R0 rounded down to a multiple of 32) + 32, i.e. the next
// 32-byte boundary above R0.
BIC $0x1f, R0, R3
ADD $0x20, R3
PCALIGN $16
head_loop:
// Same byte-at-a-time compare as in tail, but the loop ends when the
// pointer reaches the alignment boundary rather than when R1 hits zero.
MOVBU.P 1(R0), R5
CMP R2.UXTB, R5
CINC EQ, R11, R11
// Keep the remaining length in step with the advancing pointer.
SUB $1, R1, R1
CMP R0, R3
BNE head_loop
chunk:
// R9 = R1 rounded down to a multiple of 32 = bytes covered by whole chunks.
BIC $0x1f, R1, R9
// The first chunk can also be the last
// (fewer than 32 bytes remain after the head, so finish byte by byte)
CBZ R9, tail
// R3 = end of 32-byte chunks
ADD R0, R9, R3
// V5 = 0x01 in every byte lane; used below to turn compare masks into 0/1.
MOVD $1, R5
VMOV R5, V5.B16
// R1 = length of tail
SUB R9, R1, R1
// Duplicate R2 (byte to search) to 16 1-byte elements of V0
VMOV R2, V0.B16
// Clear the low 64-bit element of V7 and V8
// (V7 receives each chunk's count, V8 is the running total)
VEOR V7.B8, V7.B8, V7.B8
VEOR V8.B8, V8.B8, V8.B8
PCALIGN $16
// Count the target byte in 32-byte chunk
chunk_loop:
// Load 32 bytes into V1 and V2 and advance R0; the loop relies on R0
// stepping by one full chunk so that it lands exactly on R3.
VLD1.P (R0), [V1.B16, V2.B16]
// Set the flags for the BNE at the bottom of the loop. Only vector
// instructions sit in between, so the flags are still valid there.
CMP R0, R3
// Each lane becomes 0xFF where the data byte equals the target, else 0x00.
VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16
// Clear the higher 7 bits
// (each lane is now 1 for a match and 0 otherwise)
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
// Count lanes match the requested byte
// Pairwise add folds the 32 0/1 lanes into 16 lanes, each holding 0..2.
VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
// Sum all 16 lanes of V6 into V7: the number of matches in this chunk.
VUADDLV V6.B16, V7
// Accumulate the count in low 64-bit element of V8 when inside the loop
VADD V7, V8
// Uses the flags from CMP R0, R3 above: loop until R0 reaches the end of the chunks.
BNE chunk_loop
// Fold the vector total into the scalar count.
VMOV V8.D[0], R6
ADD R6, R11, R11
// No leftover bytes: return. Otherwise finish them in the byte loop.
CBZ R1, done
B tail