Skip to content

Commit cfca343

Browse files
wdvxdr1123 nhooyr
authored and committed
mask.go: Use SIMD masking for amd64 and arm64
goos: windows goarch: amd64 pkg: nhooyr.io/websocket cpu: Intel(R) Core(TM) i5-9300H CPU @ 2.40GHz Benchmark_mask/2/basic-8 425339004 2.795 ns/op 715.66 MB/s Benchmark_mask/2/nhooyr-8 379937766 3.186 ns/op 627.78 MB/s Benchmark_mask/2/gorilla-8 392164167 3.071 ns/op 651.24 MB/s Benchmark_mask/2/gobwas-8 310037222 3.880 ns/op 515.46 MB/s Benchmark_mask/3/basic-8 321408024 3.806 ns/op 788.32 MB/s Benchmark_mask/3/nhooyr-8 350726338 3.478 ns/op 862.58 MB/s Benchmark_mask/3/gorilla-8 332217727 3.634 ns/op 825.43 MB/s Benchmark_mask/3/gobwas-8 247376214 4.886 ns/op 614.01 MB/s Benchmark_mask/4/basic-8 261182472 4.582 ns/op 872.91 MB/s Benchmark_mask/4/nhooyr-8 381830712 3.262 ns/op 1226.05 MB/s Benchmark_mask/4/gorilla-8 272616304 4.395 ns/op 910.04 MB/s Benchmark_mask/4/gobwas-8 204574558 5.855 ns/op 683.19 MB/s Benchmark_mask/8/basic-8 191330037 6.162 ns/op 1298.24 MB/s Benchmark_mask/8/nhooyr-8 369694992 3.285 ns/op 2435.65 MB/s Benchmark_mask/8/gorilla-8 175388466 6.743 ns/op 1186.48 MB/s Benchmark_mask/8/gobwas-8 241719933 4.886 ns/op 1637.45 MB/s Benchmark_mask/16/basic-8 100000000 10.92 ns/op 1464.83 MB/s Benchmark_mask/16/nhooyr-8 272565096 4.436 ns/op 3606.98 MB/s Benchmark_mask/16/gorilla-8 100000000 11.20 ns/op 1428.53 MB/s Benchmark_mask/16/gobwas-8 221356798 5.405 ns/op 2960.45 MB/s Benchmark_mask/32/basic-8 61476984 20.40 ns/op 1568.80 MB/s Benchmark_mask/32/nhooyr-8 238665572 5.050 ns/op 6337.22 MB/s Benchmark_mask/32/gorilla-8 100000000 12.09 ns/op 2647.28 MB/s Benchmark_mask/32/gobwas-8 186077235 6.477 ns/op 4940.36 MB/s Benchmark_mask/128/basic-8 14629720 80.90 ns/op 1582.19 MB/s Benchmark_mask/128/nhooyr-8 181241968 6.565 ns/op 19497.98 MB/s Benchmark_mask/128/gorilla-8 68308342 16.76 ns/op 7639.37 MB/s Benchmark_mask/128/gobwas-8 94582026 12.97 ns/op 9872.11 MB/s Benchmark_mask/512/basic-8 3921001 305.6 ns/op 1675.55 MB/s Benchmark_mask/512/nhooyr-8 123102199 9.721 ns/op 52669.11 MB/s Benchmark_mask/512/gorilla-8 32355914 38.18 ns/op 13411.43 MB/s 
Benchmark_mask/512/gobwas-8 31528501 37.80 ns/op 13544.37 MB/s Benchmark_mask/4096/basic-8 491804 2381 ns/op 1720.39 MB/s Benchmark_mask/4096/nhooyr-8 26159691 46.98 ns/op 87187.73 MB/s Benchmark_mask/4096/gorilla-8 4898440 243.6 ns/op 16817.89 MB/s Benchmark_mask/4096/gobwas-8 4336398 277.2 ns/op 14776.40 MB/s Benchmark_mask/16384/basic-8 113842 9623 ns/op 1702.66 MB/s Benchmark_mask/16384/nhooyr-8 8088847 154.5 ns/op 106058.18 MB/s Benchmark_mask/16384/gorilla-8 1282993 933.6 ns/op 17549.90 MB/s Benchmark_mask/16384/gobwas-8 997347 1086 ns/op 15093.49 MB/s We're about 4-5x faster than gorilla now.
1 parent 535fd2c commit cfca343

File tree

7 files changed

+257
-1
lines changed

7 files changed

+257
-1
lines changed

frame.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ func writeFrameHeader(h header, w *bufio.Writer, buf []byte) (err error) {
184184
// to be in little endian.
185185
//
186186
// See https://github.com/golang/go/issues/31586
187-
func mask(key uint32, b []byte) uint32 {
187+
func maskGo(key uint32, b []byte) uint32 {
188188
if len(b) >= 8 {
189189
key64 := uint64(key)<<32 | uint64(key)
190190

go.mod

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
module nhooyr.io/websocket
22

33
go 1.19
4+
5+
require golang.org/x/sys v0.13.0

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
2+
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=

mask_amd64.s

+152
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#include "textflag.h"
2+
3+
// func maskAsm(b *byte, len int, key uint32) uint32
4+
TEXT ·maskAsm(SB), NOSPLIT, $0-28
5+
// AX = b
6+
// CX = len (left length)
7+
// SI = key (uint32)
8+
// DI = uint64(SI) | uint64(SI)<<32
9+
MOVQ b+0(FP), AX
10+
MOVQ len+8(FP), CX
11+
MOVL key+16(FP), SI
12+
13+
// calculate the DI
14+
// DI = SI<<32 | SI
15+
MOVL SI, DI
16+
MOVQ DI, DX
17+
SHLQ $32, DI
18+
ORQ DX, DI
19+
20+
CMPQ CX, $15
21+
JLE less_than_16
22+
CMPQ CX, $63
23+
JLE less_than_64
24+
CMPQ CX, $128
25+
JLE sse
26+
TESTQ $31, AX
27+
JNZ unaligned
28+
29+
aligned:
30+
CMPB ·useAVX2(SB), $1
31+
JE avx2
32+
JMP sse
33+
34+
unaligned_loop_1byte:
35+
XORB SI, (AX)
36+
INCQ AX
37+
DECQ CX
38+
ROLL $24, SI
39+
TESTQ $7, AX
40+
JNZ unaligned_loop_1byte
41+
42+
// calculate DI again since SI was modified
43+
// DI = SI<<32 | SI
44+
MOVL SI, DI
45+
MOVQ DI, DX
46+
SHLQ $32, DI
47+
ORQ DX, DI
48+
49+
TESTQ $31, AX
50+
JZ aligned
51+
52+
unaligned:
53+
TESTQ $7, AX // AND $7 & b (the current address in AX); if not zero, jump to unaligned_loop_1byte.
54+
JNZ unaligned_loop_1byte
55+
56+
unaligned_loop:
57+
// we don't need to check the CX since we know it's above 128
58+
XORQ DI, (AX)
59+
ADDQ $8, AX
60+
SUBQ $8, CX
61+
TESTQ $31, AX
62+
JNZ unaligned_loop
63+
JMP aligned
64+
65+
avx2:
66+
CMPQ CX, $0x80
67+
JL sse
68+
VMOVQ DI, X0
69+
VPBROADCASTQ X0, Y0
70+
71+
avx2_loop:
72+
VPXOR (AX), Y0, Y1
73+
VPXOR 32(AX), Y0, Y2
74+
VPXOR 64(AX), Y0, Y3
75+
VPXOR 96(AX), Y0, Y4
76+
VMOVDQU Y1, (AX)
77+
VMOVDQU Y2, 32(AX)
78+
VMOVDQU Y3, 64(AX)
79+
VMOVDQU Y4, 96(AX)
80+
ADDQ $0x80, AX
81+
SUBQ $0x80, CX
82+
CMPQ CX, $0x80
83+
JAE avx2_loop // loop if CX >= 0x80
84+
85+
sse:
86+
CMPQ CX, $0x40
87+
JL less_than_64
88+
MOVQ DI, X0
89+
PUNPCKLQDQ X0, X0
90+
91+
sse_loop:
92+
MOVOU 0*16(AX), X1
93+
MOVOU 1*16(AX), X2
94+
MOVOU 2*16(AX), X3
95+
MOVOU 3*16(AX), X4
96+
PXOR X0, X1
97+
PXOR X0, X2
98+
PXOR X0, X3
99+
PXOR X0, X4
100+
MOVOU X1, 0*16(AX)
101+
MOVOU X2, 1*16(AX)
102+
MOVOU X3, 2*16(AX)
103+
MOVOU X4, 3*16(AX)
104+
ADDQ $0x40, AX
105+
SUBQ $0x40, CX
106+
CMPQ CX, $0x40
107+
JAE sse_loop
108+
109+
less_than_64:
110+
TESTQ $32, CX
111+
JZ less_than_32
112+
XORQ DI, (AX)
113+
XORQ DI, 8(AX)
114+
XORQ DI, 16(AX)
115+
XORQ DI, 24(AX)
116+
ADDQ $32, AX
117+
118+
less_than_32:
119+
TESTQ $16, CX
120+
JZ less_than_16
121+
XORQ DI, (AX)
122+
XORQ DI, 8(AX)
123+
ADDQ $16, AX
124+
125+
less_than_16:
126+
TESTQ $8, CX
127+
JZ less_than_8
128+
XORQ DI, (AX)
129+
ADDQ $8, AX
130+
131+
less_than_8:
132+
TESTQ $4, CX
133+
JZ less_than_4
134+
XORL SI, (AX)
135+
ADDQ $4, AX
136+
137+
less_than_4:
138+
TESTQ $2, CX
139+
JZ less_than_2
140+
XORW SI, (AX)
141+
ROLL $16, SI
142+
ADDQ $2, AX
143+
144+
less_than_2:
145+
TESTQ $1, CX
146+
JZ done
147+
XORB SI, (AX)
148+
ROLL $24, SI
149+
150+
done:
151+
MOVL SI, ret+24(FP)
152+
RET

mask_arm64.s

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#include "textflag.h"
2+
3+
// func maskAsm(b *byte, len int, key uint32) uint32
4+
TEXT ·maskAsm(SB), NOSPLIT, $0-28
5+
// R0 = b
6+
// R1 = len
7+
// R2 = uint64(key)<<32 | uint64(key)
8+
// R3 = key (uint32)
9+
MOVD b_ptr+0(FP), R0
10+
MOVD b_len+8(FP), R1
11+
MOVWU key+16(FP), R3
12+
MOVD R3, R2
13+
ORR R2<<32, R2, R2
14+
VDUP R2, V0.D2
15+
CMP $64, R1
16+
BLT less_than_64
17+
18+
// todo: optimize unaligned case
19+
loop_64:
20+
VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16]
21+
VEOR V1.B16, V0.B16, V1.B16
22+
VEOR V2.B16, V0.B16, V2.B16
23+
VEOR V3.B16, V0.B16, V3.B16
24+
VEOR V4.B16, V0.B16, V4.B16
25+
VST1.P [V1.B16, V2.B16, V3.B16, V4.B16], 64(R0)
26+
SUBS $64, R1
27+
CMP $64, R1
28+
BGE loop_64
29+
30+
less_than_64:
31+
// quick end
32+
CBZ R1, end
33+
TBZ $5, R1, less_than32
34+
VLD1 (R0), [V1.B16, V2.B16]
35+
VEOR V1.B16, V0.B16, V1.B16
36+
VEOR V2.B16, V0.B16, V2.B16
37+
VST1.P [V1.B16, V2.B16], 32(R0)
38+
39+
less_than32:
40+
TBZ $4, R1, less_than16
41+
LDP (R0), (R11, R12)
42+
EOR R11, R2, R11
43+
EOR R12, R2, R12
44+
STP.P (R11, R12), 16(R0)
45+
46+
less_than16:
47+
TBZ $3, R1, less_than8
48+
MOVD (R0), R11
49+
EOR R2, R11, R11
50+
MOVD.P R11, 8(R0)
51+
52+
less_than8:
53+
TBZ $2, R1, less_than4
54+
MOVWU (R0), R11
55+
EORW R2, R11, R11
56+
MOVWU.P R11, 4(R0)
57+
58+
less_than4:
59+
TBZ $1, R1, less_than2
60+
MOVHU (R0), R11
61+
EORW R3, R11, R11
62+
MOVHU.P R11, 2(R0)
63+
RORW $16, R3
64+
65+
less_than2:
66+
TBZ $0, R1, end
67+
MOVBU (R0), R11
68+
EORW R3, R11, R11
69+
MOVBU.P R11, 1(R0)
70+
RORW $8, R3
71+
72+
end:
73+
MOVWU R3, ret+24(FP)
74+
RET

mask_asm.go

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
//go:build !appengine && (amd64 || arm64)
2+
// +build !appengine
3+
// +build amd64 arm64
4+
5+
package websocket
6+
7+
import "golang.org/x/sys/cpu"
8+
9+
func mask(key uint32, b []byte) uint32 {
10+
if len(b) > 0 {
11+
return maskAsm(&b[0], len(b), key)
12+
}
13+
return key
14+
}
15+
16+
var useAVX2 = cpu.X86.HasAVX2
17+
18+
//go:noescape
19+
func maskAsm(b *byte, len int, key uint32) uint32

mask_generic.go

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
//go:build appengine || (!amd64 && !arm64 && !js)
2+
3+
package websocket
4+
5+
func mask(key uint32, b []byte) uint32 {
6+
return maskGo(key, b)
7+
}

0 commit comments

Comments
 (0)