From 2aeefc3f8a8174d7c23f7ec8e92ecebb47db0b49 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sun, 3 Sep 2023 18:26:04 +0200 Subject: [PATCH 01/95] ssh: add support for SSH_AGENT_CONSTRAIN_EXTENSION with id 255 it was changed in the following draft https://datatracker.ietf.org/doc/html/draft-miller-ssh-agent-03 The id 3 is now used for SSH_AGENT_CONSTRAIN_MAXSIGN key constraint, an OpenSSH extension to the protocol that we do not currently support. Instead, we added a compatibility layer for SSH_AGENT_CONSTRAIN_EXTENSION with ID 3. Fixes golang/go#62311 Change-Id: I421aee92aee9e693e43f66e6a5515c055333cb9b Reviewed-on: https://go-review.googlesource.com/c/crypto/+/525355 Reviewed-by: Matthew Dempsky Run-TryBot: Nicola Murino Reviewed-by: Filippo Valsorda TryBot-Result: Gopher Robot Reviewed-by: Than McIntosh --- ssh/agent/client.go | 13 +++++++++---- ssh/agent/server.go | 2 +- ssh/agent/server_test.go | 6 +++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ssh/agent/client.go b/ssh/agent/client.go index 9f09aae7dd..fecba8eb38 100644 --- a/ssh/agent/client.go +++ b/ssh/agent/client.go @@ -141,9 +141,14 @@ const ( agentAddSmartcardKeyConstrained = 26 // 3.7 Key constraint identifiers - agentConstrainLifetime = 1 - agentConstrainConfirm = 2 - agentConstrainExtension = 3 + agentConstrainLifetime = 1 + agentConstrainConfirm = 2 + // Constraint extension identifier up to version 2 of the protocol. A + // backward incompatible change will be required if we want to add support + // for SSH_AGENT_CONSTRAIN_MAXSIGN which uses the same ID. + agentConstrainExtensionV00 = 3 + // Constraint extension identifier in version 3 and later of the protocol. + agentConstrainExtension = 255 ) // maxAgentResponseBytes is the maximum agent reply size that is accepted. This @@ -205,7 +210,7 @@ type constrainLifetimeAgentMsg struct { } type constrainExtensionAgentMsg struct { - ExtensionName string `sshtype:"3"` + ExtensionName string `sshtype:"255|3"` ExtensionDetails []byte // Rest is a field used for parsing, not part of message diff --git a/ssh/agent/server.go b/ssh/agent/server.go index dd2e0a3e71..e35ca7ce31 100644 --- a/ssh/agent/server.go +++ b/ssh/agent/server.go @@ -208,7 +208,7 @@ func parseConstraints(constraints []byte) (lifetimeSecs uint32, confirmBeforeUse case agentConstrainConfirm: confirmBeforeUse = true constraints = constraints[1:] - case agentConstrainExtension: + case agentConstrainExtension, agentConstrainExtensionV00: var msg constrainExtensionAgentMsg if err = ssh.Unmarshal(constraints, &msg); err != nil { return 0, false, nil, err diff --git a/ssh/agent/server_test.go b/ssh/agent/server_test.go index 0af85457e4..7700d18f1a 100644 --- a/ssh/agent/server_test.go +++ b/ssh/agent/server_test.go @@ -243,7 +243,11 @@ func TestParseConstraints(t *testing.T) { ExtensionDetails: []byte(fmt.Sprintf("details: %d", i)), } expect = append(expect, ext) - data = append(data, agentConstrainExtension) + if i%2 == 0 { + data = append(data, agentConstrainExtension) + } else { + data = append(data, agentConstrainExtensionV00) + } data = append(data, ssh.Marshal(ext)...) } _, _, extensions, err := parseConstraints(data) From 8779cbd1c99580969b3c12f0c828ea52fbb9a78c Mon Sep 17 00:00:00 2001 From: Dmitri Shuralyov Date: Tue, 10 Oct 2023 17:49:56 -0400 Subject: [PATCH 02/95] all: update go directive to 1.18 Done with: go get go@1.18 go mod tidy go fix ./... Using go1.21.3. Also update avo to v0.5.0 in the curve25519/internal/field/_asm module. It's newer and produces no diff in the generated code. For golang/go#60268. Change-Id: I9bd771ee8561595d7f68aaca76df6e3e33d35013 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/534141 LUCI-TryBot-Result: Go LUCI Reviewed-by: Damien Neil Auto-Submit: Dmitri Shuralyov Reviewed-by: Dmitri Shuralyov --- acme/version_go112.go | 1 - argon2/blamka_amd64.go | 1 - argon2/blamka_amd64.s | 1 - argon2/blamka_ref.go | 1 - blake2b/blake2bAVX2_amd64.go | 1 - blake2b/blake2bAVX2_amd64.s | 1 - blake2b/blake2b_amd64.go | 1 - blake2b/blake2b_amd64.s | 1 - blake2b/blake2b_ref.go | 1 - blake2b/register.go | 1 - blake2s/blake2s_386.go | 1 - blake2s/blake2s_386.s | 1 - blake2s/blake2s_amd64.go | 1 - blake2s/blake2s_amd64.s | 1 - blake2s/blake2s_ref.go | 1 - blake2s/register.go | 1 - chacha20/chacha_arm64.go | 1 - chacha20/chacha_arm64.s | 1 - chacha20/chacha_noasm.go | 1 - chacha20/chacha_ppc64le.go | 1 - chacha20/chacha_ppc64le.s | 1 - chacha20/chacha_s390x.go | 1 - chacha20/chacha_s390x.s | 1 - chacha20poly1305/chacha20poly1305_amd64.go | 1 - chacha20poly1305/chacha20poly1305_amd64.s | 1 - chacha20poly1305/chacha20poly1305_noasm.go | 1 - curve25519/internal/field/_asm/go.mod | 14 +++-- curve25519/internal/field/_asm/go.sum | 61 +++++++++++++-------- curve25519/internal/field/fe_amd64.go | 1 - curve25519/internal/field/fe_amd64.s | 1 - curve25519/internal/field/fe_amd64_noasm.go | 1 - curve25519/internal/field/fe_arm64.go | 1 - curve25519/internal/field/fe_arm64.s | 1 - curve25519/internal/field/fe_arm64_noasm.go | 1 - go.mod | 2 +- go.sum | 33 ----------- internal/alias/alias.go | 1 - internal/alias/alias_purego.go | 1 - internal/poly1305/bits_compat.go | 1 - internal/poly1305/bits_go1.13.go | 1 - internal/poly1305/mac_noasm.go | 1 - internal/poly1305/sum_amd64.go | 1 - internal/poly1305/sum_amd64.s | 1 - internal/poly1305/sum_ppc64le.go | 1 - internal/poly1305/sum_ppc64le.s | 1 - internal/poly1305/sum_s390x.go | 1 - internal/poly1305/sum_s390x.s | 1 - internal/wycheproof/eddsa_test.go | 1 - ocsp/ocsp_test.go | 1 - otr/libotr_test_helper.c | 2 +- salsa20/salsa/salsa20_amd64.go | 1 - salsa20/salsa/salsa20_amd64.s | 1 - salsa20/salsa/salsa20_amd64_test.go | 1 - salsa20/salsa/salsa20_noasm.go | 1 - sha3/hashes_generic.go | 1 - sha3/keccakf.go | 1 - sha3/keccakf_amd64.go | 1 - sha3/keccakf_amd64.s | 1 - sha3/register.go | 1 - sha3/sha3_s390x.go | 1 - sha3/sha3_s390x.s | 1 - sha3/shake_generic.go | 1 - sha3/xor.go | 1 - sha3/xor_unaligned.go | 2 - ssh/test/agent_unix_test.go | 1 - ssh/test/banner_test.go | 1 - ssh/test/cert_test.go | 1 - ssh/test/dial_unix_test.go | 1 - ssh/test/forward_unix_test.go | 1 - ssh/test/multi_auth_test.go | 1 - ssh/test/session_test.go | 1 - ssh/test/sshd_test_pw.c | 2 +- ssh/test/test_unix_test.go | 1 - x509roots/gen_fallback_bundle.go | 1 - 74 files changed, 52 insertions(+), 131 deletions(-) diff --git a/acme/version_go112.go b/acme/version_go112.go index b9efdb59e5..cc5fab604b 100644 --- a/acme/version_go112.go +++ b/acme/version_go112.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.12 -// +build go1.12 package acme diff --git a/argon2/blamka_amd64.go b/argon2/blamka_amd64.go index a014ac92aa..063e7784f8 100644 --- a/argon2/blamka_amd64.go +++ b/argon2/blamka_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && gc && !purego -// +build amd64,gc,!purego package argon2 diff --git a/argon2/blamka_amd64.s b/argon2/blamka_amd64.s index b2cc051504..f3b653a12f 100644 --- a/argon2/blamka_amd64.s +++ b/argon2/blamka_amd64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && gc && !purego -// +build amd64,gc,!purego #include "textflag.h" diff --git a/argon2/blamka_ref.go b/argon2/blamka_ref.go index 167c59d2d5..16d58c650e 100644 --- a/argon2/blamka_ref.go +++ b/argon2/blamka_ref.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !amd64 || purego || !gc -// +build !amd64 purego !gc package argon2 diff --git a/blake2b/blake2bAVX2_amd64.go b/blake2b/blake2bAVX2_amd64.go index 56bfaaa17d..4f506f8791 100644 --- a/blake2b/blake2bAVX2_amd64.go +++ b/blake2b/blake2bAVX2_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.7 && amd64 && gc && !purego -// +build go1.7,amd64,gc,!purego package blake2b diff --git a/blake2b/blake2bAVX2_amd64.s b/blake2b/blake2bAVX2_amd64.s index 4b9daa18d9..353bb7cac5 100644 --- a/blake2b/blake2bAVX2_amd64.s +++ b/blake2b/blake2bAVX2_amd64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.7 && amd64 && gc && !purego -// +build go1.7,amd64,gc,!purego #include "textflag.h" diff --git a/blake2b/blake2b_amd64.go b/blake2b/blake2b_amd64.go index 5fa1b32841..1d0770abba 100644 --- a/blake2b/blake2b_amd64.go +++ b/blake2b/blake2b_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !go1.7 && amd64 && gc && !purego -// +build !go1.7,amd64,gc,!purego package blake2b diff --git a/blake2b/blake2b_amd64.s b/blake2b/blake2b_amd64.s index ae75eb9afc..adfac00c15 100644 --- a/blake2b/blake2b_amd64.s +++ b/blake2b/blake2b_amd64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && gc && !purego -// +build amd64,gc,!purego #include "textflag.h" diff --git a/blake2b/blake2b_ref.go b/blake2b/blake2b_ref.go index b0137cdf02..6e28668cd1 100644 --- a/blake2b/blake2b_ref.go +++ b/blake2b/blake2b_ref.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !amd64 || purego || !gc -// +build !amd64 purego !gc package blake2b diff --git a/blake2b/register.go b/blake2b/register.go index 9d8633963c..d9fcac3a4d 100644 --- a/blake2b/register.go +++ b/blake2b/register.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.9 -// +build go1.9 package blake2b diff --git a/blake2s/blake2s_386.go b/blake2s/blake2s_386.go index b4463fb4dc..97f629617e 100644 --- a/blake2s/blake2s_386.go +++ b/blake2s/blake2s_386.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build 386 && gc && !purego -// +build 386,gc,!purego package blake2s diff --git a/blake2s/blake2s_386.s b/blake2s/blake2s_386.s index 603d00ca32..919c026541 100644 --- a/blake2s/blake2s_386.s +++ b/blake2s/blake2s_386.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build 386 && gc && !purego -// +build 386,gc,!purego #include "textflag.h" diff --git a/blake2s/blake2s_amd64.go b/blake2s/blake2s_amd64.go index becdaa120f..8a7310254e 100644 --- a/blake2s/blake2s_amd64.go +++ b/blake2s/blake2s_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && gc && !purego -// +build amd64,gc,!purego package blake2s diff --git a/blake2s/blake2s_amd64.s b/blake2s/blake2s_amd64.s index e9df7a7c21..fe4b818a33 100644 --- a/blake2s/blake2s_amd64.s +++ b/blake2s/blake2s_amd64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && gc && !purego -// +build amd64,gc,!purego #include "textflag.h" diff --git a/blake2s/blake2s_ref.go b/blake2s/blake2s_ref.go index 799dba0c41..38ce8e283f 100644 --- a/blake2s/blake2s_ref.go +++ b/blake2s/blake2s_ref.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build (!amd64 && !386) || !gc || purego -// +build !amd64,!386 !gc purego package blake2s diff --git a/blake2s/register.go b/blake2s/register.go index ef79ff3c67..3156148a42 100644 --- a/blake2s/register.go +++ b/blake2s/register.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.9 -// +build go1.9 package blake2s diff --git a/chacha20/chacha_arm64.go b/chacha20/chacha_arm64.go index 5dfacbb983..661ea132e0 100644 --- a/chacha20/chacha_arm64.go +++ b/chacha20/chacha_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego package chacha20 diff --git a/chacha20/chacha_arm64.s b/chacha20/chacha_arm64.s index f1f66230d1..7dd2638e88 100644 --- a/chacha20/chacha_arm64.s +++ b/chacha20/chacha_arm64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego #include "textflag.h" diff --git a/chacha20/chacha_noasm.go b/chacha20/chacha_noasm.go index 02ff3d05e9..db42e6676a 100644 --- a/chacha20/chacha_noasm.go +++ b/chacha20/chacha_noasm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build (!arm64 && !s390x && !ppc64le) || !gc || purego -// +build !arm64,!s390x,!ppc64le !gc purego package chacha20 diff --git a/chacha20/chacha_ppc64le.go b/chacha20/chacha_ppc64le.go index da420b2e97..3a4287f990 100644 --- a/chacha20/chacha_ppc64le.go +++ b/chacha20/chacha_ppc64le.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego package chacha20 diff --git a/chacha20/chacha_ppc64le.s b/chacha20/chacha_ppc64le.s index 5c0fed26f8..66aebae258 100644 --- a/chacha20/chacha_ppc64le.s +++ b/chacha20/chacha_ppc64le.s @@ -20,7 +20,6 @@ // due to the calling conventions and initialization of constants. //go:build gc && !purego -// +build gc,!purego #include "textflag.h" diff --git a/chacha20/chacha_s390x.go b/chacha20/chacha_s390x.go index 4652247b8a..683ccfd1c3 100644 --- a/chacha20/chacha_s390x.go +++ b/chacha20/chacha_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego package chacha20 diff --git a/chacha20/chacha_s390x.s b/chacha20/chacha_s390x.s index f3ef5a019d..1eda91a3d4 100644 --- a/chacha20/chacha_s390x.s +++ b/chacha20/chacha_s390x.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego #include "go_asm.h" #include "textflag.h" diff --git a/chacha20poly1305/chacha20poly1305_amd64.go b/chacha20poly1305/chacha20poly1305_amd64.go index 0c408c5709..50695a14f6 100644 --- a/chacha20poly1305/chacha20poly1305_amd64.go +++ b/chacha20poly1305/chacha20poly1305_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego package chacha20poly1305 diff --git a/chacha20poly1305/chacha20poly1305_amd64.s b/chacha20poly1305/chacha20poly1305_amd64.s index 867c181a14..541d696b67 100644 --- a/chacha20poly1305/chacha20poly1305_amd64.s +++ b/chacha20poly1305/chacha20poly1305_amd64.s @@ -5,7 +5,6 @@ // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. //go:build gc && !purego -// +build gc,!purego #include "textflag.h" // General register allocation diff --git a/chacha20poly1305/chacha20poly1305_noasm.go b/chacha20poly1305/chacha20poly1305_noasm.go index f832b33d45..34e6ab1df8 100644 --- a/chacha20poly1305/chacha20poly1305_noasm.go +++ b/chacha20poly1305/chacha20poly1305_noasm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !amd64 || !gc || purego -// +build !amd64 !gc purego package chacha20poly1305 diff --git a/curve25519/internal/field/_asm/go.mod b/curve25519/internal/field/_asm/go.mod index 5b3dba315e..bf6dbc73cd 100644 --- a/curve25519/internal/field/_asm/go.mod +++ b/curve25519/internal/field/_asm/go.mod @@ -1,10 +1,16 @@ module asm -go 1.16 +go 1.18 require ( - github.com/mmcloughlin/avo v0.4.0 - golang.org/x/crypto v0.0.0 + github.com/mmcloughlin/avo v0.5.0 + golang.org/x/crypto v0.1.0 ) -replace golang.org/x/crypto v0.0.0 => ../../../.. +require ( + golang.org/x/mod v0.8.0 // indirect + golang.org/x/sys v0.13.0 // indirect + golang.org/x/tools v0.6.0 // indirect +) + +replace golang.org/x/crypto v0.1.0 => ../../../.. diff --git a/curve25519/internal/field/_asm/go.sum b/curve25519/internal/field/_asm/go.sum index 4c9bbf675b..3f57ad913c 100644 --- a/curve25519/internal/field/_asm/go.sum +++ b/curve25519/internal/field/_asm/go.sum @@ -1,34 +1,51 @@ -github.com/mmcloughlin/avo v0.4.0 h1:jeHDRktVD+578ULxWpQHkilor6pkdLF7u7EiTzDbfcU= -github.com/mmcloughlin/avo v0.4.0/go.mod h1:RW9BfYA3TgO9uCdNrKU2h6J8cPD8ZLznvfgHAeszb1s= -github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= +github.com/mmcloughlin/avo v0.5.0 h1:nAco9/aI9Lg2kiuROBY6BhCI/z0t5jEvJfjWbL8qXLU= +github.com/mmcloughlin/avo v0.5.0/go.mod h1:ChHFdoV7ql95Wi7vuq2YT1bwCJqiWdZrQ1im3VujLYM= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/arch v0.1.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo= -golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.6.0/go.mod h1:4mET923SAdbXp2ki8ey+zGs1SLqsuM2Y0uvdZR/fUNI= +golang.org/x/mod v0.8.0 h1:LUYupSeNrTNCGzR/hVBk2NHZO4hXcVaW1k4Qx7rjPx8= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021 h1:giLT+HuUP/gXYrG2Plg9WTjj4qhfgaW424ZIFog3rlk= -golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.7 h1:6j8CgantCy3yc8JGBqkDLMKWqZ0RDU2g1HVgacojGWQ= -golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.2.0/go.mod h1:y4OqIKeOV/fWJetJ8bXPU1sEVniLMIyDAZWeHdV+NTA= +golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/curve25519/internal/field/fe_amd64.go b/curve25519/internal/field/fe_amd64.go index edcf163c4e..70c541692c 100644 --- a/curve25519/internal/field/fe_amd64.go +++ b/curve25519/internal/field/fe_amd64.go @@ -1,7 +1,6 @@ // Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. //go:build amd64 && gc && !purego -// +build amd64,gc,!purego package field diff --git a/curve25519/internal/field/fe_amd64.s b/curve25519/internal/field/fe_amd64.s index 293f013c94..60817acc41 100644 --- a/curve25519/internal/field/fe_amd64.s +++ b/curve25519/internal/field/fe_amd64.s @@ -1,7 +1,6 @@ // Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. //go:build amd64 && gc && !purego -// +build amd64,gc,!purego #include "textflag.h" diff --git a/curve25519/internal/field/fe_amd64_noasm.go b/curve25519/internal/field/fe_amd64_noasm.go index ddb6c9b8f7..9da280d1d8 100644 --- a/curve25519/internal/field/fe_amd64_noasm.go +++ b/curve25519/internal/field/fe_amd64_noasm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !amd64 || !gc || purego -// +build !amd64 !gc purego package field diff --git a/curve25519/internal/field/fe_arm64.go b/curve25519/internal/field/fe_arm64.go index af459ef515..075fe9b925 100644 --- a/curve25519/internal/field/fe_arm64.go +++ b/curve25519/internal/field/fe_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm64 && gc && !purego -// +build arm64,gc,!purego package field diff --git a/curve25519/internal/field/fe_arm64.s b/curve25519/internal/field/fe_arm64.s index 5c91e45892..3126a43419 100644 --- a/curve25519/internal/field/fe_arm64.s +++ b/curve25519/internal/field/fe_arm64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm64 && gc && !purego -// +build arm64,gc,!purego #include "textflag.h" diff --git a/curve25519/internal/field/fe_arm64_noasm.go b/curve25519/internal/field/fe_arm64_noasm.go index 234a5b2e5d..fc029ac12d 100644 --- a/curve25519/internal/field/fe_arm64_noasm.go +++ b/curve25519/internal/field/fe_arm64_noasm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !arm64 || !gc || purego -// +build !arm64 !gc purego package field diff --git a/go.mod b/go.mod index 97cbcb6929..c5c2f6d9ea 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module golang.org/x/crypto -go 1.17 +go 1.18 require ( golang.org/x/net v0.10.0 // tagx:ignore diff --git a/go.sum b/go.sum index f46a99b380..b99025deda 100644 --- a/go.sum +++ b/go.sum @@ -1,41 +1,8 @@ -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/internal/alias/alias.go b/internal/alias/alias.go index 69c17f822b..551ff0c353 100644 --- a/internal/alias/alias.go +++ b/internal/alias/alias.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !purego -// +build !purego // Package alias implements memory aliasing tests. package alias diff --git a/internal/alias/alias_purego.go b/internal/alias/alias_purego.go index 4775b0a438..6fe61b5c6e 100644 --- a/internal/alias/alias_purego.go +++ b/internal/alias/alias_purego.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build purego -// +build purego // Package alias implements memory aliasing tests. package alias diff --git a/internal/poly1305/bits_compat.go b/internal/poly1305/bits_compat.go index 45b5c966b2..d33c8890fc 100644 --- a/internal/poly1305/bits_compat.go +++ b/internal/poly1305/bits_compat.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !go1.13 -// +build !go1.13 package poly1305 diff --git a/internal/poly1305/bits_go1.13.go b/internal/poly1305/bits_go1.13.go index ed52b3418a..495c1fa697 100644 --- a/internal/poly1305/bits_go1.13.go +++ b/internal/poly1305/bits_go1.13.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.13 -// +build go1.13 package poly1305 diff --git a/internal/poly1305/mac_noasm.go b/internal/poly1305/mac_noasm.go index f184b67d98..333da285b3 100644 --- a/internal/poly1305/mac_noasm.go +++ b/internal/poly1305/mac_noasm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build (!amd64 && !ppc64le && !s390x) || !gc || purego -// +build !amd64,!ppc64le,!s390x !gc purego package poly1305 diff --git a/internal/poly1305/sum_amd64.go b/internal/poly1305/sum_amd64.go index 6d522333f2..164cd47d32 100644 --- a/internal/poly1305/sum_amd64.go +++ b/internal/poly1305/sum_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego package poly1305 diff --git a/internal/poly1305/sum_amd64.s b/internal/poly1305/sum_amd64.s index 1d74f0f881..e0d3c64756 100644 --- a/internal/poly1305/sum_amd64.s +++ b/internal/poly1305/sum_amd64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego #include "textflag.h" diff --git a/internal/poly1305/sum_ppc64le.go b/internal/poly1305/sum_ppc64le.go index 4a069941a6..4aec4874b5 100644 --- a/internal/poly1305/sum_ppc64le.go +++ b/internal/poly1305/sum_ppc64le.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego package poly1305 diff --git a/internal/poly1305/sum_ppc64le.s b/internal/poly1305/sum_ppc64le.s index 58422aad23..d2ca5deeb9 100644 --- a/internal/poly1305/sum_ppc64le.s +++ b/internal/poly1305/sum_ppc64le.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego #include "textflag.h" diff --git a/internal/poly1305/sum_s390x.go b/internal/poly1305/sum_s390x.go index ec95966889..e1d033a491 100644 --- a/internal/poly1305/sum_s390x.go +++ b/internal/poly1305/sum_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego package poly1305 diff --git a/internal/poly1305/sum_s390x.s b/internal/poly1305/sum_s390x.s index aa9e0494c9..0fe3a7c217 100644 --- a/internal/poly1305/sum_s390x.s +++ b/internal/poly1305/sum_s390x.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego #include "textflag.h" diff --git a/internal/wycheproof/eddsa_test.go b/internal/wycheproof/eddsa_test.go index 118dfc043f..9b97490955 100644 --- a/internal/wycheproof/eddsa_test.go +++ b/internal/wycheproof/eddsa_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.13 -// +build go1.13 package wycheproof diff --git a/ocsp/ocsp_test.go b/ocsp/ocsp_test.go index 0bc194b2f4..a23d29bd1f 100644 --- a/ocsp/ocsp_test.go +++ b/ocsp/ocsp_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.7 -// +build go1.7 package ocsp diff --git a/otr/libotr_test_helper.c b/otr/libotr_test_helper.c index b3ca072d48..aae03a3df7 100644 --- a/otr/libotr_test_helper.c +++ b/otr/libotr_test_helper.c @@ -5,7 +5,7 @@ // This code can be compiled and used to test the otr package against libotr. // See otr_test.go. -// +build ignore +//go:build ignore #include #include diff --git a/salsa20/salsa/salsa20_amd64.go b/salsa20/salsa/salsa20_amd64.go index c400dfcf7b..e76b44fe59 100644 --- a/salsa20/salsa/salsa20_amd64.go +++ b/salsa20/salsa/salsa20_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && !purego && gc -// +build amd64,!purego,gc package salsa diff --git a/salsa20/salsa/salsa20_amd64.s b/salsa20/salsa/salsa20_amd64.s index c089277204..fcce0234b6 100644 --- a/salsa20/salsa/salsa20_amd64.s +++ b/salsa20/salsa/salsa20_amd64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && !purego && gc -// +build amd64,!purego,gc // This code was translated into a form compatible with 6a from the public // domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html diff --git a/salsa20/salsa/salsa20_amd64_test.go b/salsa20/salsa/salsa20_amd64_test.go index fc781f76f9..fe14604fc9 100644 --- a/salsa20/salsa/salsa20_amd64_test.go +++ b/salsa20/salsa/salsa20_amd64_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && !purego && gc -// +build amd64,!purego,gc package salsa diff --git a/salsa20/salsa/salsa20_noasm.go b/salsa20/salsa/salsa20_noasm.go index 4392cc1ac7..9448760f26 100644 --- a/salsa20/salsa/salsa20_noasm.go +++ b/salsa20/salsa/salsa20_noasm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !amd64 || purego || !gc -// +build !amd64 purego !gc package salsa diff --git a/sha3/hashes_generic.go b/sha3/hashes_generic.go index c74fc20fcb..fe8c84793c 100644 --- a/sha3/hashes_generic.go +++ b/sha3/hashes_generic.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !gc || purego || !s390x -// +build !gc purego !s390x package sha3 diff --git a/sha3/keccakf.go b/sha3/keccakf.go index e5faa375c0..ce48b1dd3e 100644 --- a/sha3/keccakf.go +++ b/sha3/keccakf.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !amd64 || purego || !gc -// +build !amd64 purego !gc package sha3 diff --git a/sha3/keccakf_amd64.go b/sha3/keccakf_amd64.go index 248a38241f..b908696be5 100644 --- a/sha3/keccakf_amd64.go +++ b/sha3/keccakf_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && !purego && gc -// +build amd64,!purego,gc package sha3 diff --git a/sha3/keccakf_amd64.s b/sha3/keccakf_amd64.s index 4cfa54383b..8fb26aebb2 100644 --- a/sha3/keccakf_amd64.s +++ b/sha3/keccakf_amd64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && !purego && gc -// +build amd64,!purego,gc // This code was translated into a form compatible with 6a from the public // domain sources at https://github.com/gvanas/KeccakCodePackage diff --git a/sha3/register.go b/sha3/register.go index 8b4453aac3..addfd5049b 100644 --- a/sha3/register.go +++ b/sha3/register.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.4 -// +build go1.4 package sha3 diff --git a/sha3/sha3_s390x.go b/sha3/sha3_s390x.go index ec26f147ff..d861bca528 100644 --- a/sha3/sha3_s390x.go +++ b/sha3/sha3_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego package sha3 diff --git a/sha3/sha3_s390x.s b/sha3/sha3_s390x.s index a0e051b045..826b862c77 100644 --- a/sha3/sha3_s390x.s +++ b/sha3/sha3_s390x.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc && !purego -// +build gc,!purego #include "textflag.h" diff --git a/sha3/shake_generic.go b/sha3/shake_generic.go index 5c0710ef98..8d31cf5be2 100644 --- a/sha3/shake_generic.go +++ b/sha3/shake_generic.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !gc || purego || !s390x -// +build !gc purego !s390x package sha3 diff --git a/sha3/xor.go b/sha3/xor.go index 59c8eb9418..7337cca88e 100644 --- a/sha3/xor.go +++ b/sha3/xor.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build (!amd64 && !386 && !ppc64le) || purego -// +build !amd64,!386,!ppc64le purego package sha3 diff --git a/sha3/xor_unaligned.go b/sha3/xor_unaligned.go index 1ce606246d..870e2d16e0 100644 --- a/sha3/xor_unaligned.go +++ b/sha3/xor_unaligned.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (amd64 || 386 || ppc64le) && !purego -// +build amd64 386 ppc64le -// +build !purego package sha3 diff --git a/ssh/test/agent_unix_test.go b/ssh/test/agent_unix_test.go index 43fbdb22eb..aeecaebb1d 100644 --- a/ssh/test/agent_unix_test.go +++ b/ssh/test/agent_unix_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris package test diff --git a/ssh/test/banner_test.go b/ssh/test/banner_test.go index 3bfdd4b059..dd17db5a87 100644 --- a/ssh/test/banner_test.go +++ b/ssh/test/banner_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris package test diff --git a/ssh/test/cert_test.go b/ssh/test/cert_test.go index 83dd534c5c..15c44fb58e 100644 --- a/ssh/test/cert_test.go +++ b/ssh/test/cert_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris package test diff --git a/ssh/test/dial_unix_test.go b/ssh/test/dial_unix_test.go index 4a7ec31737..0a5f5e395f 100644 --- a/ssh/test/dial_unix_test.go +++ b/ssh/test/dial_unix_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !windows && !js && !wasip1 -// +build !windows,!js,!wasip1 package test diff --git a/ssh/test/forward_unix_test.go b/ssh/test/forward_unix_test.go index 1171bc3a14..611122b9e8 100644 --- a/ssh/test/forward_unix_test.go +++ b/ssh/test/forward_unix_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris package test diff --git a/ssh/test/multi_auth_test.go b/ssh/test/multi_auth_test.go index 403d7363ab..14cf1cce12 100644 --- a/ssh/test/multi_auth_test.go +++ b/ssh/test/multi_auth_test.go @@ -15,7 +15,6 @@ // (for linux) in file ./sshd_test_pw.c. //go:build linux -// +build linux package test diff --git a/ssh/test/session_test.go b/ssh/test/session_test.go index 4745ed9dbb..69c8b86dd5 100644 --- a/ssh/test/session_test.go +++ b/ssh/test/session_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !windows && !js && !wasip1 -// +build !windows,!js,!wasip1 package test diff --git a/ssh/test/sshd_test_pw.c b/ssh/test/sshd_test_pw.c index 2794a563a4..1e48619994 100644 --- a/ssh/test/sshd_test_pw.c +++ b/ssh/test/sshd_test_pw.c @@ -20,7 +20,7 @@ // Run sshd: // LD_PRELOAD="sshd_test_pw.so" TEST_USER="..." TEST_PASSWD="..." sshd ... -// +build ignore +//go:build ignore #define _GNU_SOURCE #include diff --git a/ssh/test/test_unix_test.go b/ssh/test/test_unix_test.go index f3f55db128..8dbedb0b13 100644 --- a/ssh/test/test_unix_test.go +++ b/ssh/test/test_unix_test.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || plan9 || solaris -// +build aix darwin dragonfly freebsd linux netbsd openbsd plan9 solaris package test diff --git a/x509roots/gen_fallback_bundle.go b/x509roots/gen_fallback_bundle.go index c22d1b0c38..80b2044d22 100644 --- a/x509roots/gen_fallback_bundle.go +++ b/x509roots/gen_fallback_bundle.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build generate -// +build generate //go:generate go run gen_fallback_bundle.go From 1d5729261878487fa61508b204b9ad99e436f11b Mon Sep 17 00:00:00 2001 From: Dmitri Shuralyov Date: Fri, 20 Oct 2023 14:47:21 -0400 Subject: [PATCH 03/95] x509roots: check HTTP response status code and media type The HTTP response status code is expected to be 200 OK, and the certdata.txt file media type is expected to be plain text. Check that it is before proceeding with parsing it. Might help avoid repeats of CL 535735. Change-Id: I1a7896b3e20d33a23fdc53c572ae9700c9eae1ef Reviewed-on: https://go-review.googlesource.com/c/crypto/+/536717 LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Commit-Queue: Roland Shoemaker Reviewed-by: Roland Shoemaker Auto-Submit: Roland Shoemaker --- x509roots/gen_fallback_bundle.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/x509roots/gen_fallback_bundle.go b/x509roots/gen_fallback_bundle.go index 80b2044d22..ec3014ff63 100644 --- a/x509roots/gen_fallback_bundle.go +++ b/x509roots/gen_fallback_bundle.go @@ -17,6 +17,7 @@ import ( "go/format" "io" "log" + "mime" "net/http" "os" "sort" @@ -86,6 +87,16 @@ func main() { log.Fatalf("failed to request %q: %s", *certDataURL, err) } defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<10)) + log.Fatalf("got non-200 OK status code: %v body: %q", resp.Status, body) + } else if ct, want := resp.Header.Get("Content-Type"), `text/plain; charset="UTF-8"`; ct != want { + if mediaType, _, err := mime.ParseMediaType(ct); err != nil { + log.Fatalf("bad Content-Type header %q: %v", ct, err) + } else if mediaType != "text/plain" { + log.Fatalf("got media type %q, want %q", mediaType, "text/plain") + } + } certdata = resp.Body } From 4f3024555250c59d8b3c0da3f1cf1725c58f809c Mon Sep 17 00:00:00 2001 From: Roland Shoemaker Date: Wed, 18 Oct 2023 14:26:33 -0400 Subject: [PATCH 04/95] x509roots: catch the zero-roots case when generating the bundle If the parser returns zero roots, don't attempt to completely remove the bundle. This may happen if, i.e., the HTTP response is 200 but has no content. An example of this may be http://go.dev/cl/535735. Change-Id: I81fc2b49c8ec813cca17fd1c807296bfb053d992 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/536136 Reviewed-by: Damien Neil LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Auto-Submit: Roland Shoemaker --- x509roots/gen_fallback_bundle.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/x509roots/gen_fallback_bundle.go b/x509roots/gen_fallback_bundle.go index ec3014ff63..ffea49b1e8 100644 --- a/x509roots/gen_fallback_bundle.go +++ b/x509roots/gen_fallback_bundle.go @@ -105,6 +105,10 @@ func main() { log.Fatalf("failed to parse %q: %s", *certDataPath, err) } + if len(certs) == 0 { + log.Fatal("certdata.txt appears to contain zero roots") + } + sort.Slice(certs, func(i, j int) bool { // Sort based on the stringified subject (which may not be unique), and // break any ties by just sorting on the raw DER (which will be unique, From cf8dcb0f7d1e4e345ca9df755538650a5e9eb47c Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Wed, 28 Jun 2023 18:30:13 +0200 Subject: [PATCH 05/95] ssh: add test case against ssh CLI These tests try to ensure better compatibility of our server implementation with the ssh CLI. With these tests in place: 1) before merging CL 447757 we would have noticed that our server implementation was broken with OpenSSH 8.8+ 2) after merging CL 447757 we would have noticed that our server implementation was broken with OpenSSH 7.2-7.7 The ssh CLI from $PATH is used by default, but can be overridden using the SSH_CLI_PATH environment variable. Change-Id: I93d64be41c7613132b0364afac8397f57c2dcbca Reviewed-on: https://go-review.googlesource.com/c/crypto/+/506837 TryBot-Result: Gopher Robot Reviewed-by: Benny Siegert Reviewed-by: Han-Wen Nienhuys Run-TryBot: Nicola Murino --- ssh/test/server_test.go | 98 +++++++++++++++++++++++++++++++++++++++++ ssh/test/sshcli_test.go | 96 ++++++++++++++++++++++++++++++++++++++++ ssh/testdata/keys.go | 6 +++ 3 files changed, 200 insertions(+) create mode 100644 ssh/test/server_test.go create mode 100644 ssh/test/sshcli_test.go diff --git a/ssh/test/server_test.go b/ssh/test/server_test.go new file mode 100644 index 0000000000..e70241bc6e --- /dev/null +++ b/ssh/test/server_test.go @@ -0,0 +1,98 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package test + +import ( + "net" + + "golang.org/x/crypto/ssh" +) + +type exitStatusMsg struct { + Status uint32 +} + +// goServer is a test Go SSH server that accepts public key and certificate +// authentication and replies with a 0 exit status to any exec request without +// running any commands. +type goTestServer struct { + listener net.Listener + config *ssh.ServerConfig + done <-chan struct{} +} + +func newTestServer(config *ssh.ServerConfig) (*goTestServer, error) { + server := &goTestServer{ + config: config, + } + listener, err := net.Listen("tcp", "127.0.0.1:") + if err != nil { + return nil, err + } + server.listener = listener + done := make(chan struct{}, 1) + server.done = done + go server.acceptConnections(done) + + return server, nil +} + +func (s *goTestServer) port() (string, error) { + _, port, err := net.SplitHostPort(s.listener.Addr().String()) + return port, err +} + +func (s *goTestServer) acceptConnections(done chan<- struct{}) { + defer close(done) + + for { + c, err := s.listener.Accept() + if err != nil { + return + } + _, chans, reqs, err := ssh.NewServerConn(c, s.config) + if err != nil { + return + } + go ssh.DiscardRequests(reqs) + defer c.Close() + + for newChannel := range chans { + if newChannel.ChannelType() != "session" { + newChannel.Reject(ssh.UnknownChannelType, "unknown channel type") + continue + } + + channel, requests, err := newChannel.Accept() + if err != nil { + continue + } + + go func(in <-chan *ssh.Request) { + for req := range in { + ok := false + switch req.Type { + case "exec": + ok = true + go func() { + channel.SendRequest("exit-status", false, ssh.Marshal(&exitStatusMsg{Status: 0})) + channel.Close() + }() + } + if req.WantReply { + req.Reply(ok, nil) + } + } + }(requests) + } + } +} + +func (s *goTestServer) Close() error { + err := s.listener.Close() + // wait for the accept loop to exit + <-s.done + return err +} diff --git a/ssh/test/sshcli_test.go b/ssh/test/sshcli_test.go new file mode 100644 index 0000000000..d3b85d77e2 --- /dev/null +++ b/ssh/test/sshcli_test.go @@ -0,0 +1,96 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package test + +import ( + "bytes" + "fmt" + "os" + "os/exec" + "path/filepath" + "testing" + + "golang.org/x/crypto/internal/testenv" + "golang.org/x/crypto/ssh" + "golang.org/x/crypto/ssh/testdata" +) + +func sshClient(t *testing.T) string { + if testing.Short() { + t.Skip("Skipping test that executes OpenSSH in -short mode") + } + sshCLI := os.Getenv("SSH_CLI_PATH") + if sshCLI == "" { + sshCLI = "ssh" + } + var err error + sshCLI, err = exec.LookPath(sshCLI) + if err != nil { + t.Skipf("Can't find an ssh(1) client to test against: %v", err) + } + return sshCLI +} + +func TestSSHCLIAuth(t *testing.T) { + sshCLI := sshClient(t) + dir := t.TempDir() + keyPrivPath := filepath.Join(dir, "rsa") + + for fn, content := range map[string][]byte{ + keyPrivPath: testdata.PEMBytes["rsa"], + keyPrivPath + ".pub": ssh.MarshalAuthorizedKey(testPublicKeys["rsa"]), + filepath.Join(dir, "rsa-cert.pub"): testdata.SSHCertificates["rsa-user-testcertificate"], + } { + if err := os.WriteFile(fn, content, 0600); err != nil { + t.Fatalf("WriteFile(%q): %v", fn, err) + } + } + + certChecker := ssh.CertChecker{ + IsUserAuthority: func(k ssh.PublicKey) bool { + return bytes.Equal(k.Marshal(), testPublicKeys["ca"].Marshal()) + }, + UserKeyFallback: func(conn ssh.ConnMetadata, key ssh.PublicKey) (*ssh.Permissions, error) { + if conn.User() == "testpubkey" && bytes.Equal(key.Marshal(), testPublicKeys["rsa"].Marshal()) { + return nil, nil + } + + return nil, fmt.Errorf("pubkey for %q not acceptable", conn.User()) + }, + } + + config := &ssh.ServerConfig{ + PublicKeyCallback: certChecker.Authenticate, + } + config.AddHostKey(testSigners["rsa"]) + + server, err := newTestServer(config) + if err != nil { + t.Fatalf("unable to start test server: %v", err) + } + defer server.Close() + + port, err := server.port() + if err != nil { + t.Fatalf("unable to get server port: %v", err) + } + + // test public key authentication. + cmd := testenv.Command(t, sshCLI, "-vvv", "-i", keyPrivPath, "-o", "StrictHostKeyChecking=no", + "-p", port, "testpubkey@127.0.0.1", "true") + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("public key authentication failed, error: %v, command output %q", err, string(out)) + } + // Test SSH user certificate authentication. + // The username must match one of the principals included in the certificate. + // The certificate "rsa-user-testcertificate" has "testcertificate" as principal. + cmd = testenv.Command(t, sshCLI, "-vvv", "-i", keyPrivPath, "-o", "StrictHostKeyChecking=no", + "-p", port, "testcertificate@127.0.0.1", "true") + out, err = cmd.CombinedOutput() + if err != nil { + t.Fatalf("user certificate authentication failed, error: %v, command output %q", err, string(out)) + } +} diff --git a/ssh/testdata/keys.go b/ssh/testdata/keys.go index ad95a81929..69f9eef4de 100644 --- a/ssh/testdata/keys.go +++ b/ssh/testdata/keys.go @@ -195,6 +195,12 @@ var SSHCertificates = map[string][]byte{ "rsa-sha2-256": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgOyK28gunJkM60qp4EbsYAjgbUsyjS8u742OLjipIgc0AAAADAQABAAAAgQC8A6FGHDiWCSREAXCq6yBfNVr0xCVG2CzvktFNRpue+RXrGs/2a6ySEJQb3IYquw7HlJgu6fg3WIWhOmHCjfpG0PrL4CRwbqQ2LaPPXhJErWYejcD8Di00cF3677+G10KMZk9RXbmHtuBFZT98wxg8j+ZsBMqGM1+7yrWUvynswQAAAAAAAAAAAAAAAgAAABRob3N0LmV4YW1wbGUuY29tLWtleQAAABQAAAAQaG9zdC5leGFtcGxlLmNvbQAAAABeSMJ4AAAAAHBPBLwAAAAAAAAAAAAAAAAAAAEXAAAAB3NzaC1yc2EAAAADAQABAAABAQC+D11D0hEbn2Vglv4YRJ8pZNyHjIGmvth3DWOQrq++2vH2MujmGQDxfr4SVE9GpMBlKU3lwGbpgIBxAg6yZcNSfo6PWVU9ACg6NMFO+yMzc2MaG+/naQdNjSewywF5j2rkNO2XOaViRVSrZroe2B/aY2LTV0jDl8nu5NOjwRs1/s7SLe5z1rw/X0dpmXk0qJY3gQhmR8HZZ1dhEkJUGwaBCPd0T8asSYf1Ag2rUD4aQ28r3q69mbwfWOOa6rMemVZruUV5dzHwVNVNtVv+ImtnYtz8m8g+K0plaGptHn3KsaOnASkh3tujhaE7kvc4HR9Igli9+76jhZie3h/dTN5zAAABFAAAAAxyc2Etc2hhMi0yNTYAAAEAbG4De/+QiqopPS3O1H7ySeEUCY56qmdgr02sFErnihdXPDaWXUXxacvJHaEtLrSTSaPL/3v3iKvjLWDOHaQ5c+cN9J7Tqzso7RQCXZD2nK9bwCUyBoiDyBCRe8w4DQEtfL5okpVzQsSAiojQ8hBohMOpy3gFfXrdm4PVC1ZKqlZh4fAc7ajieRq/Tpq2xOLdHwxkcgPNR83WVHva6K9/xjev/5n227/gkHo0qbGs8YYDOFXIEhENi+B23IzxdNVieWdyQpYpe0C2i95Jhyo0wJmaFY2ArruTS+D1jGQQpMPvAQRy26/A5hI83GLhpwyhrN/M8wCxzAhyPL6Ieuh5tQ== host.example.com `), "rsa-sha2-512": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgFGv4IpXfs4L/Y0b3rmUdPFhWoUrVnXuPxXr6aHGs7wgAAAADAQABAAAAgQC8A6FGHDiWCSREAXCq6yBfNVr0xCVG2CzvktFNRpue+RXrGs/2a6ySEJQb3IYquw7HlJgu6fg3WIWhOmHCjfpG0PrL4CRwbqQ2LaPPXhJErWYejcD8Di00cF3677+G10KMZk9RXbmHtuBFZT98wxg8j+ZsBMqGM1+7yrWUvynswQAAAAAAAAAAAAAAAgAAABRob3N0LmV4YW1wbGUuY29tLWtleQAAABQAAAAQaG9zdC5leGFtcGxlLmNvbQAAAABeSMRYAAAAAHBPBp4AAAAAAAAAAAAAAAAAAAEXAAAAB3NzaC1yc2EAAAADAQABAAABAQC+D11D0hEbn2Vglv4YRJ8pZNyHjIGmvth3DWOQrq++2vH2MujmGQDxfr4SVE9GpMBlKU3lwGbpgIBxAg6yZcNSfo6PWVU9ACg6NMFO+yMzc2MaG+/naQdNjSewywF5j2rkNO2XOaViRVSrZroe2B/aY2LTV0jDl8nu5NOjwRs1/s7SLe5z1rw/X0dpmXk0qJY3gQhmR8HZZ1dhEkJUGwaBCPd0T8asSYf1Ag2rUD4aQ28r3q69mbwfWOOa6rMemVZruUV5dzHwVNVNtVv+ImtnYtz8m8g+K0plaGptHn3KsaOnASkh3tujhaE7kvc4HR9Igli9+76jhZie3h/dTN5zAAABFAAAAAxyc2Etc2hhMi01MTIAAAEAnF4fVj6mm+UFeNCIf9AKJCv9WzymjjPvzzmaMWWkPWqoV0P0m5SiYfvbY9SbA73Blpv8SOr0DmpublF183kodREia4KyVuC8hLhSCV2Y16hy9MBegOZMepn80w+apj7Rn9QCz5OfEakDdztp6OWTBtqxnZFcTQ4XrgFkNWeWRElGdEvAVNn2WHwHi4EIdz0mdv48Imv5SPlOuW862ZdFG4Do1dUfDIiGsBofLlgcyIYlf+eNHul6sBeUkuwFxisMpI5DQzNp8PX1g/QJA2wzwT674PTqDXNttKjyh50Fdr4sXxm9Gz1+jVLoESvFNa55ERdSyAqNu4wTy11MZsWwSA== host.example.com +`), + // Assumes "ca" key above in file named "ca", sign a user cert for "rsa.pub" + // using "testcertificate" as principal: + // + // ssh-keygen -s ca -I username -n testcertificate rsa.pub + "rsa-user-testcertificate": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgr0MnhSf+KkQWEQmweJOGsJfOrUt80pQZDaI9YiwSfDUAAAADAQABAAAAgQC8A6FGHDiWCSREAXCq6yBfNVr0xCVG2CzvktFNRpue+RXrGs/2a6ySEJQb3IYquw7HlJgu6fg3WIWhOmHCjfpG0PrL4CRwbqQ2LaPPXhJErWYejcD8Di00cF3677+G10KMZk9RXbmHtuBFZT98wxg8j+ZsBMqGM1+7yrWUvynswQAAAAAAAAAAAAAAAQAAAAh1c2VybmFtZQAAABMAAAAPdGVzdGNlcnRpZmljYXRlAAAAAAAAAAD//////////wAAAAAAAACCAAAAFXBlcm1pdC1YMTEtZm9yd2FyZGluZwAAAAAAAAAXcGVybWl0LWFnZW50LWZvcndhcmRpbmcAAAAAAAAAFnBlcm1pdC1wb3J0LWZvcndhcmRpbmcAAAAAAAAACnBlcm1pdC1wdHkAAAAAAAAADnBlcm1pdC11c2VyLXJjAAAAAAAAAAAAAAEXAAAAB3NzaC1yc2EAAAADAQABAAABAQC+D11D0hEbn2Vglv4YRJ8pZNyHjIGmvth3DWOQrq++2vH2MujmGQDxfr4SVE9GpMBlKU3lwGbpgIBxAg6yZcNSfo6PWVU9ACg6NMFO+yMzc2MaG+/naQdNjSewywF5j2rkNO2XOaViRVSrZroe2B/aY2LTV0jDl8nu5NOjwRs1/s7SLe5z1rw/X0dpmXk0qJY3gQhmR8HZZ1dhEkJUGwaBCPd0T8asSYf1Ag2rUD4aQ28r3q69mbwfWOOa6rMemVZruUV5dzHwVNVNtVv+ImtnYtz8m8g+K0plaGptHn3KsaOnASkh3tujhaE7kvc4HR9Igli9+76jhZie3h/dTN5zAAABFAAAAAxyc2Etc2hhMi01MTIAAAEAFuA+67KvnlmcodIp0Lv4mR9UW/CHghAaN1csBJTkI8mx3wXKyIPTsS2uXboEhWD0a7S9gps2SEwC5m6E3kV2Rzg7aH1S03GZqMvVlK2VHe7fzuoW2yOKk6yEPjeTF0pKCFbUQ6mce8pRpD/zdvjG0Z287XM3c3Axlrn7qq7TS0MDTjEZ/dsUNFHxep3co/HuAsWVWPVDItr/FPnvZ6WVH1yc8N/AJn0gLHobkGgug22vI9rNIge1wrnXxU9BUSouzkau/PQsrCQapnn+I1H7HaQt0wdk45nxMP+ags+HRI9qpX/p8WDn6+zpqYqN/nfw2aoytyaJqhsV32Teuqtrgg== rsa.pub `), } From 74c2ba9521f1b7082a53cd34a0c4a01284cbd99b Mon Sep 17 00:00:00 2001 From: Eric Lagergren Date: Fri, 12 Aug 2022 10:22:33 -0700 Subject: [PATCH 06/95] crypto/hkdf: remove useless call to Reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HKDF is commonly used to read keys that are the the same length (or smaller) than the size of the hash digest, which means the loop inside Read only runs once. In that case, calling Reset is unnecesssary overhead. name old time/op new time/op delta 16ByteMD5Single-8 1.39µs ± 1% 1.22µs ± 0% -11.95% (p=0.000 n=10+9) 20ByteSHA1Single-8 826ns ± 0% 746ns ± 0% -9.70% (p=0.000 n=9+10) 32ByteSHA256Single-8 838ns ± 1% 744ns ± 0% -11.29% (p=0.000 n=10+10) 64ByteSHA512Single-8 5.12µs ± 0% 4.57µs ± 0% -10.78% (p=0.000 n=8+10) 8ByteMD5Stream-8 137ns ± 0% 138ns ± 0% +0.27% (p=0.009 n=9+6) 16ByteMD5Stream-8 264ns ± 0% 265ns ± 0% +0.29% (p=0.000 n=10+10) 8ByteSHA1Stream-8 64.1ns ± 0% 64.4ns ± 0% +0.60% (p=0.000 n=9+9) 20ByteSHA1Stream-8 145ns ± 0% 146ns ± 1% +0.69% (p=0.000 n=9+10) 8ByteSHA256Stream-8 42.9ns ± 1% 43.1ns ± 0% +0.48% (p=0.005 n=10+10) 32ByteSHA256Stream-8 151ns ± 0% 152ns ± 0% +0.35% (p=0.006 n=10+8) 8ByteSHA512Stream-8 139ns ± 0% 139ns ± 0% +0.08% (p=0.035 n=9+10) 64ByteSHA512Stream-8 1.07µs ± 0% 1.07µs ± 0% +0.33% (p=0.000 n=9+10) name old speed new speed delta 16ByteMD5Single-8 11.6MB/s ± 0% 13.1MB/s ± 0% +13.50% (p=0.000 n=9+9) 20ByteSHA1Single-8 24.2MB/s ± 0% 26.8MB/s ± 0% +10.75% (p=0.000 n=9+10) 32ByteSHA256Single-8 38.2MB/s ± 1% 43.0MB/s ± 0% +12.72% (p=0.000 n=10+10) 64ByteSHA512Single-8 12.5MB/s ± 0% 14.0MB/s ± 0% +12.06% (p=0.000 n=8+10) 8ByteMD5Stream-8 58.2MB/s ± 0% 58.1MB/s ± 0% -0.27% (p=0.004 n=9+9) 16ByteMD5Stream-8 60.6MB/s ± 0% 60.5MB/s ± 0% -0.27% (p=0.000 n=9+10) 8ByteSHA1Stream-8 125MB/s ± 0% 124MB/s ± 0% -0.59% (p=0.000 n=9+9) 20ByteSHA1Stream-8 138MB/s ± 0% 137MB/s ± 1% -0.69% (p=0.000 n=9+10) 8ByteSHA256Stream-8 186MB/s ± 1% 185MB/s ± 0% -0.47% (p=0.005 n=10+10) 32ByteSHA256Stream-8 211MB/s ± 0% 211MB/s ± 0% -0.37% (p=0.003 n=10+8) 8ByteSHA512Stream-8 57.4MB/s ± 1% 57.4MB/s ± 0% ~ (p=0.137 n=10+10) 64ByteSHA512Stream-8 59.9MB/s ± 0% 59.7MB/s ± 0% -0.33% (p=0.000 n=9+10) name old alloc/op new alloc/op delta 16ByteMD5Single-8 1.17kB ± 0% 0.98kB ± 0% -16.42% (p=0.000 n=10+10) 20ByteSHA1Single-8 1.25kB ± 0% 1.06kB ± 0% -15.37% (p=0.000 n=10+10) 32ByteSHA256Single-8 1.36kB ± 0% 1.14kB ± 0% -16.46% (p=0.000 n=10+10) 64ByteSHA512Single-8 2.26kB ± 0% 1.84kB ± 0% -18.43% (p=0.000 n=10+10) 8ByteMD5Stream-8 2.00B ± 0% 2.00B ± 0% ~ (all equal) 16ByteMD5Stream-8 5.00B ± 0% 5.00B ± 0% ~ (all equal) 8ByteSHA1Stream-8 2.00B ± 0% 2.00B ± 0% ~ (all equal) 20ByteSHA1Stream-8 5.00B ± 0% 5.00B ± 0% ~ (all equal) 8ByteSHA256Stream-8 1.00B ± 0% 1.00B ± 0% ~ (all equal) 32ByteSHA256Stream-8 6.00B ± 0% 6.00B ± 0% ~ (all equal) 8ByteSHA512Stream-8 1.00B ± 0% 1.00B ± 0% ~ (all equal) 64ByteSHA512Stream-8 9.00B ± 0% 9.00B ± 0% ~ (all equal) name old allocs/op new allocs/op delta 16ByteMD5Single-8 16.0 ± 0% 14.0 ± 0% -12.50% (p=0.000 n=10+10) 20ByteSHA1Single-8 16.0 ± 0% 14.0 ± 0% -12.50% (p=0.000 n=10+10) 32ByteSHA256Single-8 16.0 ± 0% 14.0 ± 0% -12.50% (p=0.000 n=10+10) 64ByteSHA512Single-8 16.0 ± 0% 14.0 ± 0% -12.50% (p=0.000 n=10+10) 8ByteMD5Stream-8 0.00 0.00 ~ (all equal) 16ByteMD5Stream-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) 8ByteSHA1Stream-8 0.00 0.00 ~ (all equal) 20ByteSHA1Stream-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) 8ByteSHA256Stream-8 0.00 0.00 ~ (all equal) 32ByteSHA256Stream-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) 8ByteSHA512Stream-8 0.00 0.00 ~ (all equal) 64ByteSHA512Stream-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) Change-Id: I08e5deb7045ce9aa63428e0ac7ae6140226414b4 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/423414 Reviewed-by: Roland Shoemaker TryBot-Result: Gopher Robot Run-TryBot: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Tatiana Bradley --- hkdf/hkdf.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hkdf/hkdf.go b/hkdf/hkdf.go index dda3f143be..f4ded5fee2 100644 --- a/hkdf/hkdf.go +++ b/hkdf/hkdf.go @@ -56,7 +56,9 @@ func (f *hkdf) Read(p []byte) (int, error) { // Fill the rest of the buffer for len(p) > 0 { - f.expander.Reset() + if f.counter > 1 { + f.expander.Reset() + } f.expander.Write(f.prev) f.expander.Write(f.info) f.expander.Write([]byte{f.counter}) From df0bc9e1bb5eaba2300400ba5294aae0a10b6a54 Mon Sep 17 00:00:00 2001 From: Mauri de Souza Meneguzzo Date: Tue, 7 Nov 2023 18:34:49 +0000 Subject: [PATCH 07/95] chacha20poly1305: guard PSHUFB usage with GOAMD64_v2 The PSHUFB instruction is part of SSE which is only v2+ but it is being used without the GOAMD64_v2 guard. The ROL macros were copied from CL 516859 that adds internal/chacha8rand. Fixes golang/go#63871 Change-Id: I3c8ba75ff284cda4fc788885643246936e617b85 GitHub-Last-Rev: e235e8eae67f16b3a58817cfdff729693faf2665 GitHub-Pull-Request: golang/crypto#275 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/538786 Reviewed-by: Keith Randall Run-TryBot: Mauri de Souza Meneguzzo TryBot-Result: Gopher Robot Reviewed-by: Keith Randall Reviewed-by: Russ Cox --- chacha20poly1305/chacha20poly1305_amd64.s | 24 +++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/chacha20poly1305/chacha20poly1305_amd64.s b/chacha20poly1305/chacha20poly1305_amd64.s index 541d696b67..731d2ac6db 100644 --- a/chacha20poly1305/chacha20poly1305_amd64.s +++ b/chacha20poly1305/chacha20poly1305_amd64.s @@ -183,11 +183,31 @@ GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 + // Some macros + +// ROL rotates the uint32s in register R left by N bits, using temporary T. +#define ROL(N, R, T) \ + MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R + +// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. +#ifdef GOAMD64_v2 +#define ROL16(R, T) PSHUFB ·rol16<>(SB), R +#else +#define ROL16(R, T) ROL(16, R, T) +#endif + +// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. +#ifdef GOAMD64_v2 +#define ROL8(R, T) PSHUFB ·rol8<>(SB), R +#else +#define ROL8(R, T) ROL(8, R, T) +#endif + #define chachaQR(A, B, C, D, T) \ - PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \ + PADDD B, A; PXOR A, D; ROL16(D, T) \ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ - PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \ + PADDD B, A; PXOR A, D; ROL8(D, T) \ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B #define chachaQR_AVX2(A, B, C, D, T) \ From e668aa9b451cd0866ba1c81c26309815c020c61f Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Wed, 8 Nov 2023 02:52:58 +0000 Subject: [PATCH 08/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: Ie3a6928d78080ff7d75d6813b1760f1317baa081 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/540537 Auto-Submit: Gopher Robot Reviewed-by: Heschi Kreinick LUCI-TryBot-Result: Go LUCI Reviewed-by: Carlos Amedee --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index c5c2f6d9ea..90eec5425c 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.10.0 // tagx:ignore - golang.org/x/sys v0.13.0 - golang.org/x/term v0.13.0 + golang.org/x/sys v0.14.0 + golang.org/x/term v0.14.0 ) -require golang.org/x/text v0.13.0 // indirect +require golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index b99025deda..49ce5c4aee 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= -golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= -golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= -golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q= +golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.14.0 h1:LGK9IlZ8T9jvdy6cTdfKUCltatMFOehAQo9SRC46UQ8= +golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From 42c83fffffc70640068263e765db9c9b09cd2ba2 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Tue, 31 Oct 2023 18:02:46 +0100 Subject: [PATCH 09/95] ssh: try harder to detect incorrect passwords for legacy PEM encryption Because of deficiencies in the format, DecryptPEMBlock does not always detect an incorrect password. In these cases decrypted DER bytes is random noise. If the parsing of the key returns an asn1.StructuralError we return x509.IncorrectPasswordError. Fixes golang/go#62265 Change-Id: Ib8b845f2bd01662c1f1421d35859a32ac5b78da7 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/538835 Reviewed-by: Heschi Kreinick Reviewed-by: Filippo Valsorda Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Cherry Mui --- ssh/keys.go | 19 +++++++++++++++---- ssh/keys_test.go | 11 +++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/ssh/keys.go b/ssh/keys.go index ef1bad731b..df4ebdada5 100644 --- a/ssh/keys.go +++ b/ssh/keys.go @@ -1232,16 +1232,27 @@ func ParseRawPrivateKeyWithPassphrase(pemBytes, passphrase []byte) (interface{}, return nil, fmt.Errorf("ssh: cannot decode encrypted private keys: %v", err) } + var result interface{} + switch block.Type { case "RSA PRIVATE KEY": - return x509.ParsePKCS1PrivateKey(buf) + result, err = x509.ParsePKCS1PrivateKey(buf) case "EC PRIVATE KEY": - return x509.ParseECPrivateKey(buf) + result, err = x509.ParseECPrivateKey(buf) case "DSA PRIVATE KEY": - return ParseDSAPrivateKey(buf) + result, err = ParseDSAPrivateKey(buf) default: - return nil, fmt.Errorf("ssh: unsupported key type %q", block.Type) + err = fmt.Errorf("ssh: unsupported key type %q", block.Type) } + // Because of deficiencies in the format, DecryptPEMBlock does not always + // detect an incorrect password. In these cases decrypted DER bytes is + // random noise. If the parsing of the key returns an asn1.StructuralError + // we return x509.IncorrectPasswordError. + if _, ok := err.(asn1.StructuralError); ok { + return nil, x509.IncorrectPasswordError + } + + return result, err } // ParseDSAPrivateKey returns a DSA private key from its ASN.1 DER encoding, as diff --git a/ssh/keys_test.go b/ssh/keys_test.go index 76d2338f43..3e18488f34 100644 --- a/ssh/keys_test.go +++ b/ssh/keys_test.go @@ -16,6 +16,7 @@ import ( "encoding/base64" "encoding/hex" "encoding/pem" + "errors" "fmt" "io" "reflect" @@ -221,6 +222,16 @@ func TestParseEncryptedPrivateKeysWithPassphrase(t *testing.T) { } } +func TestParseEncryptedPrivateKeysWithIncorrectPassphrase(t *testing.T) { + pem := testdata.PEMEncryptedKeys[0].PEMBytes + for i := 0; i < 4096; i++ { + _, err := ParseRawPrivateKeyWithPassphrase(pem, []byte(fmt.Sprintf("%d", i))) + if !errors.Is(err, x509.IncorrectPasswordError) { + t.Fatalf("expected error: %v, got: %v", x509.IncorrectPasswordError, err) + } + } +} + func TestParseDSA(t *testing.T) { // We actually exercise the ParsePrivateKey codepath here, as opposed to // using the ParseRawPrivateKey+NewSignerFromKey path that testdata_test.go From eb61739cd99fb244c7cd188d3c5bae54824e781d Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Tue, 18 Jul 2023 19:01:21 +0200 Subject: [PATCH 10/95] ssh: allow to configure public key auth algorithms on the server side Fixes golang/go#61244 Change-Id: I29b43e379cf0cdb07b0d6935666491b997157e73 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/510775 TryBot-Result: Gopher Robot Reviewed-by: Bryan Mills Commit-Queue: Nicola Murino Run-TryBot: Nicola Murino Auto-Submit: Nicola Murino Reviewed-by: Han-Wen Nienhuys --- ssh/common.go | 3 -- ssh/handshake.go | 7 ++++ ssh/server.go | 20 +++++++++-- ssh/server_test.go | 85 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 5 deletions(-) create mode 100644 ssh/server_test.go diff --git a/ssh/common.go b/ssh/common.go index b419c761ed..dd2ab0d69a 100644 --- a/ssh/common.go +++ b/ssh/common.go @@ -10,7 +10,6 @@ import ( "fmt" "io" "math" - "strings" "sync" _ "crypto/sha1" @@ -140,8 +139,6 @@ var supportedPubKeyAuthAlgos = []string{ KeyAlgoDSA, } -var supportedPubKeyAuthAlgosList = strings.Join(supportedPubKeyAuthAlgos, ",") - // unexpectedMessageError results when the SSH message that we received didn't // match what we wanted. func unexpectedMessageError(expected, got uint8) error { diff --git a/ssh/handshake.go b/ssh/handshake.go index 70a7369ff9..49bbba7692 100644 --- a/ssh/handshake.go +++ b/ssh/handshake.go @@ -11,6 +11,7 @@ import ( "io" "log" "net" + "strings" "sync" ) @@ -50,6 +51,10 @@ type handshakeTransport struct { // connection. hostKeys []Signer + // publicKeyAuthAlgorithms is non-empty if we are the server. In that case, + // it contains the supported client public key authentication algorithms. + publicKeyAuthAlgorithms []string + // hostKeyAlgorithms is non-empty if we are the client. In that case, // we accept these key types from the server as host key. hostKeyAlgorithms []string @@ -141,6 +146,7 @@ func newClientTransport(conn keyingTransport, clientVersion, serverVersion []byt func newServerTransport(conn keyingTransport, clientVersion, serverVersion []byte, config *ServerConfig) *handshakeTransport { t := newHandshakeTransport(conn, &config.Config, clientVersion, serverVersion) t.hostKeys = config.hostKeys + t.publicKeyAuthAlgorithms = config.PublicKeyAuthAlgorithms go t.readLoop() go t.kexLoop() return t @@ -649,6 +655,7 @@ func (t *handshakeTransport) enterKeyExchange(otherInitPacket []byte) error { // message with the server-sig-algs extension if the client supports it. See // RFC 8308, Sections 2.4 and 3.1, and [PROTOCOL], Section 1.9. if !isClient && firstKeyExchange && contains(clientInit.KexAlgos, "ext-info-c") { + supportedPubKeyAuthAlgosList := strings.Join(t.publicKeyAuthAlgorithms, ",") extInfo := &extInfoMsg{ NumExtensions: 2, Payload: make([]byte, 0, 4+15+4+len(supportedPubKeyAuthAlgosList)+4+16+4+1), diff --git a/ssh/server.go b/ssh/server.go index 727c71b9c7..8f1505af94 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -64,6 +64,13 @@ type ServerConfig struct { // Config contains configuration shared between client and server. Config + // PublicKeyAuthAlgorithms specifies the supported client public key + // authentication algorithms. Note that this should not include certificate + // types since those use the underlying algorithm. This list is sent to the + // client if it supports the server-sig-algs extension. Order is irrelevant. + // If unspecified then a default set of algorithms is used. + PublicKeyAuthAlgorithms []string + hostKeys []Signer // NoClientAuth is true if clients are allowed to connect without @@ -201,6 +208,15 @@ func NewServerConn(c net.Conn, config *ServerConfig) (*ServerConn, <-chan NewCha if fullConf.MaxAuthTries == 0 { fullConf.MaxAuthTries = 6 } + if len(fullConf.PublicKeyAuthAlgorithms) == 0 { + fullConf.PublicKeyAuthAlgorithms = supportedPubKeyAuthAlgos + } else { + for _, algo := range fullConf.PublicKeyAuthAlgorithms { + if !contains(supportedPubKeyAuthAlgos, algo) { + return nil, nil, nil, fmt.Errorf("ssh: unsupported public key authentication algorithm %s", algo) + } + } + } // Check if the config contains any unsupported key exchanges for _, kex := range fullConf.KeyExchanges { if _, ok := serverForbiddenKexAlgos[kex]; ok { @@ -524,7 +540,7 @@ userAuthLoop: return nil, parseError(msgUserAuthRequest) } algo := string(algoBytes) - if !contains(supportedPubKeyAuthAlgos, underlyingAlgo(algo)) { + if !contains(config.PublicKeyAuthAlgorithms, underlyingAlgo(algo)) { authErr = fmt.Errorf("ssh: algorithm %q not accepted", algo) break } @@ -591,7 +607,7 @@ userAuthLoop: // algorithm name that corresponds to algo with // sig.Format. This is usually the same, but // for certs, the names differ. - if !contains(supportedPubKeyAuthAlgos, sig.Format) { + if !contains(config.PublicKeyAuthAlgorithms, sig.Format) { authErr = fmt.Errorf("ssh: algorithm %q not accepted", sig.Format) break } diff --git a/ssh/server_test.go b/ssh/server_test.go new file mode 100644 index 0000000000..2145dce06f --- /dev/null +++ b/ssh/server_test.go @@ -0,0 +1,85 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssh + +import ( + "testing" +) + +func TestClientAuthRestrictedPublicKeyAlgos(t *testing.T) { + for _, tt := range []struct { + name string + key Signer + wantError bool + }{ + {"rsa", testSigners["rsa"], false}, + {"dsa", testSigners["dsa"], true}, + {"ed25519", testSigners["ed25519"], true}, + } { + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + serverConf := &ServerConfig{ + PublicKeyAuthAlgorithms: []string{KeyAlgoRSASHA256, KeyAlgoRSASHA512}, + PublicKeyCallback: func(conn ConnMetadata, key PublicKey) (*Permissions, error) { + return nil, nil + }, + } + serverConf.AddHostKey(testSigners["ecdsap256"]) + + done := make(chan struct{}) + go func() { + defer close(done) + NewServerConn(c1, serverConf) + }() + + clientConf := ClientConfig{ + User: "user", + Auth: []AuthMethod{ + PublicKeys(tt.key), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + _, _, _, err = NewClientConn(c2, "", &clientConf) + if err != nil { + if !tt.wantError { + t.Errorf("%s: got unexpected error %q", tt.name, err.Error()) + } + } else if tt.wantError { + t.Errorf("%s: succeeded, but want error", tt.name) + } + <-done + } +} + +func TestNewServerConnValidationErrors(t *testing.T) { + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + + serverConf := &ServerConfig{ + PublicKeyAuthAlgorithms: []string{CertAlgoRSAv01}, + } + _, _, _, err = NewServerConn(c1, serverConf) + if err == nil { + t.Fatal("NewServerConn with invalid public key auth algorithms succeeded") + } + serverConf = &ServerConfig{ + Config: Config{ + KeyExchanges: []string{kexAlgoDHGEXSHA256}, + }, + } + _, _, _, err = NewServerConn(c1, serverConf) + if err == nil { + t.Fatal("NewServerConn with unsupported key exchange succeeded") + } +} From ff15cd57d18f87d81a83bf288597042b2e50aaef Mon Sep 17 00:00:00 2001 From: "Bryan C. Mills" Date: Thu, 9 Nov 2023 09:23:46 -0500 Subject: [PATCH 11/95] ssh: eliminate some goroutine leaks in tests and examples This should fix the "Log in goroutine" panic seen in https://build.golang.org/log/e42bf69fc002113dbccfe602a6c67fd52e8f31df, as well as a few other related leaks. It also helps to verify that none of the functions under test deadlock unexpectedly. See https://go.dev/wiki/CodeReviewComments#goroutine-lifetimes. Updates golang/go#58901. Change-Id: Ica943444db381ae1accb80b101ea646e28ebf4f9 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/541095 Auto-Submit: Bryan Mills LUCI-TryBot-Result: Go LUCI Reviewed-by: Nicola Murino Reviewed-by: Heschi Kreinick --- ssh/example_test.go | 18 ++++++++- ssh/mux_test.go | 91 ++++++++++++++++++++++++--------------------- ssh/session_test.go | 71 +++++++++++++++++++++++++++++------ 3 files changed, 124 insertions(+), 56 deletions(-) diff --git a/ssh/example_test.go b/ssh/example_test.go index 0a6b0767c9..3920832c1a 100644 --- a/ssh/example_test.go +++ b/ssh/example_test.go @@ -16,6 +16,7 @@ import ( "os" "path/filepath" "strings" + "sync" "golang.org/x/crypto/ssh" "golang.org/x/crypto/ssh/terminal" @@ -98,8 +99,15 @@ func ExampleNewServerConn() { } log.Printf("logged in with key %s", conn.Permissions.Extensions["pubkey-fp"]) + var wg sync.WaitGroup + defer wg.Wait() + // The incoming Request channel must be serviced. - go ssh.DiscardRequests(reqs) + wg.Add(1) + go func() { + ssh.DiscardRequests(reqs) + wg.Done() + }() // Service the incoming Channel channel. for newChannel := range chans { @@ -119,16 +127,22 @@ func ExampleNewServerConn() { // Sessions have out-of-band requests such as "shell", // "pty-req" and "env". Here we handle only the // "shell" request. + wg.Add(1) go func(in <-chan *ssh.Request) { for req := range in { req.Reply(req.Type == "shell", nil) } + wg.Done() }(requests) term := terminal.NewTerminal(channel, "> ") + wg.Add(1) go func() { - defer channel.Close() + defer func() { + channel.Close() + wg.Done() + }() for { line, err := term.ReadLine() if err != nil { diff --git a/ssh/mux_test.go b/ssh/mux_test.go index 1db3be54a0..eae637d5e2 100644 --- a/ssh/mux_test.go +++ b/ssh/mux_test.go @@ -10,7 +10,6 @@ import ( "io" "sync" "testing" - "time" ) func muxPair() (*mux, *mux) { @@ -112,7 +111,11 @@ func TestMuxReadWrite(t *testing.T) { magic := "hello world" magicExt := "hello stderr" + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + wg.Add(1) go func() { + defer wg.Done() _, err := s.Write([]byte(magic)) if err != nil { t.Errorf("Write: %v", err) @@ -152,13 +155,15 @@ func TestMuxChannelOverflow(t *testing.T) { defer writer.Close() defer mux.Close() - wDone := make(chan int, 1) + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + wg.Add(1) go func() { + defer wg.Done() if _, err := writer.Write(make([]byte, channelWindowSize)); err != nil { t.Errorf("could not fill window: %v", err) } writer.Write(make([]byte, 1)) - wDone <- 1 }() writer.remoteWin.waitWriterBlocked() @@ -175,7 +180,6 @@ func TestMuxChannelOverflow(t *testing.T) { if _, err := reader.SendRequest("hello", true, nil); err == nil { t.Errorf("SendRequest succeeded.") } - <-wDone } func TestMuxChannelCloseWriteUnblock(t *testing.T) { @@ -184,20 +188,21 @@ func TestMuxChannelCloseWriteUnblock(t *testing.T) { defer writer.Close() defer mux.Close() - wDone := make(chan int, 1) + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + wg.Add(1) go func() { + defer wg.Done() if _, err := writer.Write(make([]byte, channelWindowSize)); err != nil { t.Errorf("could not fill window: %v", err) } if _, err := writer.Write(make([]byte, 1)); err != io.EOF { t.Errorf("got %v, want EOF for unblock write", err) } - wDone <- 1 }() writer.remoteWin.waitWriterBlocked() reader.Close() - <-wDone } func TestMuxConnectionCloseWriteUnblock(t *testing.T) { @@ -206,20 +211,21 @@ func TestMuxConnectionCloseWriteUnblock(t *testing.T) { defer writer.Close() defer mux.Close() - wDone := make(chan int, 1) + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + wg.Add(1) go func() { + defer wg.Done() if _, err := writer.Write(make([]byte, channelWindowSize)); err != nil { t.Errorf("could not fill window: %v", err) } if _, err := writer.Write(make([]byte, 1)); err != io.EOF { t.Errorf("got %v, want EOF for unblock write", err) } - wDone <- 1 }() writer.remoteWin.waitWriterBlocked() mux.Close() - <-wDone } func TestMuxReject(t *testing.T) { @@ -227,7 +233,12 @@ func TestMuxReject(t *testing.T) { defer server.Close() defer client.Close() + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + wg.Add(1) go func() { + defer wg.Done() + ch, ok := <-server.incomingChannels if !ok { t.Error("cannot accept channel") @@ -267,6 +278,7 @@ func TestMuxChannelRequest(t *testing.T) { var received int var wg sync.WaitGroup + t.Cleanup(wg.Wait) wg.Add(1) go func() { for r := range server.incomingRequests { @@ -295,7 +307,6 @@ func TestMuxChannelRequest(t *testing.T) { } if ok { t.Errorf("SendRequest(no): %v", ok) - } client.Close() @@ -389,13 +400,8 @@ func TestMuxUnknownChannelRequests(t *testing.T) { // Wait for the server to send the keepalive message and receive back a // response. - select { - case err := <-kDone: - if err != nil { - t.Fatal(err) - } - case <-time.After(10 * time.Second): - t.Fatalf("server never received ack") + if err := <-kDone; err != nil { + t.Fatal(err) } // Confirm client hasn't closed. @@ -403,13 +409,9 @@ func TestMuxUnknownChannelRequests(t *testing.T) { t.Fatalf("failed to send keepalive: %v", err) } - select { - case err := <-kDone: - if err != nil { - t.Fatal(err) - } - case <-time.After(10 * time.Second): - t.Fatalf("server never shut down") + // Wait for the server to shut down. + if err := <-kDone; err != nil { + t.Fatal(err) } } @@ -525,11 +527,7 @@ func TestMuxClosedChannel(t *testing.T) { defer ch.Close() // Wait for the server to close the channel and send the keepalive. - select { - case <-kDone: - case <-time.After(10 * time.Second): - t.Fatalf("server never received ack") - } + <-kDone // Make sure the channel closed. if _, ok := <-ch.incomingRequests; ok { @@ -541,22 +539,29 @@ func TestMuxClosedChannel(t *testing.T) { t.Fatalf("failed to send keepalive: %v", err) } - select { - case <-kDone: - case <-time.After(10 * time.Second): - t.Fatalf("server never shut down") - } + // Wait for the server to shut down. + <-kDone } func TestMuxGlobalRequest(t *testing.T) { + var sawPeek bool + var wg sync.WaitGroup + defer func() { + wg.Wait() + if !sawPeek { + t.Errorf("never saw 'peek' request") + } + }() + clientMux, serverMux := muxPair() defer serverMux.Close() defer clientMux.Close() - var seen bool + wg.Add(1) go func() { + defer wg.Done() for r := range serverMux.incomingRequests { - seen = seen || r.Type == "peek" + sawPeek = sawPeek || r.Type == "peek" if r.WantReply { err := r.Reply(r.Type == "yes", append([]byte(r.Type), r.Payload...)) @@ -586,10 +591,6 @@ func TestMuxGlobalRequest(t *testing.T) { t.Errorf("SendRequest(\"no\", true, \"a\"): %v %v %v", ok, data, err) } - - if !seen { - t.Errorf("never saw 'peek' request") - } } func TestMuxGlobalRequestUnblock(t *testing.T) { @@ -739,7 +740,13 @@ func TestMuxMaxPacketSize(t *testing.T) { t.Errorf("could not send packet") } - go a.SendRequest("hello", false, nil) + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + wg.Add(1) + go func() { + a.SendRequest("hello", false, nil) + wg.Done() + }() _, ok := <-b.incomingRequests if ok { diff --git a/ssh/session_test.go b/ssh/session_test.go index 521677f9b1..807a913e5a 100644 --- a/ssh/session_test.go +++ b/ssh/session_test.go @@ -13,6 +13,7 @@ import ( "io" "math/rand" "net" + "sync" "testing" "golang.org/x/crypto/ssh/terminal" @@ -27,8 +28,14 @@ func dial(handler serverType, t *testing.T) *Client { t.Fatalf("netPipe: %v", err) } + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + wg.Add(1) go func() { - defer c1.Close() + defer func() { + c1.Close() + wg.Done() + }() conf := ServerConfig{ NoClientAuth: true, } @@ -39,7 +46,11 @@ func dial(handler serverType, t *testing.T) *Client { t.Errorf("Unable to handshake: %v", err) return } - go DiscardRequests(reqs) + wg.Add(1) + go func() { + DiscardRequests(reqs) + wg.Done() + }() for newCh := range chans { if newCh.ChannelType() != "session" { @@ -52,8 +63,10 @@ func dial(handler serverType, t *testing.T) *Client { t.Errorf("Accept: %v", err) continue } + wg.Add(1) go func() { handler(ch, inReqs, t) + wg.Done() }() } if err := conn.Wait(); err != io.EOF { @@ -338,8 +351,13 @@ func TestServerWindow(t *testing.T) { t.Fatal(err) } defer session.Close() - result := make(chan []byte) + serverStdin, err := session.StdinPipe() + if err != nil { + t.Fatalf("StdinPipe failed: %v", err) + } + + result := make(chan []byte) go func() { defer close(result) echoedBuf := bytes.NewBuffer(make([]byte, 0, windowTestBytes)) @@ -355,10 +373,6 @@ func TestServerWindow(t *testing.T) { result <- echoedBuf.Bytes() }() - serverStdin, err := session.StdinPipe() - if err != nil { - t.Fatalf("StdinPipe failed: %v", err) - } written, err := copyNRandomly("stdin", serverStdin, origBuf, windowTestBytes) if err != nil { t.Errorf("failed to copy origBuf to serverStdin: %v", err) @@ -648,29 +662,44 @@ func TestSessionID(t *testing.T) { User: "user", } + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + srvErrCh := make(chan error, 1) + wg.Add(1) go func() { + defer wg.Done() conn, chans, reqs, err := NewServerConn(c1, serverConf) srvErrCh <- err if err != nil { return } serverID <- conn.SessionID() - go DiscardRequests(reqs) + wg.Add(1) + go func() { + DiscardRequests(reqs) + wg.Done() + }() for ch := range chans { ch.Reject(Prohibited, "") } }() cliErrCh := make(chan error, 1) + wg.Add(1) go func() { + defer wg.Done() conn, chans, reqs, err := NewClientConn(c2, "", clientConf) cliErrCh <- err if err != nil { return } clientID <- conn.SessionID() - go DiscardRequests(reqs) + wg.Add(1) + go func() { + DiscardRequests(reqs) + wg.Done() + }() for ch := range chans { ch.Reject(Prohibited, "") } @@ -738,6 +767,8 @@ func TestHostKeyAlgorithms(t *testing.T) { serverConf.AddHostKey(testSigners["rsa"]) serverConf.AddHostKey(testSigners["ecdsa"]) + var wg sync.WaitGroup + t.Cleanup(wg.Wait) connect := func(clientConf *ClientConfig, want string) { var alg string clientConf.HostKeyCallback = func(h string, a net.Addr, key PublicKey) error { @@ -751,7 +782,11 @@ func TestHostKeyAlgorithms(t *testing.T) { defer c1.Close() defer c2.Close() - go NewServerConn(c1, serverConf) + wg.Add(1) + go func() { + NewServerConn(c1, serverConf) + wg.Done() + }() _, _, _, err = NewClientConn(c2, "", clientConf) if err != nil { t.Fatalf("NewClientConn: %v", err) @@ -785,7 +820,11 @@ func TestHostKeyAlgorithms(t *testing.T) { defer c1.Close() defer c2.Close() - go NewServerConn(c1, serverConf) + wg.Add(1) + go func() { + NewServerConn(c1, serverConf) + wg.Done() + }() clientConf.HostKeyAlgorithms = []string{"nonexistent-hostkey-algo"} _, _, _, err = NewClientConn(c2, "", clientConf) if err == nil { @@ -818,14 +857,22 @@ func TestServerClientAuthCallback(t *testing.T) { User: someUsername, } + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + wg.Add(1) go func() { + defer wg.Done() _, chans, reqs, err := NewServerConn(c1, serverConf) if err != nil { t.Errorf("server handshake: %v", err) userCh <- "error" return } - go DiscardRequests(reqs) + wg.Add(1) + go func() { + DiscardRequests(reqs) + wg.Done() + }() for ch := range chans { ch.Reject(Prohibited, "") } From a2edfb50727c2b04a93ccc2f0f7931a02fb623d7 Mon Sep 17 00:00:00 2001 From: Roland Shoemaker Date: Thu, 9 Nov 2023 11:51:11 -0800 Subject: [PATCH 12/95] cryptobyte: fix ReadOptionalASN1Boolean ReadOptionalASN1Boolean was completely broken, it would only work when there were two BOOLEAN fields in a row, with the first being OPTIONAL (which is itself invalid ASN.1 due to the ambiguity). This fixes it to properly expect a BOOLEAN wrapped in a context-specific tag, as is the case for all of the other ReadOptionalASN1* methods, and updates its doc string. This is a breaking change as it requires adding the tag field to properly support context-specific tags. Given the method would previously not work this seems like a reasonable breakage. Fixes golang/go#43019 Change-Id: I42398256216c59988e249c90bc7aa668f64df945 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/274242 Reviewed-by: Filippo Valsorda Reviewed-by: Damien Neil LUCI-TryBot-Result: Go LUCI Auto-Submit: Roland Shoemaker --- cryptobyte/asn1.go | 13 +++++++------ cryptobyte/asn1_test.go | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/cryptobyte/asn1.go b/cryptobyte/asn1.go index 6fc2838a3f..2492f796af 100644 --- a/cryptobyte/asn1.go +++ b/cryptobyte/asn1.go @@ -733,13 +733,14 @@ func (s *String) ReadOptionalASN1OctetString(out *[]byte, outPresent *bool, tag return true } -// ReadOptionalASN1Boolean sets *out to the value of the next ASN.1 BOOLEAN or, -// if the next bytes are not an ASN.1 BOOLEAN, to the value of defaultValue. -// It reports whether the operation was successful. -func (s *String) ReadOptionalASN1Boolean(out *bool, defaultValue bool) bool { +// ReadOptionalASN1Boolean attempts to read an optional ASN.1 BOOLEAN +// explicitly tagged with tag into out and advances. If no element with a +// matching tag is present, it sets "out" to defaultValue instead. It reports +// whether the read was successful. +func (s *String) ReadOptionalASN1Boolean(out *bool, tag asn1.Tag, defaultValue bool) bool { var present bool var child String - if !s.ReadOptionalASN1(&child, &present, asn1.BOOLEAN) { + if !s.ReadOptionalASN1(&child, &present, tag) { return false } @@ -748,7 +749,7 @@ func (s *String) ReadOptionalASN1Boolean(out *bool, defaultValue bool) bool { return true } - return s.ReadASN1Boolean(out) + return child.ReadASN1Boolean(out) } func (s *String) readASN1(out *String, outTag *asn1.Tag, skipHeader bool) bool { diff --git a/cryptobyte/asn1_test.go b/cryptobyte/asn1_test.go index e3f53a932e..93760b06e9 100644 --- a/cryptobyte/asn1_test.go +++ b/cryptobyte/asn1_test.go @@ -115,6 +115,28 @@ func TestReadASN1OptionalInteger(t *testing.T) { } } +const defaultBool = false + +var optionalBoolTestData = []readASN1Test{ + {"empty", []byte{}, 0xa0, true, false}, + {"invalid", []byte{0xa1, 0x3, 0x1, 0x2, 0x7f}, 0xa1, false, defaultBool}, + {"missing", []byte{0xa1, 0x3, 0x1, 0x1, 0x7f}, 0xa0, true, defaultBool}, + {"present", []byte{0xa1, 0x3, 0x1, 0x1, 0xff}, 0xa1, true, true}, +} + +func TestReadASN1OptionalBoolean(t *testing.T) { + for _, test := range optionalBoolTestData { + t.Run(test.name, func(t *testing.T) { + in := String(test.in) + var out bool + ok := in.ReadOptionalASN1Boolean(&out, test.tag, defaultBool) + if ok != test.ok || ok && out != test.out.(bool) { + t.Errorf("in.ReadOptionalASN1Boolean() = %v, want %v; out = %v, want %v", ok, test.ok, out, test.out) + } + }) + } +} + func TestReadASN1IntegerSigned(t *testing.T) { testData64 := []struct { in []byte From 1cf1811d7195fe9bb436a00e335567575fac9b07 Mon Sep 17 00:00:00 2001 From: Matt Dainty Date: Mon, 25 Jan 2021 10:36:18 +0000 Subject: [PATCH 13/95] ssh: use the correct token from the client This fixes the case where AcceptSecContext is always called with the first token sent by the client instead of the most recently sent one. Previously, despite being being read from the client and unmarshalled, it was never actually used. Fixes golang/go#43875 Change-Id: I1967d9a107af03d6778a9437b48e785d61710ee5 GitHub-Last-Rev: 0d58e4d50014fac0a9ea1eef85489172137eb8aa GitHub-Pull-Request: golang/crypto#176 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/286252 Run-TryBot: Filippo Valsorda Auto-Submit: Filippo Valsorda TryBot-Result: Gopher Robot Reviewed-by: Matthew Dempsky LUCI-TryBot-Result: Go LUCI Run-TryBot: Nicola Murino Reviewed-by: Nicola Murino Reviewed-by: Filippo Valsorda Reviewed-by: Than McIntosh --- ssh/server.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ssh/server.go b/ssh/server.go index 8f1505af94..7f0c236a9a 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -337,7 +337,7 @@ func checkSourceAddress(addr net.Addr, sourceAddrs string) error { return fmt.Errorf("ssh: remote address %v is not allowed because of source-address restriction", addr) } -func gssExchangeToken(gssapiConfig *GSSAPIWithMICConfig, firstToken []byte, s *connection, +func gssExchangeToken(gssapiConfig *GSSAPIWithMICConfig, token []byte, s *connection, sessionID []byte, userAuthReq userAuthRequestMsg) (authErr error, perms *Permissions, err error) { gssAPIServer := gssapiConfig.Server defer gssAPIServer.DeleteSecContext() @@ -347,7 +347,7 @@ func gssExchangeToken(gssapiConfig *GSSAPIWithMICConfig, firstToken []byte, s *c outToken []byte needContinue bool ) - outToken, srcName, needContinue, err = gssAPIServer.AcceptSecContext(firstToken) + outToken, srcName, needContinue, err = gssAPIServer.AcceptSecContext(token) if err != nil { return err, nil, nil } @@ -369,6 +369,7 @@ func gssExchangeToken(gssapiConfig *GSSAPIWithMICConfig, firstToken []byte, s *c if err := Unmarshal(packet, userAuthGSSAPITokenReq); err != nil { return nil, nil, err } + token = userAuthGSSAPITokenReq.Token } packet, err := s.transport.readPacket() if err != nil { From 270bf2552c05c1943a1c950e3afa3a15663e0277 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Thu, 9 Nov 2023 22:14:40 +0000 Subject: [PATCH 14/95] curve25519/internal/field/_asm: go mod tidy to fix x/sys version Relates to CL 540537, which updated the dependency in the main module. Change-Id: I9a745f4e03b5cf14fa62c4de63363ddf663b19fd GitHub-Last-Rev: 836c39364e9fe4302bc26efc9dabc47680cb66d2 GitHub-Pull-Request: golang/crypto#277 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/541276 Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek Reviewed-by: Roland Shoemaker Reviewed-by: Filippo Valsorda --- curve25519/internal/field/_asm/go.mod | 2 +- curve25519/internal/field/_asm/go.sum | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/curve25519/internal/field/_asm/go.mod b/curve25519/internal/field/_asm/go.mod index bf6dbc73cd..f6902f4a64 100644 --- a/curve25519/internal/field/_asm/go.mod +++ b/curve25519/internal/field/_asm/go.mod @@ -9,7 +9,7 @@ require ( require ( golang.org/x/mod v0.8.0 // indirect - golang.org/x/sys v0.13.0 // indirect + golang.org/x/sys v0.14.0 // indirect golang.org/x/tools v0.6.0 // indirect ) diff --git a/curve25519/internal/field/_asm/go.sum b/curve25519/internal/field/_asm/go.sum index 3f57ad913c..96b7915d4f 100644 --- a/curve25519/internal/field/_asm/go.sum +++ b/curve25519/internal/field/_asm/go.sum @@ -26,21 +26,21 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q= +golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= -golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= +golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= From 1c17e20020f974158d1b45be166660c999d6269b Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sun, 16 Jul 2023 14:25:08 +0200 Subject: [PATCH 15/95] ssh: fix certificate authentication with OpenSSH 7.2-7.7 OpenSSH 7.2-7.7 advertises support for rsa-sha2-256 and rsa-sha2-512 in the "server-sig-algs" extension but doesn't support these algorithms for certificate authentication, so if the server rejects the key try to use the obtained algorithm as if "server-sig-algs" had not been implemented. Fixes golang/go#58371 Change-Id: Id49960d3dedd32a21e2c6c2689b1696e05398286 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/510155 Reviewed-by: Filippo Valsorda Run-TryBot: Nicola Murino Reviewed-by: Dmitri Shuralyov Reviewed-by: Michael Knyszek TryBot-Result: Gopher Robot LUCI-TryBot-Result: Go LUCI Auto-Submit: Nicola Murino --- ssh/client_auth.go | 20 +++++++++++++++++++- ssh/common.go | 8 ++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/ssh/client_auth.go b/ssh/client_auth.go index 5c3bc25723..34bf089d0b 100644 --- a/ssh/client_auth.go +++ b/ssh/client_auth.go @@ -307,7 +307,10 @@ func (cb publicKeyCallback) auth(session []byte, user string, c packetConn, rand } var methods []string var errSigAlgo error - for _, signer := range signers { + + origSignersLen := len(signers) + for idx := 0; idx < len(signers); idx++ { + signer := signers[idx] pub := signer.PublicKey() as, algo, err := pickSignatureAlgorithm(signer, extensions) if err != nil && errSigAlgo == nil { @@ -321,6 +324,21 @@ func (cb publicKeyCallback) auth(session []byte, user string, c packetConn, rand if err != nil { return authFailure, nil, err } + // OpenSSH 7.2-7.7 advertises support for rsa-sha2-256 and rsa-sha2-512 + // in the "server-sig-algs" extension but doesn't support these + // algorithms for certificate authentication, so if the server rejects + // the key try to use the obtained algorithm as if "server-sig-algs" had + // not been implemented if supported from the algorithm signer. + if !ok && idx < origSignersLen && isRSACert(algo) && algo != CertAlgoRSAv01 { + if contains(as.Algorithms(), KeyAlgoRSA) { + // We retry using the compat algorithm after all signers have + // been tried normally. + signers = append(signers, &multiAlgorithmSigner{ + AlgorithmSigner: as, + supportedAlgorithms: []string{KeyAlgoRSA}, + }) + } + } if !ok { continue } diff --git a/ssh/common.go b/ssh/common.go index dd2ab0d69a..7e9c2cbc64 100644 --- a/ssh/common.go +++ b/ssh/common.go @@ -127,6 +127,14 @@ func isRSA(algo string) bool { return contains(algos, underlyingAlgo(algo)) } +func isRSACert(algo string) bool { + _, ok := certKeyAlgoNames[algo] + if !ok { + return false + } + return isRSA(algo) +} + // supportedPubKeyAuthAlgos specifies the supported client public key // authentication algorithms. Note that this doesn't include certificate types // since those use the underlying algorithm. This list is sent to the client if From b2d7c26edb17864f117d8b0ee73c1843bcc6090f Mon Sep 17 00:00:00 2001 From: Randy Reddig Date: Thu, 22 Jun 2023 16:06:05 +0000 Subject: [PATCH 16/95] ssh: add (*Client).DialContext method This change adds DialContext to ssh.Client, which opens a TCP-IP connection tunneled over the SSH connection. This is useful for proxying network connections, e.g. setting (net/http.Transport).DialContext. Fixes golang/go#20288. Change-Id: I110494c00962424ea803065535ebe2209364ac27 GitHub-Last-Rev: 3176984a71a9a1422702e3a071340ecfff71ff62 GitHub-Pull-Request: golang/crypto#260 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/504735 Run-TryBot: Nicola Murino Run-TryBot: Han-Wen Nienhuys Auto-Submit: Nicola Murino Reviewed-by: Han-Wen Nienhuys Reviewed-by: Dmitri Shuralyov TryBot-Result: Gopher Robot Reviewed-by: Nicola Murino Commit-Queue: Nicola Murino --- ssh/tcpip.go | 35 +++++++++++++++++++++++++++++++++++ ssh/tcpip_test.go | 33 +++++++++++++++++++++++++++++++++ ssh/test/dial_unix_test.go | 7 ++++++- 3 files changed, 74 insertions(+), 1 deletion(-) diff --git a/ssh/tcpip.go b/ssh/tcpip.go index 80d35f5ec1..ef5059a11d 100644 --- a/ssh/tcpip.go +++ b/ssh/tcpip.go @@ -5,6 +5,7 @@ package ssh import ( + "context" "errors" "fmt" "io" @@ -332,6 +333,40 @@ func (l *tcpListener) Addr() net.Addr { return l.laddr } +// DialContext initiates a connection to the addr from the remote host. +// +// The provided Context must be non-nil. If the context expires before the +// connection is complete, an error is returned. Once successfully connected, +// any expiration of the context will not affect the connection. +// +// See func Dial for additional information. +func (c *Client) DialContext(ctx context.Context, n, addr string) (net.Conn, error) { + if err := ctx.Err(); err != nil { + return nil, err + } + type connErr struct { + conn net.Conn + err error + } + ch := make(chan connErr) + go func() { + conn, err := c.Dial(n, addr) + select { + case ch <- connErr{conn, err}: + case <-ctx.Done(): + if conn != nil { + conn.Close() + } + } + }() + select { + case res := <-ch: + return res.conn, res.err + case <-ctx.Done(): + return nil, ctx.Err() + } +} + // Dial initiates a connection to the addr from the remote host. // The resulting connection has a zero LocalAddr() and RemoteAddr(). func (c *Client) Dial(n, addr string) (net.Conn, error) { diff --git a/ssh/tcpip_test.go b/ssh/tcpip_test.go index f1265cb496..4d85114727 100644 --- a/ssh/tcpip_test.go +++ b/ssh/tcpip_test.go @@ -5,7 +5,10 @@ package ssh import ( + "context" + "net" "testing" + "time" ) func TestAutoPortListenBroken(t *testing.T) { @@ -18,3 +21,33 @@ func TestAutoPortListenBroken(t *testing.T) { t.Errorf("version %q marked as broken", works) } } + +func TestClientImplementsDialContext(t *testing.T) { + type ContextDialer interface { + DialContext(context.Context, string, string) (net.Conn, error) + } + // Belt and suspenders assertion, since package net does not + // declare a ContextDialer type. + var _ ContextDialer = &net.Dialer{} + var _ ContextDialer = &Client{} +} + +func TestClientDialContextWithCancel(t *testing.T) { + c := &Client{} + ctx, cancel := context.WithCancel(context.Background()) + cancel() + _, err := c.DialContext(ctx, "tcp", "localhost:1000") + if err != context.Canceled { + t.Errorf("DialContext: got nil error, expected %v", context.Canceled) + } +} + +func TestClientDialContextWithDeadline(t *testing.T) { + c := &Client{} + ctx, cancel := context.WithDeadline(context.Background(), time.Now()) + defer cancel() + _, err := c.DialContext(ctx, "tcp", "localhost:1000") + if err != context.DeadlineExceeded { + t.Errorf("DialContext: got nil error, expected %v", context.DeadlineExceeded) + } +} diff --git a/ssh/test/dial_unix_test.go b/ssh/test/dial_unix_test.go index 0a5f5e395f..8ec8d50a50 100644 --- a/ssh/test/dial_unix_test.go +++ b/ssh/test/dial_unix_test.go @@ -9,6 +9,7 @@ package test // direct-tcpip and direct-streamlocal functional tests import ( + "context" "fmt" "io" "net" @@ -46,7 +47,11 @@ func testDial(t *testing.T, n, listenAddr string, x dialTester) { } }() - conn, err := sshConn.Dial(n, l.Addr().String()) + ctx, cancel := context.WithCancel(context.Background()) + conn, err := sshConn.DialContext(ctx, n, l.Addr().String()) + // Canceling the context after dial should have no effect + // on the opened connection. + cancel() if err != nil { t.Fatalf("Dial: %v", err) } From 1eadac50a566dfaa1b603ca15e8ad3cbd1c77b20 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 27 Nov 2023 15:56:14 +0000 Subject: [PATCH 17/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I7fdfe509173c79a63d006b27d674f869a5baa2af Reviewed-on: https://go-review.googlesource.com/c/crypto/+/545098 Reviewed-by: Heschi Kreinick LUCI-TryBot-Result: Go LUCI Auto-Submit: Gopher Robot Reviewed-by: Dmitri Shuralyov --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 90eec5425c..d676a454ad 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.10.0 // tagx:ignore - golang.org/x/sys v0.14.0 - golang.org/x/term v0.14.0 + golang.org/x/sys v0.15.0 + golang.org/x/term v0.15.0 ) require golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index 49ce5c4aee..f9352ee97e 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q= -golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.14.0 h1:LGK9IlZ8T9jvdy6cTdfKUCltatMFOehAQo9SRC46UQ8= -golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= +golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From 325b735346247f48971d2b37d24dd180a35f391f Mon Sep 17 00:00:00 2001 From: Heschi Kreinick Date: Mon, 27 Nov 2023 11:17:04 -0500 Subject: [PATCH 18/95] ssh/test: skip TestSSHCLIAuth on Windows It's failing with a file permissions error: sign_and_send_pubkey: signing using rsa-sha2-512 SHA256:Anr3LjZK8YVpjrxu79myrW9Hrb/wpcMNpVvTq/RcBm8\r\nBad permissions. Try removing permissions for user: UNKNOWN\\\\UNKNOWN (S-1-15-2-2) on file C:/b/s/w/ir/x/t/TestSSHCLIAuth1586735692/001/rsa. For golang/go#64403 Change-Id: Iece8eac4a1ac349f9f7a273ac7389315cb96568e Cq-Include-Trybots: luci.golang.try:x_crypto-gotip-windows-amd64-longtest,x_crypto-go1.21-windows-amd64-longtest,x_crypto-go1.20-windows-amd64-longtest Reviewed-on: https://go-review.googlesource.com/c/crypto/+/545135 Reviewed-by: Dmitri Shuralyov Reviewed-by: Nicola Murino Auto-Submit: Heschi Kreinick LUCI-TryBot-Result: Go LUCI --- ssh/test/sshcli_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ssh/test/sshcli_test.go b/ssh/test/sshcli_test.go index d3b85d77e2..ac2f7c10a9 100644 --- a/ssh/test/sshcli_test.go +++ b/ssh/test/sshcli_test.go @@ -10,6 +10,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "testing" "golang.org/x/crypto/internal/testenv" @@ -34,6 +35,9 @@ func sshClient(t *testing.T) string { } func TestSSHCLIAuth(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skipf("always fails on Windows, see #64403") + } sshCLI := sshClient(t) dir := t.TempDir() keyPrivPath := filepath.Join(dir, "rsa") From bda2f3f5cfce3f27039acccd823693f6d67c2a74 Mon Sep 17 00:00:00 2001 From: Egon Elbre Date: Thu, 1 Jul 2021 18:56:44 +0300 Subject: [PATCH 19/95] argon2: avoid clobbering BP go vet was reporting blamka_amd64.s:203:1: [amd64] mixBlocksSSE2: invalid offset a+24(FP); expected a+8(FP) blamka_amd64.s:226:1: [amd64] xorBlocksSSE2: invalid offset a+24(FP); expected a+8(FP) blamka_amd64.s:204:1: frame pointer is clobbered before saving blamka_amd64.s:227:1: frame pointer is clobbered before saving Also fix a similar naming issue in sha3: sha3\keccakf_amd64.s:325:1: [amd64] keccakF1600: unknown variable state; offset 0 is a+0(FP) Updates golang/go#47027 Change-Id: Ia74852cdb0721ae0216787054197b0cac9e1c0f8 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/332289 Reviewed-by: Michael Knyszek Reviewed-by: Nicola Murino Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Auto-Submit: Dmitri Shuralyov Reviewed-by: Filippo Valsorda --- argon2/blamka_amd64.s | 12 ++++++------ sha3/keccakf_amd64.s | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/argon2/blamka_amd64.s b/argon2/blamka_amd64.s index f3b653a12f..6713accac0 100644 --- a/argon2/blamka_amd64.s +++ b/argon2/blamka_amd64.s @@ -199,8 +199,8 @@ TEXT ·mixBlocksSSE2(SB), 4, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX - MOVQ a+24(FP), CX - MOVQ $128, BP + MOVQ c+24(FP), CX + MOVQ $128, DI loop: MOVOU 0(AX), X0 @@ -213,7 +213,7 @@ loop: ADDQ $16, BX ADDQ $16, CX ADDQ $16, DX - SUBQ $2, BP + SUBQ $2, DI JA loop RET @@ -222,8 +222,8 @@ TEXT ·xorBlocksSSE2(SB), 4, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX - MOVQ a+24(FP), CX - MOVQ $128, BP + MOVQ c+24(FP), CX + MOVQ $128, DI loop: MOVOU 0(AX), X0 @@ -238,6 +238,6 @@ loop: ADDQ $16, BX ADDQ $16, CX ADDQ $16, DX - SUBQ $2, BP + SUBQ $2, DI JA loop RET diff --git a/sha3/keccakf_amd64.s b/sha3/keccakf_amd64.s index 8fb26aebb2..1f53938861 100644 --- a/sha3/keccakf_amd64.s +++ b/sha3/keccakf_amd64.s @@ -319,9 +319,9 @@ MOVQ rDi, _si(oState); \ MOVQ rDo, _so(oState) \ -// func keccakF1600(state *[25]uint64) +// func keccakF1600(a *[25]uint64) TEXT ·keccakF1600(SB), 0, $200-8 - MOVQ state+0(FP), rpState + MOVQ a+0(FP), rpState // Convert the user state into an internal state NOTQ _be(rpState) From 7e6fbd82c804e1760feb603fe21caecb0af0a124 Mon Sep 17 00:00:00 2001 From: Pavel Repin Date: Mon, 27 Nov 2023 16:26:11 +0000 Subject: [PATCH 20/95] ssh: wrap errors from client handshake When an error is returned by a user defined host key callback, it is now possible to handle it using standard Go mechanisms such as errors.Is or errors.As. Fixes golang/go#61309 Change-Id: I4269c5f8eacd8e7e8d85070ad249f0e27777b15f GitHub-Last-Rev: d2a34d5c8225d6aaaee287ce3ea8b218fbe210d4 GitHub-Pull-Request: golang/crypto#266 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/508876 Run-TryBot: Nicola Murino Auto-Submit: Dmitri Shuralyov Reviewed-by: Muhammad Shulhan Reviewed-by: Michael Knyszek Reviewed-by: Dmitri Shuralyov Reviewed-by: Nicola Murino TryBot-Result: Gopher Robot --- ssh/client.go | 2 +- ssh/client_test.go | 25 +++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/ssh/client.go b/ssh/client.go index bdc356cbdf..fd8c49749e 100644 --- a/ssh/client.go +++ b/ssh/client.go @@ -82,7 +82,7 @@ func NewClientConn(c net.Conn, addr string, config *ClientConfig) (Conn, <-chan if err := conn.clientHandshake(addr, &fullConf); err != nil { c.Close() - return nil, nil, nil, fmt.Errorf("ssh: handshake failed: %v", err) + return nil, nil, nil, fmt.Errorf("ssh: handshake failed: %w", err) } conn.mux = newMux(conn.transport) return conn, conn.mux.incomingChannels, conn.mux.incomingRequests, nil diff --git a/ssh/client_test.go b/ssh/client_test.go index c114573469..2621f0ea52 100644 --- a/ssh/client_test.go +++ b/ssh/client_test.go @@ -7,6 +7,9 @@ package ssh import ( "bytes" "crypto/rand" + "errors" + "fmt" + "net" "strings" "testing" ) @@ -207,9 +210,12 @@ func TestBannerCallback(t *testing.T) { } func TestNewClientConn(t *testing.T) { + errHostKeyMismatch := errors.New("host key mismatch") + for _, tt := range []struct { - name string - user string + name string + user string + simulateHostKeyMismatch HostKeyCallback }{ { name: "good user field for ConnMetadata", @@ -219,6 +225,13 @@ func TestNewClientConn(t *testing.T) { name: "empty user field for ConnMetadata", user: "", }, + { + name: "host key mismatch", + user: "testuser", + simulateHostKeyMismatch: func(hostname string, remote net.Addr, key PublicKey) error { + return fmt.Errorf("%w: %s", errHostKeyMismatch, bytes.TrimSpace(MarshalAuthorizedKey(key))) + }, + }, } { t.Run(tt.name, func(t *testing.T) { c1, c2, err := netPipe() @@ -243,8 +256,16 @@ func TestNewClientConn(t *testing.T) { }, HostKeyCallback: InsecureIgnoreHostKey(), } + + if tt.simulateHostKeyMismatch != nil { + clientConf.HostKeyCallback = tt.simulateHostKeyMismatch + } + clientConn, _, _, err := NewClientConn(c2, "", clientConf) if err != nil { + if tt.simulateHostKeyMismatch != nil && errors.Is(err, errHostKeyMismatch) { + return + } t.Fatal(err) } From b8ffc16e10063067bac0e15c6d7f7995937503ce Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Thu, 9 Nov 2023 22:33:39 +0000 Subject: [PATCH 21/95] blake2b: drop Go 1.6, Go 1.8 compatibility Other packages already dropped compatibility with go < 1.12, so it should be safe to remove it for this package as well. Change-Id: I7e894fd11d2e7d1fe28c647bd921399a9a6e30d0 GitHub-Last-Rev: 2b4f576a19338f185e79f39cbfd476573b986369 GitHub-Pull-Request: golang/crypto#240 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/448240 Run-TryBot: Nicola Murino Reviewed-by: Joedian Reid Reviewed-by: Nicola Murino Commit-Queue: Nicola Murino Auto-Submit: Nicola Murino TryBot-Result: Gopher Robot Reviewed-by: Michael Knyszek Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov --- blake2b/blake2bAVX2_amd64.go | 2 +- blake2b/blake2bAVX2_amd64.s | 2 +- blake2b/blake2b_amd64.go | 24 ------------------------ blake2b/register.go | 2 -- 4 files changed, 2 insertions(+), 28 deletions(-) delete mode 100644 blake2b/blake2b_amd64.go diff --git a/blake2b/blake2bAVX2_amd64.go b/blake2b/blake2bAVX2_amd64.go index 4f506f8791..199c21d27a 100644 --- a/blake2b/blake2bAVX2_amd64.go +++ b/blake2b/blake2bAVX2_amd64.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build go1.7 && amd64 && gc && !purego +//go:build amd64 && gc && !purego package blake2b diff --git a/blake2b/blake2bAVX2_amd64.s b/blake2b/blake2bAVX2_amd64.s index 353bb7cac5..9ae8206c20 100644 --- a/blake2b/blake2bAVX2_amd64.s +++ b/blake2b/blake2bAVX2_amd64.s @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build go1.7 && amd64 && gc && !purego +//go:build amd64 && gc && !purego #include "textflag.h" diff --git a/blake2b/blake2b_amd64.go b/blake2b/blake2b_amd64.go deleted file mode 100644 index 1d0770abba..0000000000 --- a/blake2b/blake2b_amd64.go +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.7 && amd64 && gc && !purego - -package blake2b - -import "golang.org/x/sys/cpu" - -func init() { - useSSE4 = cpu.X86.HasSSE41 -} - -//go:noescape -func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) - -func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) { - if useSSE4 { - hashBlocksSSE4(h, c, flag, blocks) - } else { - hashBlocksGeneric(h, c, flag, blocks) - } -} diff --git a/blake2b/register.go b/blake2b/register.go index d9fcac3a4d..54e446e1d2 100644 --- a/blake2b/register.go +++ b/blake2b/register.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build go1.9 - package blake2b import ( From fdfe1f8531a1adcc300c8eba98cb372044826d62 Mon Sep 17 00:00:00 2001 From: Will Mortensen Date: Fri, 16 Dec 2022 15:56:24 -0800 Subject: [PATCH 22/95] ssh: defer channel window adjustment Sending a window adjustment after every read is unnecessarily chatty, especially with a series of small reads like with TTY interactions. Copy OpenSSH's logic for deferring these, which seemingly hasn't changed since 2007. Note that since channelWindowSize and c.maxIncomingPayload are currently constants here, the two checks could be combined into a single check for c.myWindow < 2 MiB - 96 KiB (with the current values of the constants). Fixes golang/go#57424. Change-Id: Ifcef5be76fcc3f0b1a6dc396096bed9c50d64f21 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/459915 Reviewed-by: Nicola Murino Reviewed-by: Michael Knyszek Run-TryBot: Nicola Murino Auto-Submit: Nicola Murino Reviewed-by: Dmitri Shuralyov Commit-Queue: Nicola Murino TryBot-Result: Gopher Robot --- ssh/channel.go | 28 +++++++++++++----- ssh/mempipe_test.go | 20 +++++++++++-- ssh/mux_test.go | 71 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 11 deletions(-) diff --git a/ssh/channel.go b/ssh/channel.go index c0834c00df..cc0bb7ab64 100644 --- a/ssh/channel.go +++ b/ssh/channel.go @@ -187,9 +187,11 @@ type channel struct { pending *buffer extPending *buffer - // windowMu protects myWindow, the flow-control window. - windowMu sync.Mutex - myWindow uint32 + // windowMu protects myWindow, the flow-control window, and myConsumed, + // the number of bytes consumed since we last increased myWindow + windowMu sync.Mutex + myWindow uint32 + myConsumed uint32 // writeMu serializes calls to mux.conn.writePacket() and // protects sentClose and packetPool. This mutex must be @@ -332,14 +334,24 @@ func (ch *channel) handleData(packet []byte) error { return nil } -func (c *channel) adjustWindow(n uint32) error { +func (c *channel) adjustWindow(adj uint32) error { c.windowMu.Lock() - // Since myWindow is managed on our side, and can never exceed - // the initial window setting, we don't worry about overflow. - c.myWindow += uint32(n) + // Since myConsumed and myWindow are managed on our side, and can never + // exceed the initial window setting, we don't worry about overflow. + c.myConsumed += adj + var sendAdj uint32 + if (channelWindowSize-c.myWindow > 3*c.maxIncomingPayload) || + (c.myWindow < channelWindowSize/2) { + sendAdj = c.myConsumed + c.myConsumed = 0 + c.myWindow += sendAdj + } c.windowMu.Unlock() + if sendAdj == 0 { + return nil + } return c.sendMessage(windowAdjustMsg{ - AdditionalBytes: uint32(n), + AdditionalBytes: sendAdj, }) } diff --git a/ssh/mempipe_test.go b/ssh/mempipe_test.go index 8697cd6140..f27339c51a 100644 --- a/ssh/mempipe_test.go +++ b/ssh/mempipe_test.go @@ -13,9 +13,10 @@ import ( // An in-memory packetConn. It is safe to call Close and writePacket // from different goroutines. type memTransport struct { - eof bool - pending [][]byte - write *memTransport + eof bool + pending [][]byte + write *memTransport + writeCount uint64 sync.Mutex *sync.Cond } @@ -63,9 +64,16 @@ func (t *memTransport) writePacket(p []byte) error { copy(c, p) t.write.pending = append(t.write.pending, c) t.write.Cond.Signal() + t.writeCount++ return nil } +func (t *memTransport) getWriteCount() uint64 { + t.write.Lock() + defer t.write.Unlock() + return t.writeCount +} + func memPipe() (a, b packetConn) { t1 := memTransport{} t2 := memTransport{} @@ -81,6 +89,9 @@ func TestMemPipe(t *testing.T) { if err := a.writePacket([]byte{42}); err != nil { t.Fatalf("writePacket: %v", err) } + if wc := a.(*memTransport).getWriteCount(); wc != 1 { + t.Fatalf("got %v, want 1", wc) + } if err := a.Close(); err != nil { t.Fatal("Close: ", err) } @@ -95,6 +106,9 @@ func TestMemPipe(t *testing.T) { if err != io.EOF { t.Fatalf("got %v, %v, want EOF", p, err) } + if wc := b.(*memTransport).getWriteCount(); wc != 0 { + t.Fatalf("got %v, want 0", wc) + } } func TestDoubleClose(t *testing.T) { diff --git a/ssh/mux_test.go b/ssh/mux_test.go index eae637d5e2..21f0ac3e32 100644 --- a/ssh/mux_test.go +++ b/ssh/mux_test.go @@ -182,6 +182,40 @@ func TestMuxChannelOverflow(t *testing.T) { } } +func TestMuxChannelReadUnblock(t *testing.T) { + reader, writer, mux := channelPair(t) + defer reader.Close() + defer writer.Close() + defer mux.Close() + + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + wg.Add(1) + go func() { + defer wg.Done() + if _, err := writer.Write(make([]byte, channelWindowSize)); err != nil { + t.Errorf("could not fill window: %v", err) + } + if _, err := writer.Write(make([]byte, 1)); err != nil { + t.Errorf("Write: %v", err) + } + writer.Close() + }() + + writer.remoteWin.waitWriterBlocked() + + buf := make([]byte, 32768) + for { + _, err := reader.Read(buf) + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("Read: %v", err) + } + } +} + func TestMuxChannelCloseWriteUnblock(t *testing.T) { reader, writer, mux := channelPair(t) defer reader.Close() @@ -754,6 +788,43 @@ func TestMuxMaxPacketSize(t *testing.T) { } } +func TestMuxChannelWindowDeferredUpdates(t *testing.T) { + s, c, mux := channelPair(t) + cTransport := mux.conn.(*memTransport) + defer s.Close() + defer c.Close() + defer mux.Close() + + var wg sync.WaitGroup + t.Cleanup(wg.Wait) + + data := make([]byte, 1024) + + wg.Add(1) + go func() { + defer wg.Done() + _, err := s.Write(data) + if err != nil { + t.Errorf("Write: %v", err) + return + } + }() + cWritesInit := cTransport.getWriteCount() + buf := make([]byte, 1) + for i := 0; i < len(data); i++ { + n, err := c.Read(buf) + if n != len(buf) || err != nil { + t.Fatalf("Read: %v, %v", n, err) + } + } + cWrites := cTransport.getWriteCount() - cWritesInit + // reading 1 KiB should not cause any window updates to be sent, but allow + // for some unexpected writes + if cWrites > 30 { + t.Fatalf("reading 1 KiB from channel caused %v writes", cWrites) + } +} + // Don't ship code with debug=true. func TestDebug(t *testing.T) { if debugMux { From 152cdb1503ebc13bc0fbb68f92ee189ebf9e3d00 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 20 Nov 2023 16:01:22 +0000 Subject: [PATCH 23/95] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: Ib8c85dc815297de7b59c3e23b0ad029baaf948ec Reviewed-on: https://go-review.googlesource.com/c/crypto/+/543735 Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Roland Shoemaker --- x509roots/fallback/bundle.go | 187 ++++++++++++++++++++++++++++------- 1 file changed, 150 insertions(+), 37 deletions(-) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index 542ac87217..ad9ca4db1e 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -446,43 +446,6 @@ f9DEMnDAGf/JOC0ULGb0QkTmVXYbgBVX/8Cnp6o5qtjTcNAuuuuUavpfNIbnYrX9 ivAwhZTJryQCL2/W3Wf+47BVTwSYT6RBVuKT0Gro1vP7ZeDOdcQxWQzugsgMYDNK GbqEZycPvEJdvSRUDewdcAZfpLz6IHxV -----END CERTIFICATE----- -# CN=Autoridad de Certificacion Firmaprofesional CIF A62634068,C=ES -# 04048028bf1f2864d48f9ad4d83294366a828856553f3b14303f90147f5d40ef ------BEGIN CERTIFICATE----- -MIIGFDCCA/ygAwIBAgIIU+w77vuySF8wDQYJKoZIhvcNAQEFBQAwUTELMAkGA1UE -BhMCRVMxQjBABgNVBAMMOUF1dG9yaWRhZCBkZSBDZXJ0aWZpY2FjaW9uIEZpcm1h -cHJvZmVzaW9uYWwgQ0lGIEE2MjYzNDA2ODAeFw0wOTA1MjAwODM4MTVaFw0zMDEy -MzEwODM4MTVaMFExCzAJBgNVBAYTAkVTMUIwQAYDVQQDDDlBdXRvcmlkYWQgZGUg -Q2VydGlmaWNhY2lvbiBGaXJtYXByb2Zlc2lvbmFsIENJRiBBNjI2MzQwNjgwggIi -MA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDKlmuO6vj78aI14H9M2uDDUtd9 -thDIAl6zQyrET2qyyhxdKJp4ERppWVevtSBC5IsP5t9bpgOSL/UR5GLXMnE42QQM -cas9UX4PB99jBVzpv5RvwSmCwLTaUbDBPLutN0pcyvFLNg4kq7/DhHf9qFD0sefG -L9ItWY16Ck6WaVICqjaY7Pz6FIMMNx/Jkjd/14Et5cS54D40/mf0PmbR0/RAz15i -NA9wBj4gGFrO93IbJWyTdBSTo3OxDqqHECNZXyAFGUftaI6SEspd/NYrspI8IM/h -X68gvqB2f3bl7BqGYTM+53u0P6APjqK5am+5hyZvQWyIplD9amML9ZMWGxmPsu2b -m8mQ9QEM3xk9Dz44I8kvjwzRAv4bVdZO0I08r0+k8/6vKtMFnXkIoctXMbScyJCy -Z/QYFpM6/EfY0XiWMR+6KwxfXZmtY4laJCB22N/9q06mIqqdXuYnin1oKaPnirja -EbsXLZmdEyRG98Xi2J+Of8ePdG1asuhy9azuJBCtLxTa/y2aRnFHvkLfuwHb9H/T -KI8xWVvTyQKmtFLKbpf7Q8UIJm+K9Lv9nyiqDdVF8xM6HdjAeI9BZzwelGSuewvF -6NkBiDkal4ZkQdU7hwxu+g/GvUgUvzlN1J5Bto+WHWOWk9mVBngxaJ43BjuAiUVh -OSPHG0SjFeUc+JIwuwIDAQABo4HvMIHsMBIGA1UdEwEB/wQIMAYBAf8CAQEwDgYD -VR0PAQH/BAQDAgEGMB0GA1UdDgQWBBRlzeurNR4APn7VdMActHNHDhpkLzCBpgYD -VR0gBIGeMIGbMIGYBgRVHSAAMIGPMC8GCCsGAQUFBwIBFiNodHRwOi8vd3d3LmZp -cm1hcHJvZmVzaW9uYWwuY29tL2NwczBcBggrBgEFBQcCAjBQHk4AUABhAHMAZQBv -ACAAZABlACAAbABhACAAQgBvAG4AYQBuAG8AdgBhACAANAA3ACAAQgBhAHIAYwBl -AGwAbwBuAGEAIAAwADgAMAAxADcwDQYJKoZIhvcNAQEFBQADggIBABd9oPm03cXF -661LJLWhAqvdpYhKsg9VSytXjDvlMd3+xDLx51tkljYyGOylMnfX40S2wBEqgLk9 -am58m9Ot/MPWo+ZkKXzR4Tgegiv/J2Wv+xYVxC5xhOW1//qkR71kMrv2JYSiJ0L1 -ILDCExARzRAVukKQKtJE4ZYm6zFIEv0q2skGz3QeqUvVhyj5eTSSPi5E6PaPT481 -PyWzOdxjKpBrIF/EUhJOlywqrJ2X3kjyo2bbwtKDlaZmp54lD+kLM5FlClrD2VQS -3a/DTg4fJl4N3LON7NWBcN7STyQF82xO9UxJZo3R/9ILJUFI/lGExkKvgATP0H5k -SeTy36LssUzAKh3ntLFlosS88Zj0qnAHY7S42jtM+kAiMFsRpvAFDsYCA0irhpuF -3dvd6qJ2gHN99ZwExEWN57kci57q13XRcrHedUTnQn3iV2t93Jm8PYMo6oCTjcVM -ZcFwgbg4/EMxsvYDNEeyrPsiBsse3RdHHF9mudMaotoRsaS8I8nkvof/uZS2+F0g -StRf571oe2XyFR7SOqkt6dhrJKyXWERHrVkY8SFlcN7ONGCoQPHzPKTDKCOM/icz -Q0CgFzzr6juwcqajuUpLXhZI9LK8yIySxZ2frHI2vDSANGupi5LAuBft7HZT9SQB -jLMi6Et8Vcad+qMUu2WFbm5PEn4KPJ2V ------END CERTIFICATE----- # CN=BJCA Global Root CA1,O=BEIJING CERTIFICATE AUTHORITY,C=CN # f3896f88fe7c0a882766a7fa6ad2749fb57a7f3e98fb769c1fa7b09c2c44d5ae -----BEGIN CERTIFICATE----- @@ -992,6 +955,104 @@ qJZ9ZPskWkoDbGs4xugDQ5r3V7mzKWmTOPQD8rv7gmsHINFSH5pkAnuYZttcTVoP 0ISVoDwUQwbKytu4QTbaakRnh6+v40URFWkIsr4WOZckbxJF0WddCajJFdr60qZf E2Efv4WstK2tBZQIgx51F9NxO5NQI1mg7TyRVJ12AMXDuDjb -----END CERTIFICATE----- +# CN=CommScope Public Trust ECC Root-01,O=CommScope,C=US +# 11437cda7bb45e41365f45b39a38986b0de00def348e0c7bb0873633800bc38b +-----BEGIN CERTIFICATE----- +MIICHTCCAaOgAwIBAgIUQ3CCd89NXTTxyq4yLzf39H91oJ4wCgYIKoZIzj0EAwMw +TjELMAkGA1UEBhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29t +bVNjb3BlIFB1YmxpYyBUcnVzdCBFQ0MgUm9vdC0wMTAeFw0yMTA0MjgxNzM1NDNa +Fw00NjA0MjgxNzM1NDJaME4xCzAJBgNVBAYTAlVTMRIwEAYDVQQKDAlDb21tU2Nv +cGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1c3QgRUNDIFJvb3QtMDEw +djAQBgcqhkjOPQIBBgUrgQQAIgNiAARLNumuV16ocNfQj3Rid8NeeqrltqLxeP0C +flfdkXmcbLlSiFS8LwS+uM32ENEp7LXQoMPwiXAZu1FlxUOcw5tjnSCDPgYLpkJE +hRGnSjot6dZoL0hOUysHP029uax3OVejQjBAMA8GA1UdEwEB/wQFMAMBAf8wDgYD +VR0PAQH/BAQDAgEGMB0GA1UdDgQWBBSOB2LAUN3GGQYARnQE9/OufXVNMDAKBggq +hkjOPQQDAwNoADBlAjEAnDPfQeMjqEI2Jpc1XHvr20v4qotzVRVcrHgpD7oh2MSg +2NED3W3ROT3Ek2DS43KyAjB8xX6I01D1HiXo+k515liWpDVfG2XqYZpwI7UNo5uS +Um9poIyNStDuiw7LR47QjRE= +-----END CERTIFICATE----- +# CN=CommScope Public Trust ECC Root-02,O=CommScope,C=US +# 2ffb7f813bbbb3c89ab4e8162d0f16d71509a830cc9d73c262e5140875d1ad4a +-----BEGIN CERTIFICATE----- +MIICHDCCAaOgAwIBAgIUKP2ZYEFHpgE6yhR7H+/5aAiDXX0wCgYIKoZIzj0EAwMw +TjELMAkGA1UEBhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29t +bVNjb3BlIFB1YmxpYyBUcnVzdCBFQ0MgUm9vdC0wMjAeFw0yMTA0MjgxNzQ0NTRa +Fw00NjA0MjgxNzQ0NTNaME4xCzAJBgNVBAYTAlVTMRIwEAYDVQQKDAlDb21tU2Nv +cGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1c3QgRUNDIFJvb3QtMDIw +djAQBgcqhkjOPQIBBgUrgQQAIgNiAAR4MIHoYx7l63FRD/cHB8o5mXxO1Q/MMDAL +j2aTPs+9xYa9+bG3tD60B8jzljHz7aRP+KNOjSkVWLjVb3/ubCK1sK9IRQq9qEmU +v4RDsNuESgMjGWdqb8FuvAY5N9GIIvejQjBAMA8GA1UdEwEB/wQFMAMBAf8wDgYD +VR0PAQH/BAQDAgEGMB0GA1UdDgQWBBTmGHX/72DehKT1RsfeSlXjMjZ59TAKBggq +hkjOPQQDAwNnADBkAjAmc0l6tqvmSfR9Uj/UQQSugEODZXW5hYA4O9Zv5JOGq4/n +ich/m35rChJVYaoR4HkCMHfoMXGsPHED1oQmHhS48zs73u1Z/GtMMH9ZzkXpc2AV +mkzw5l4lIhVtwodZ0LKOag== +-----END CERTIFICATE----- +# CN=CommScope Public Trust RSA Root-01,O=CommScope,C=US +# 02bdf96e2a45dd9bf18fc7e1dbdf21a0379ba3c9c2610344cfd8d606fec1ed81 +-----BEGIN CERTIFICATE----- +MIIFbDCCA1SgAwIBAgIUPgNJgXUWdDGOTKvVxZAplsU5EN0wDQYJKoZIhvcNAQEL +BQAwTjELMAkGA1UEBhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwi +Q29tbVNjb3BlIFB1YmxpYyBUcnVzdCBSU0EgUm9vdC0wMTAeFw0yMTA0MjgxNjQ1 +NTRaFw00NjA0MjgxNjQ1NTNaME4xCzAJBgNVBAYTAlVTMRIwEAYDVQQKDAlDb21t +U2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1c3QgUlNBIFJvb3Qt +MDEwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCwSGWjDR1C45FtnYSk +YZYSwu3D2iM0GXb26v1VWvZVAVMP8syMl0+5UMuzAURWlv2bKOx7dAvnQmtVzslh +suitQDy6uUEKBU8bJoWPQ7VAtYXR1HHcg0Hz9kXHgKKEUJdGzqAMxGBWBB0HW0al +DrJLpA6lfO741GIDuZNqihS4cPgugkY4Iw50x2tBt9Apo52AsH53k2NC+zSDO3Oj +WiE260f6GBfZumbCk6SP/F2krfxQapWsvCQz0b2If4b19bJzKo98rwjyGpg/qYFl +P8GMicWWMJoKz/TUyDTtnS+8jTiGU+6Xn6myY5QXjQ/cZip8UlF1y5mO6D1cv547 +KI2DAg+pn3LiLCuz3GaXAEDQpFSOm117RTYm1nJD68/A6g3czhLmfTifBSeolz7p +UcZsBSjBAg/pGG3svZwG1KdJ9FQFa2ww8esD1eo9anbCyxooSU1/ZOD6K9pzg4H/ +kQO9lLvkuI6cMmPNn7togbGEW682v3fuHX/3SZtS7NJ3Wn2RnU3COS3kuoL4b/JO +Hg9O5j9ZpSPcPYeoKFgo0fEbNttPxP/hjFtyjMcmAyejOQoBqsCyMWCDIqFPEgkB +Ea801M/XrmLTBQe0MXXgDW1XT2mH+VepuhX2yFJtocucH+X8eKg1mp9BFM6ltM6U +CBwJrVbl2rZJmkrqYxhTnCwuwwIDAQABo0IwQDAPBgNVHRMBAf8EBTADAQH/MA4G +A1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUN12mmnQywsL5x6YVEFm45P3luG0wDQYJ +KoZIhvcNAQELBQADggIBAK+nz97/4L1CjU3lIpbfaOp9TSp90K09FlxD533Ahuh6 +NWPxzIHIxgvoLlI1pKZJkGNRrDSsBTtXAOnTYtPZKdVUvhwQkZyybf5Z/Xn36lbQ +nmhUQo8mUuJM3y+Xpi/SB5io82BdS5pYV4jvguX6r2yBS5KPQJqTRlnLX3gWsWc+ +QgvfKNmwrZggvkN80V4aCRckjXtdlemrwWCrWxhkgPut4AZ9HcpZuPN4KWfGVh2v +trV0KnahP/t1MJ+UXjulYPPLXAziDslg+MkfFoom3ecnf+slpoq9uC02EJqxWE2a +aE9gVOX2RhOOiKy8IUISrcZKiX2bwdgt6ZYD9KJ0DLwAHb/WNyVntHKLr4W96ioD +j8z7PEQkguIBpQtZtjSNMgsSDesnwv1B10A8ckYpwIzqug/xBpMu95yo9GA+o/E4 +Xo4TwbM6l4c/ksp4qRyv0LAbJh6+cOx69TOY6lz/KwsETkPdY34Op054A5U+1C0w +lREQKC6/oAI+/15Z0wUOlV9TRe9rh9VIzRamloPh37MG88EU26fsHItdkJANclHn +YfkUyq+Dj7+vsQpZXdxc1+SWrVtgHdqul7I52Qb1dgAT+GhMIbA1xNxVssnBQVoc +icCMb3SgazNNtQEo/a2tiRc7ppqEvOuM6sRxJKi6KfkIsidWNTJf6jn7MZrVGczw +-----END CERTIFICATE----- +# CN=CommScope Public Trust RSA Root-02,O=CommScope,C=US +# ffe943d793424b4f7c440c1c3d648d5363f34b82dc87aa7a9f118fc5dee101f1 +-----BEGIN CERTIFICATE----- +MIIFbDCCA1SgAwIBAgIUVBa/O345lXGN0aoApYYNK496BU4wDQYJKoZIhvcNAQEL +BQAwTjELMAkGA1UEBhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwi +Q29tbVNjb3BlIFB1YmxpYyBUcnVzdCBSU0EgUm9vdC0wMjAeFw0yMTA0MjgxNzE2 +NDNaFw00NjA0MjgxNzE2NDJaME4xCzAJBgNVBAYTAlVTMRIwEAYDVQQKDAlDb21t +U2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1c3QgUlNBIFJvb3Qt +MDIwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDh+g77aAASyE3VrCLE +NQE7xVTlWXZjpX/rwcRqmL0yjReA61260WI9JSMZNRTpf4mnG2I81lDnNJUDMrG0 +kyI9p+Kx7eZ7Ti6Hmw0zdQreqjXnfuU2mKKuJZ6VszKWpCtYHu8//mI0SFHRtI1C +rWDaSWqVcN3SAOLMV2MCe5bdSZdbkk6V0/nLKR8YSvgBKtJjCW4k6YnS5cciTNxz +hkcAqg2Ijq6FfUrpuzNPDlJwnZXjfG2WWy09X6GDRl224yW4fKcZgBzqZUPckXk2 +LHR88mcGyYnJ27/aaL8j7dxrrSiDeS/sOKUNNwFnJ5rpM9kzXzehxfCrPfp4sOcs +n/Y+n2Dg70jpkEUeBVF4GiwSLFworA2iI540jwXmojPOEXcT1A6kHkIfhs1w/tku +FT0du7jyU1fbzMZ0KZwYszZ1OC4PVKH4kh+Jlk+71O6d6Ts2QrUKOyrUZHk2EOH5 +kQMreyBUzQ0ZGshBMjTRsJnhkB4BQDa1t/qp5Xd1pCKBXbCL5CcSD1SIxtuFdOa3 +wNemKfrb3vOTlycEVS8KbzfFPROvCgCpLIscgSjX74Yxqa7ybrjKaixUR9gqiC6v +wQcQeKwRoi9C8DfF8rhW3Q5iLc4tVn5V8qdE9isy9COoR+jUKgF4z2rDN6ieZdIs +5fq6M8EGRPbmz6UNp2YINIos8wIDAQABo0IwQDAPBgNVHRMBAf8EBTADAQH/MA4G +A1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUR9DnsSL/nSz12Vdgs7GxcJXvYXowDQYJ +KoZIhvcNAQELBQADggIBAIZpsU0v6Z9PIpNojuQhmaPORVMbc0RTAIFhzTHjCLqB +KCh6krm2qMhDnscTJk3C2OVVnJJdUNjCK9v+5qiXz1I6JMNlZFxHMaNlNRPDk7n3 ++VGXu6TwYofF1gbTl4MgqX67tiHCpQ2EAOHyJxCDut0DgdXdaMNmEMjRdrSzbyme +APnCKfWxkxlSaRosTKCL4BWaMS/TiJVZbuXEs1DIFAhKm4sTg7GkcrI7djNB3Nyq +pgdvHSQSn8h2vS/ZjvQs7rfSOBAkNlEv41xdgSGn2rtO/+YHqP65DSdsu3BaVXoT +6fEqSWnHX4dXTEN5bTpl6TBcQe7rd6VzEojov32u5cSoHw2OHG1QAk8mGEPej1WF +sQs3BWDJVTkSBKEqz3EWnzZRSb9wO55nnPt7eck5HHisd5FUmrh1CoFSl+NmYWvt +PjgelmFV4ZFUjO2MJB+ByRCac5krFk5yAD9UG/iNuovnFNa2RU9g7Jauwy8CTl2d +lklyALKrdVwPaFsdZcJfMw8eD/A7hvWwTruc9+olBdytoptLFwG+Qt81IR2tq670 +v64fG9PiO/yzcnMcmyiQiRM9HcEARwmWmjgb3bHPDcK0RPOWlc4yOo80nOAXx17O +rg3bhzjlP1v9mxnhMUF6cKojawHhRUzNlM47ni3niAIi9G7oyOzWPPO5std3eqx7 +-----END CERTIFICATE----- # CN=D-TRUST BR Root CA 1 2020,O=D-Trust GmbH,C=DE # e59aaa816009c22bff5b25bad37df306f049797c1f81d85ab089e657bd8f0044 -----BEGIN CERTIFICATE----- @@ -3167,6 +3228,58 @@ t88NkvuOGKmYSdGe/mBEciG5Ge3C9THxOUiIkCR1VBatzvT4aRRkOfujuLpwQMcn HL/EVlP6Y2XQ8xwOFvVrhlhNGNTkDY6lnVuR3HYkUD/GKvvZt5y11ubQ2egZixVx SK236thZiNSQvxaz2emsWWFUyBy6ysHK4bkgTI86k4mloMy/0/Z1pHWWbVY= -----END CERTIFICATE----- +# CN=TrustAsia Global Root CA G3,O=TrustAsia Technologies\, Inc.,C=CN +# e0d3226aeb1163c2e48ff9be3b50b4c6431be7bb1eacc5c36b5d5ec509039a08 +-----BEGIN CERTIFICATE----- +MIIFpTCCA42gAwIBAgIUZPYOZXdhaqs7tOqFhLuxibhxkw8wDQYJKoZIhvcNAQEM +BQAwWjELMAkGA1UEBhMCQ04xJTAjBgNVBAoMHFRydXN0QXNpYSBUZWNobm9sb2dp +ZXMsIEluYy4xJDAiBgNVBAMMG1RydXN0QXNpYSBHbG9iYWwgUm9vdCBDQSBHMzAe +Fw0yMTA1MjAwMjEwMTlaFw00NjA1MTkwMjEwMTlaMFoxCzAJBgNVBAYTAkNOMSUw +IwYDVQQKDBxUcnVzdEFzaWEgVGVjaG5vbG9naWVzLCBJbmMuMSQwIgYDVQQDDBtU +cnVzdEFzaWEgR2xvYmFsIFJvb3QgQ0EgRzMwggIiMA0GCSqGSIb3DQEBAQUAA4IC +DwAwggIKAoICAQDAMYJhkuSUGwoqZdC+BqmHO1ES6nBBruL7dOoKjbmzTNyPtxNS +T1QY4SxzlZHFZjtqz6xjbYdT8PfxObegQ2OwxANdV6nnRM7EoYNl9lA+sX4WuDqK +AtCWHwDNBSHvBm3dIZwZQ0WhxeiAysKtQGIXBsaqvPPW5vxQfmZCHzyLpnl5hkA1 +nyDvP+uLRx+PjsXUjrYsyUQE49RDdT/VP68czH5GX6zfZBCK70bwkPAPLfSIC7Ep +qq+FqklYqL9joDiR5rPmd2jE+SoZhLsO4fWvieylL1AgdB4SQXMeJNnKziyhWTXA +yB1GJ2Faj/lN03J5Zh6fFZAhLf3ti1ZwA0pJPn9pMRJpxx5cynoTi+jm9WAPzJMs +hH/x/Gr8m0ed262IPfN2dTPXS6TIi/n1Q1hPy8gDVI+lhXgEGvNz8teHHUGf59gX +zhqcD0r83ERoVGjiQTz+LISGNzzNPy+i2+f3VANfWdP3kXjHi3dqFuVJhZBFcnAv +kV34PmVACxmZySYgWmjBNb9Pp1Hx2BErW+Canig7CjoKH8GB5S7wprlppYiU5msT +f9FkPz2ccEblooV7WIQn3MSAPmeamseaMQ4w7OYXQJXZRe0Blqq/DPNL0WP3E1jA +uPP6Z92bfW1K/zJMtSU7/xxnD4UiWQWRkUF3gdCFTIcQcf+eQxuulXUtgQIDAQAB +o2MwYTAPBgNVHRMBAf8EBTADAQH/MB8GA1UdIwQYMBaAFEDk5PIj7zjKsK5Xf/Ih +MBY027ySMB0GA1UdDgQWBBRA5OTyI+84yrCuV3/yITAWNNu8kjAOBgNVHQ8BAf8E +BAMCAQYwDQYJKoZIhvcNAQEMBQADggIBACY7UeFNOPMyGLS0XuFlXsSUT9SnYaP4 +wM8zAQLpw6o1D/GUE3d3NZ4tVlFEbuHGLige/9rsR82XRBf34EzC4Xx8MnpmyFq2 +XFNFV1pF1AWZLy4jVe5jaN/TG3inEpQGAHUNcoTpLrxaatXeL1nHo+zSh2bbt1S1 +JKv0Q3jbSwTEb93mPmY+KfJLaHEih6D4sTNjduMNhXJEIlU/HHzp/LgV6FL6qj6j +ITk1dImmasI5+njPtqzn59ZW/yOSLlALqbUHM/Q4X6RJpstlcHboCoWASzY9M/eV +VHUl2qzEc4Jl6VL1XP04lQJqaTDFHApXB64ipCz5xUG3uOyfT0gA+QEEVcys+TIx +xHWVBqB/0Y0n3bOppHKH/lmLmnp0Ft0WpWIp6zqW3IunaFnT63eROfjXy9mPX1on +AX1daBli2MjN9LdyR75bl87yraKZk62Uy5P2EgmVtqvXO9A/EcswFi55gORngS1d +7XB4tmBZrOFdRWOPyN9yaFvqHbgB8X7754qz41SgOAngPN5C8sLtLpvzHzW2Ntjj +gKGLzZlkD8Kqq7HK9W+eQ42EVJmzbsASZthwEPEGNTNDqJwuuhQxzhB/HIbjj9LV ++Hfsm6vxL2PZQl/gZ4FkkfGXL/xuJvYz+NO1+MRiqzFRJQJ6+N1rZdVtTTDIZbpo +FGWsJwt0ivKH +-----END CERTIFICATE----- +# CN=TrustAsia Global Root CA G4,O=TrustAsia Technologies\, Inc.,C=CN +# be4b56cb5056c0136a526df444508daa36a0b54f42e4ac38f72af470e479654c +-----BEGIN CERTIFICATE----- +MIICVTCCAdygAwIBAgIUTyNkuI6XY57GU4HBdk7LKnQV1tcwCgYIKoZIzj0EAwMw +WjELMAkGA1UEBhMCQ04xJTAjBgNVBAoMHFRydXN0QXNpYSBUZWNobm9sb2dpZXMs +IEluYy4xJDAiBgNVBAMMG1RydXN0QXNpYSBHbG9iYWwgUm9vdCBDQSBHNDAeFw0y +MTA1MjAwMjEwMjJaFw00NjA1MTkwMjEwMjJaMFoxCzAJBgNVBAYTAkNOMSUwIwYD +VQQKDBxUcnVzdEFzaWEgVGVjaG5vbG9naWVzLCBJbmMuMSQwIgYDVQQDDBtUcnVz +dEFzaWEgR2xvYmFsIFJvb3QgQ0EgRzQwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAATx +s8045CVD5d4ZCbuBeaIVXxVjAd7Cq92zphtnS4CDr5nLrBfbK5bKfFJV4hrhPVbw +LxYI+hW8m7tH5j/uqOFMjPXTNvk4XatwmkcN4oFBButJ+bAp3TPsUKV/eSm4IJij +YzBhMA8GA1UdEwEB/wQFMAMBAf8wHwYDVR0jBBgwFoAUpbtKl86zK3+kMd6Xg1mD +pm9xy94wHQYDVR0OBBYEFKW7SpfOsyt/pDHel4NZg6ZvccveMA4GA1UdDwEB/wQE +AwIBBjAKBggqhkjOPQQDAwNnADBkAjBe8usGzEkxn0AAbbd+NvBNEU/zy4k6LHiR +UKNbwMp1JvK/kF0LgoxgKJ/GcJpo5PECMFxYDlZ2z1jD1xCMuo6u47xkdUfFVZDj +/bpV6wfEU6s3qe4hsiFbYI89MvHVI5TWWA== +-----END CERTIFICATE----- # CN=Trustwave Global Certification Authority,O=Trustwave Holdings\, Inc.,L=Chicago,ST=Illinois,C=US # 97552015f5ddfc3c8788c006944555408894450084f100867086bc1a2bb58dc8 -----BEGIN CERTIFICATE----- From 4e5a26183ecb4f9a0f85c8f8dbe7982885435436 Mon Sep 17 00:00:00 2001 From: Edoardo Spadolini Date: Tue, 12 Dec 2023 13:04:53 +0000 Subject: [PATCH 24/95] ssh: close net.Conn on all NewServerConn errors This PR ensures that the net.Conn passed to ssh.NewServerConn is closed on all error return paths, not just after a failed handshake. This matches the behavior of ssh.NewClientConn. Change-Id: Id8a51d10ae8d575cbbe26f2ef6b37de7cca840ec GitHub-Last-Rev: 81bb2e58a881a9a85935740bda06b034b32a8ce3 GitHub-Pull-Request: golang/crypto#279 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/549095 Run-TryBot: Nicola Murino Auto-Submit: Nicola Murino Reviewed-by: Roland Shoemaker Reviewed-by: Nicola Murino LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Pratt TryBot-Result: Gopher Robot --- ssh/server.go | 2 ++ ssh/server_test.go | 73 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/ssh/server.go b/ssh/server.go index 7f0c236a9a..c2dfe3268c 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -213,6 +213,7 @@ func NewServerConn(c net.Conn, config *ServerConfig) (*ServerConn, <-chan NewCha } else { for _, algo := range fullConf.PublicKeyAuthAlgorithms { if !contains(supportedPubKeyAuthAlgos, algo) { + c.Close() return nil, nil, nil, fmt.Errorf("ssh: unsupported public key authentication algorithm %s", algo) } } @@ -220,6 +221,7 @@ func NewServerConn(c net.Conn, config *ServerConfig) (*ServerConn, <-chan NewCha // Check if the config contains any unsupported key exchanges for _, kex := range fullConf.KeyExchanges { if _, ok := serverForbiddenKexAlgos[kex]; ok { + c.Close() return nil, nil, nil, fmt.Errorf("ssh: unsupported key exchange %s for server", kex) } } diff --git a/ssh/server_test.go b/ssh/server_test.go index 2145dce06f..a9b2bce0c1 100644 --- a/ssh/server_test.go +++ b/ssh/server_test.go @@ -5,7 +5,11 @@ package ssh import ( + "io" + "net" + "sync/atomic" "testing" + "time" ) func TestClientAuthRestrictedPublicKeyAlgos(t *testing.T) { @@ -59,27 +63,78 @@ func TestClientAuthRestrictedPublicKeyAlgos(t *testing.T) { } func TestNewServerConnValidationErrors(t *testing.T) { - c1, c2, err := netPipe() - if err != nil { - t.Fatalf("netPipe: %v", err) - } - defer c1.Close() - defer c2.Close() - serverConf := &ServerConfig{ PublicKeyAuthAlgorithms: []string{CertAlgoRSAv01}, } - _, _, _, err = NewServerConn(c1, serverConf) + c := &markerConn{} + _, _, _, err := NewServerConn(c, serverConf) if err == nil { t.Fatal("NewServerConn with invalid public key auth algorithms succeeded") } + if !c.isClosed() { + t.Fatal("NewServerConn with invalid public key auth algorithms left connection open") + } + if c.isUsed() { + t.Fatal("NewServerConn with invalid public key auth algorithms used connection") + } + serverConf = &ServerConfig{ Config: Config{ KeyExchanges: []string{kexAlgoDHGEXSHA256}, }, } - _, _, _, err = NewServerConn(c1, serverConf) + c = &markerConn{} + _, _, _, err = NewServerConn(c, serverConf) if err == nil { t.Fatal("NewServerConn with unsupported key exchange succeeded") } + if !c.isClosed() { + t.Fatal("NewServerConn with unsupported key exchange left connection open") + } + if c.isUsed() { + t.Fatal("NewServerConn with unsupported key exchange used connection") + } +} + +type markerConn struct { + closed uint32 + used uint32 } + +func (c *markerConn) isClosed() bool { + return atomic.LoadUint32(&c.closed) != 0 +} + +func (c *markerConn) isUsed() bool { + return atomic.LoadUint32(&c.used) != 0 +} + +func (c *markerConn) Close() error { + atomic.StoreUint32(&c.closed, 1) + return nil +} + +func (c *markerConn) Read(b []byte) (n int, err error) { + atomic.StoreUint32(&c.used, 1) + if atomic.LoadUint32(&c.closed) != 0 { + return 0, net.ErrClosed + } else { + return 0, io.EOF + } +} + +func (c *markerConn) Write(b []byte) (n int, err error) { + atomic.StoreUint32(&c.used, 1) + if atomic.LoadUint32(&c.closed) != 0 { + return 0, net.ErrClosed + } else { + return 0, io.ErrClosedPipe + } +} + +func (*markerConn) LocalAddr() net.Addr { return nil } +func (*markerConn) RemoteAddr() net.Addr { return nil } + +func (*markerConn) SetDeadline(t time.Time) error { return nil } +func (*markerConn) SetReadDeadline(t time.Time) error { return nil } +func (*markerConn) SetWriteDeadline(t time.Time) error { return nil } From 9d2ee975ef9fe627bf0a6f01c1f69e8ef1d4f05d Mon Sep 17 00:00:00 2001 From: Roland Shoemaker Date: Mon, 20 Nov 2023 12:06:18 -0800 Subject: [PATCH 25/95] ssh: implement strict KEX protocol changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the "strict KEX" protocol changes, as described in section 1.9 of the OpenSSH PROTOCOL file (as of OpenSSH version 9.6/9.6p1). Namely this makes the following changes: * Both the server and the client add an additional algorithm to the initial KEXINIT message, indicating support for the strict KEX mode. * When one side of the connection sees the strict KEX extension algorithm, the strict KEX mode is enabled for messages originating from the other side of the connection. If the sequence number for the side which requested the extension is not 1 (indicating that it has already received non-KEXINIT packets), the connection is terminated. * When strict kex mode is enabled, unexpected messages during the handshake are considered fatal. Additionally when a key change occurs (on the receipt of the NEWKEYS message) the message sequence numbers are reset. Thanks to Fabian Bäumer, Marcus Brinkmann, and Jörg Schwenk from Ruhr University Bochum for reporting this issue. Fixes CVE-2023-48795 Fixes golang/go#64784 Change-Id: I96b53afd2bd2fb94d2b6f2a46a5dacf325357604 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/550715 Reviewed-by: Nicola Murino Reviewed-by: Tatiana Bradley TryBot-Result: Gopher Robot Run-TryBot: Roland Shoemaker Reviewed-by: Damien Neil LUCI-TryBot-Result: Go LUCI --- ssh/handshake.go | 56 +++++++- ssh/handshake_test.go | 309 ++++++++++++++++++++++++++++++++++++++++++ ssh/transport.go | 32 ++++- 3 files changed, 388 insertions(+), 9 deletions(-) diff --git a/ssh/handshake.go b/ssh/handshake.go index 49bbba7692..56cdc7c21c 100644 --- a/ssh/handshake.go +++ b/ssh/handshake.go @@ -35,6 +35,16 @@ type keyingTransport interface { // direction will be effected if a msgNewKeys message is sent // or received. prepareKeyChange(*algorithms, *kexResult) error + + // setStrictMode sets the strict KEX mode, notably triggering + // sequence number resets on sending or receiving msgNewKeys. + // If the sequence number is already > 1 when setStrictMode + // is called, an error is returned. + setStrictMode() error + + // setInitialKEXDone indicates to the transport that the initial key exchange + // was completed + setInitialKEXDone() } // handshakeTransport implements rekeying on top of a keyingTransport @@ -100,6 +110,10 @@ type handshakeTransport struct { // The session ID or nil if first kex did not complete yet. sessionID []byte + + // strictMode indicates if the other side of the handshake indicated + // that we should be following the strict KEX protocol restrictions. + strictMode bool } type pendingKex struct { @@ -209,7 +223,10 @@ func (t *handshakeTransport) readLoop() { close(t.incoming) break } - if p[0] == msgIgnore || p[0] == msgDebug { + // If this is the first kex, and strict KEX mode is enabled, + // we don't ignore any messages, as they may be used to manipulate + // the packet sequence numbers. + if !(t.sessionID == nil && t.strictMode) && (p[0] == msgIgnore || p[0] == msgDebug) { continue } t.incoming <- p @@ -441,6 +458,11 @@ func (t *handshakeTransport) readOnePacket(first bool) ([]byte, error) { return successPacket, nil } +const ( + kexStrictClient = "kex-strict-c-v00@openssh.com" + kexStrictServer = "kex-strict-s-v00@openssh.com" +) + // sendKexInit sends a key change message. func (t *handshakeTransport) sendKexInit() error { t.mu.Lock() @@ -454,7 +476,6 @@ func (t *handshakeTransport) sendKexInit() error { } msg := &kexInitMsg{ - KexAlgos: t.config.KeyExchanges, CiphersClientServer: t.config.Ciphers, CiphersServerClient: t.config.Ciphers, MACsClientServer: t.config.MACs, @@ -464,6 +485,13 @@ func (t *handshakeTransport) sendKexInit() error { } io.ReadFull(rand.Reader, msg.Cookie[:]) + // We mutate the KexAlgos slice, in order to add the kex-strict extension algorithm, + // and possibly to add the ext-info extension algorithm. Since the slice may be the + // user owned KeyExchanges, we create our own slice in order to avoid using user + // owned memory by mistake. + msg.KexAlgos = make([]string, 0, len(t.config.KeyExchanges)+2) // room for kex-strict and ext-info + msg.KexAlgos = append(msg.KexAlgos, t.config.KeyExchanges...) + isServer := len(t.hostKeys) > 0 if isServer { for _, k := range t.hostKeys { @@ -488,17 +516,24 @@ func (t *handshakeTransport) sendKexInit() error { msg.ServerHostKeyAlgos = append(msg.ServerHostKeyAlgos, keyFormat) } } + + if t.sessionID == nil { + msg.KexAlgos = append(msg.KexAlgos, kexStrictServer) + } } else { msg.ServerHostKeyAlgos = t.hostKeyAlgorithms // As a client we opt in to receiving SSH_MSG_EXT_INFO so we know what // algorithms the server supports for public key authentication. See RFC // 8308, Section 2.1. + // + // We also send the strict KEX mode extension algorithm, in order to opt + // into the strict KEX mode. if firstKeyExchange := t.sessionID == nil; firstKeyExchange { - msg.KexAlgos = make([]string, 0, len(t.config.KeyExchanges)+1) - msg.KexAlgos = append(msg.KexAlgos, t.config.KeyExchanges...) msg.KexAlgos = append(msg.KexAlgos, "ext-info-c") + msg.KexAlgos = append(msg.KexAlgos, kexStrictClient) } + } packet := Marshal(msg) @@ -604,6 +639,13 @@ func (t *handshakeTransport) enterKeyExchange(otherInitPacket []byte) error { return err } + if t.sessionID == nil && ((isClient && contains(serverInit.KexAlgos, kexStrictServer)) || (!isClient && contains(clientInit.KexAlgos, kexStrictClient))) { + t.strictMode = true + if err := t.conn.setStrictMode(); err != nil { + return err + } + } + // We don't send FirstKexFollows, but we handle receiving it. // // RFC 4253 section 7 defines the kex and the agreement method for @@ -679,6 +721,12 @@ func (t *handshakeTransport) enterKeyExchange(otherInitPacket []byte) error { return unexpectedMessageError(msgNewKeys, packet[0]) } + if firstKeyExchange { + // Indicates to the transport that the first key exchange is completed + // after receiving SSH_MSG_NEWKEYS. + t.conn.setInitialKEXDone() + } + return nil } diff --git a/ssh/handshake_test.go b/ssh/handshake_test.go index 65afc2059e..2bc607b649 100644 --- a/ssh/handshake_test.go +++ b/ssh/handshake_test.go @@ -395,6 +395,10 @@ func (n *errorKeyingTransport) readPacket() ([]byte, error) { return n.packetConn.readPacket() } +func (n *errorKeyingTransport) setStrictMode() error { return nil } + +func (n *errorKeyingTransport) setInitialKEXDone() {} + func TestHandshakeErrorHandlingRead(t *testing.T) { for i := 0; i < 20; i++ { testHandshakeErrorHandlingN(t, i, -1, false) @@ -710,3 +714,308 @@ func TestPickIncompatibleHostKeyAlgo(t *testing.T) { t.Fatal("incompatible signer returned") } } + +func TestStrictKEXResetSeqFirstKEX(t *testing.T) { + if runtime.GOOS == "plan9" { + t.Skip("see golang.org/issue/7237") + } + + checker := &syncChecker{ + waitCall: make(chan int, 10), + called: make(chan int, 10), + } + + checker.waitCall <- 1 + trC, trS, err := handshakePair(&ClientConfig{HostKeyCallback: checker.Check}, "addr", false) + if err != nil { + t.Fatalf("handshakePair: %v", err) + } + <-checker.called + + t.Cleanup(func() { + trC.Close() + trS.Close() + }) + + // Throw away the msgExtInfo packet sent during the handshake by the server + _, err = trC.readPacket() + if err != nil { + t.Fatalf("readPacket failed: %s", err) + } + + // close the handshake transports before checking the sequence number to + // avoid races. + trC.Close() + trS.Close() + + // check that the sequence number counters. We reset after msgNewKeys, but + // then the server immediately writes msgExtInfo, and we close the + // transports so we expect read 2, write 0 on the client and read 1, write 1 + // on the server. + if trC.conn.(*transport).reader.seqNum != 2 || trC.conn.(*transport).writer.seqNum != 0 || + trS.conn.(*transport).reader.seqNum != 1 || trS.conn.(*transport).writer.seqNum != 1 { + t.Errorf( + "unexpected sequence counters:\nclient: reader %d (expected 2), writer %d (expected 0)\nserver: reader %d (expected 1), writer %d (expected 1)", + trC.conn.(*transport).reader.seqNum, + trC.conn.(*transport).writer.seqNum, + trS.conn.(*transport).reader.seqNum, + trS.conn.(*transport).writer.seqNum, + ) + } +} + +func TestStrictKEXResetSeqSuccessiveKEX(t *testing.T) { + if runtime.GOOS == "plan9" { + t.Skip("see golang.org/issue/7237") + } + + checker := &syncChecker{ + waitCall: make(chan int, 10), + called: make(chan int, 10), + } + + checker.waitCall <- 1 + trC, trS, err := handshakePair(&ClientConfig{HostKeyCallback: checker.Check}, "addr", false) + if err != nil { + t.Fatalf("handshakePair: %v", err) + } + <-checker.called + + t.Cleanup(func() { + trC.Close() + trS.Close() + }) + + // Throw away the msgExtInfo packet sent during the handshake by the server + _, err = trC.readPacket() + if err != nil { + t.Fatalf("readPacket failed: %s", err) + } + + // write and read five packets on either side to bump the sequence numbers + for i := 0; i < 5; i++ { + if err := trC.writePacket([]byte{msgRequestSuccess}); err != nil { + t.Fatalf("writePacket failed: %s", err) + } + if _, err := trS.readPacket(); err != nil { + t.Fatalf("readPacket failed: %s", err) + } + if err := trS.writePacket([]byte{msgRequestSuccess}); err != nil { + t.Fatalf("writePacket failed: %s", err) + } + if _, err := trC.readPacket(); err != nil { + t.Fatalf("readPacket failed: %s", err) + } + } + + // Request a key exchange, which should cause the sequence numbers to reset + checker.waitCall <- 1 + trC.requestKeyExchange() + <-checker.called + + // write a packet on the client, and then read it, to verify the key change has actually happened, since + // the HostKeyCallback is called _during_ the handshake, so isn't actually indicative of the handshake + // finishing. + dummyPacket := []byte{99} + if err := trS.writePacket(dummyPacket); err != nil { + t.Fatalf("writePacket failed: %s", err) + } + if p, err := trC.readPacket(); err != nil { + t.Fatalf("readPacket failed: %s", err) + } else if !bytes.Equal(p, dummyPacket) { + t.Fatalf("unexpected packet: got %x, want %x", p, dummyPacket) + } + + // close the handshake transports before checking the sequence number to + // avoid races. + trC.Close() + trS.Close() + + if trC.conn.(*transport).reader.seqNum != 2 || trC.conn.(*transport).writer.seqNum != 0 || + trS.conn.(*transport).reader.seqNum != 1 || trS.conn.(*transport).writer.seqNum != 1 { + t.Errorf( + "unexpected sequence counters:\nclient: reader %d (expected 2), writer %d (expected 0)\nserver: reader %d (expected 1), writer %d (expected 1)", + trC.conn.(*transport).reader.seqNum, + trC.conn.(*transport).writer.seqNum, + trS.conn.(*transport).reader.seqNum, + trS.conn.(*transport).writer.seqNum, + ) + } +} + +func TestSeqNumIncrease(t *testing.T) { + if runtime.GOOS == "plan9" { + t.Skip("see golang.org/issue/7237") + } + + checker := &syncChecker{ + waitCall: make(chan int, 10), + called: make(chan int, 10), + } + + checker.waitCall <- 1 + trC, trS, err := handshakePair(&ClientConfig{HostKeyCallback: checker.Check}, "addr", false) + if err != nil { + t.Fatalf("handshakePair: %v", err) + } + <-checker.called + + t.Cleanup(func() { + trC.Close() + trS.Close() + }) + + // Throw away the msgExtInfo packet sent during the handshake by the server + _, err = trC.readPacket() + if err != nil { + t.Fatalf("readPacket failed: %s", err) + } + + // write and read five packets on either side to bump the sequence numbers + for i := 0; i < 5; i++ { + if err := trC.writePacket([]byte{msgRequestSuccess}); err != nil { + t.Fatalf("writePacket failed: %s", err) + } + if _, err := trS.readPacket(); err != nil { + t.Fatalf("readPacket failed: %s", err) + } + if err := trS.writePacket([]byte{msgRequestSuccess}); err != nil { + t.Fatalf("writePacket failed: %s", err) + } + if _, err := trC.readPacket(); err != nil { + t.Fatalf("readPacket failed: %s", err) + } + } + + // close the handshake transports before checking the sequence number to + // avoid races. + trC.Close() + trS.Close() + + if trC.conn.(*transport).reader.seqNum != 7 || trC.conn.(*transport).writer.seqNum != 5 || + trS.conn.(*transport).reader.seqNum != 6 || trS.conn.(*transport).writer.seqNum != 6 { + t.Errorf( + "unexpected sequence counters:\nclient: reader %d (expected 7), writer %d (expected 5)\nserver: reader %d (expected 6), writer %d (expected 6)", + trC.conn.(*transport).reader.seqNum, + trC.conn.(*transport).writer.seqNum, + trS.conn.(*transport).reader.seqNum, + trS.conn.(*transport).writer.seqNum, + ) + } +} + +func TestStrictKEXUnexpectedMsg(t *testing.T) { + if runtime.GOOS == "plan9" { + t.Skip("see golang.org/issue/7237") + } + + // Check that unexpected messages during the handshake cause failure + _, _, err := handshakePair(&ClientConfig{HostKeyCallback: func(hostname string, remote net.Addr, key PublicKey) error { return nil }}, "addr", true) + if err == nil { + t.Fatal("handshake should fail when there are unexpected messages during the handshake") + } + + trC, trS, err := handshakePair(&ClientConfig{HostKeyCallback: func(hostname string, remote net.Addr, key PublicKey) error { return nil }}, "addr", false) + if err != nil { + t.Fatalf("handshake failed: %s", err) + } + + // Check that ignore/debug pacekts are still ignored outside of the handshake + if err := trC.writePacket([]byte{msgIgnore}); err != nil { + t.Fatalf("writePacket failed: %s", err) + } + if err := trC.writePacket([]byte{msgDebug}); err != nil { + t.Fatalf("writePacket failed: %s", err) + } + dummyPacket := []byte{99} + if err := trC.writePacket(dummyPacket); err != nil { + t.Fatalf("writePacket failed: %s", err) + } + + if p, err := trS.readPacket(); err != nil { + t.Fatalf("readPacket failed: %s", err) + } else if !bytes.Equal(p, dummyPacket) { + t.Fatalf("unexpected packet: got %x, want %x", p, dummyPacket) + } +} + +func TestStrictKEXMixed(t *testing.T) { + // Test that we still support a mixed connection, where one side sends kex-strict but the other + // side doesn't. + + a, b, err := netPipe() + if err != nil { + t.Fatalf("netPipe failed: %s", err) + } + + var trC, trS keyingTransport + + trC = newTransport(a, rand.Reader, true) + trS = newTransport(b, rand.Reader, false) + trS = addNoiseTransport(trS) + + clientConf := &ClientConfig{HostKeyCallback: func(hostname string, remote net.Addr, key PublicKey) error { return nil }} + clientConf.SetDefaults() + + v := []byte("version") + client := newClientTransport(trC, v, v, clientConf, "addr", a.RemoteAddr()) + + serverConf := &ServerConfig{} + serverConf.AddHostKey(testSigners["ecdsa"]) + serverConf.AddHostKey(testSigners["rsa"]) + serverConf.SetDefaults() + + transport := newHandshakeTransport(trS, &serverConf.Config, []byte("version"), []byte("version")) + transport.hostKeys = serverConf.hostKeys + transport.publicKeyAuthAlgorithms = serverConf.PublicKeyAuthAlgorithms + + readOneFailure := make(chan error, 1) + go func() { + if _, err := transport.readOnePacket(true); err != nil { + readOneFailure <- err + } + }() + + // Basically sendKexInit, but without the kex-strict extension algorithm + msg := &kexInitMsg{ + KexAlgos: transport.config.KeyExchanges, + CiphersClientServer: transport.config.Ciphers, + CiphersServerClient: transport.config.Ciphers, + MACsClientServer: transport.config.MACs, + MACsServerClient: transport.config.MACs, + CompressionClientServer: supportedCompressions, + CompressionServerClient: supportedCompressions, + ServerHostKeyAlgos: []string{KeyAlgoRSASHA256, KeyAlgoRSASHA512, KeyAlgoRSA}, + } + packet := Marshal(msg) + // writePacket destroys the contents, so save a copy. + packetCopy := make([]byte, len(packet)) + copy(packetCopy, packet) + if err := transport.pushPacket(packetCopy); err != nil { + t.Fatalf("pushPacket: %s", err) + } + transport.sentInitMsg = msg + transport.sentInitPacket = packet + + if err := transport.getWriteError(); err != nil { + t.Fatalf("getWriteError failed: %s", err) + } + var request *pendingKex + select { + case err = <-readOneFailure: + t.Fatalf("server readOnePacket failed: %s", err) + case request = <-transport.startKex: + break + } + + // We expect the following calls to fail if the side which does not support + // kex-strict sends unexpected/ignored packets during the handshake, even if + // the other side does support kex-strict. + + if err := transport.enterKeyExchange(request.otherInit); err != nil { + t.Fatalf("enterKeyExchange failed: %s", err) + } + if err := client.waitSession(); err != nil { + t.Fatalf("client.waitSession: %v", err) + } +} diff --git a/ssh/transport.go b/ssh/transport.go index da015801ea..0424d2d37c 100644 --- a/ssh/transport.go +++ b/ssh/transport.go @@ -49,6 +49,9 @@ type transport struct { rand io.Reader isClient bool io.Closer + + strictMode bool + initialKEXDone bool } // packetCipher represents a combination of SSH encryption/MAC @@ -74,6 +77,18 @@ type connectionState struct { pendingKeyChange chan packetCipher } +func (t *transport) setStrictMode() error { + if t.reader.seqNum != 1 { + return errors.New("ssh: sequence number != 1 when strict KEX mode requested") + } + t.strictMode = true + return nil +} + +func (t *transport) setInitialKEXDone() { + t.initialKEXDone = true +} + // prepareKeyChange sets up key material for a keychange. The key changes in // both directions are triggered by reading and writing a msgNewKey packet // respectively. @@ -112,11 +127,12 @@ func (t *transport) printPacket(p []byte, write bool) { // Read and decrypt next packet. func (t *transport) readPacket() (p []byte, err error) { for { - p, err = t.reader.readPacket(t.bufReader) + p, err = t.reader.readPacket(t.bufReader, t.strictMode) if err != nil { break } - if len(p) == 0 || (p[0] != msgIgnore && p[0] != msgDebug) { + // in strict mode we pass through DEBUG and IGNORE packets only during the initial KEX + if len(p) == 0 || (t.strictMode && !t.initialKEXDone) || (p[0] != msgIgnore && p[0] != msgDebug) { break } } @@ -127,7 +143,7 @@ func (t *transport) readPacket() (p []byte, err error) { return p, err } -func (s *connectionState) readPacket(r *bufio.Reader) ([]byte, error) { +func (s *connectionState) readPacket(r *bufio.Reader, strictMode bool) ([]byte, error) { packet, err := s.packetCipher.readCipherPacket(s.seqNum, r) s.seqNum++ if err == nil && len(packet) == 0 { @@ -140,6 +156,9 @@ func (s *connectionState) readPacket(r *bufio.Reader) ([]byte, error) { select { case cipher := <-s.pendingKeyChange: s.packetCipher = cipher + if strictMode { + s.seqNum = 0 + } default: return nil, errors.New("ssh: got bogus newkeys message") } @@ -170,10 +189,10 @@ func (t *transport) writePacket(packet []byte) error { if debugTransport { t.printPacket(packet, true) } - return t.writer.writePacket(t.bufWriter, t.rand, packet) + return t.writer.writePacket(t.bufWriter, t.rand, packet, t.strictMode) } -func (s *connectionState) writePacket(w *bufio.Writer, rand io.Reader, packet []byte) error { +func (s *connectionState) writePacket(w *bufio.Writer, rand io.Reader, packet []byte, strictMode bool) error { changeKeys := len(packet) > 0 && packet[0] == msgNewKeys err := s.packetCipher.writeCipherPacket(s.seqNum, w, rand, packet) @@ -188,6 +207,9 @@ func (s *connectionState) writePacket(w *bufio.Writer, rand io.Reader, packet [] select { case cipher := <-s.pendingKeyChange: s.packetCipher = cipher + if strictMode { + s.seqNum = 0 + } default: panic("ssh: no key material for msgNewKeys") } From 08396bb92b82dea0967fda0bc947baa1ae721de4 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 27 Nov 2023 18:41:10 +0000 Subject: [PATCH 26/95] internal/poly1305: drop Go 1.12 compatibility Other packages already dropped compatibility with go1.12, so it should be safe to remove it for this package as well. Change-Id: Ieecc7cd06a0a4e69e8c1c09ef6fefe95d78ceb75 GitHub-Last-Rev: 1971e0309bdd31facbc60d70cd36a91f6a22f4b2 GitHub-Pull-Request: golang/crypto#239 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/448239 Reviewed-by: Joedian Reid Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda Reviewed-by: Roland Shoemaker Reviewed-by: Matthew Dempsky --- internal/poly1305/bits_compat.go | 39 ----------------------------- internal/poly1305/bits_go1.13.go | 21 ---------------- internal/poly1305/sum_generic.go | 43 +++++++++++++++++--------------- 3 files changed, 23 insertions(+), 80 deletions(-) delete mode 100644 internal/poly1305/bits_compat.go delete mode 100644 internal/poly1305/bits_go1.13.go diff --git a/internal/poly1305/bits_compat.go b/internal/poly1305/bits_compat.go deleted file mode 100644 index d33c8890fc..0000000000 --- a/internal/poly1305/bits_compat.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.13 - -package poly1305 - -// Generic fallbacks for the math/bits intrinsics, copied from -// src/math/bits/bits.go. They were added in Go 1.12, but Add64 and Sum64 had -// variable time fallbacks until Go 1.13. - -func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { - sum = x + y + carry - carryOut = ((x & y) | ((x | y) &^ sum)) >> 63 - return -} - -func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { - diff = x - y - borrow - borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> 63 - return -} - -func bitsMul64(x, y uint64) (hi, lo uint64) { - const mask32 = 1<<32 - 1 - x0 := x & mask32 - x1 := x >> 32 - y0 := y & mask32 - y1 := y >> 32 - w0 := x0 * y0 - t := x1*y0 + w0>>32 - w1 := t & mask32 - w2 := t >> 32 - w1 += x0 * y1 - hi = x1*y1 + w2 + w1>>32 - lo = x * y - return -} diff --git a/internal/poly1305/bits_go1.13.go b/internal/poly1305/bits_go1.13.go deleted file mode 100644 index 495c1fa697..0000000000 --- a/internal/poly1305/bits_go1.13.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.13 - -package poly1305 - -import "math/bits" - -func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { - return bits.Add64(x, y, carry) -} - -func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { - return bits.Sub64(x, y, borrow) -} - -func bitsMul64(x, y uint64) (hi, lo uint64) { - return bits.Mul64(x, y) -} diff --git a/internal/poly1305/sum_generic.go b/internal/poly1305/sum_generic.go index e041da5ea3..ec2202bd7d 100644 --- a/internal/poly1305/sum_generic.go +++ b/internal/poly1305/sum_generic.go @@ -7,7 +7,10 @@ package poly1305 -import "encoding/binary" +import ( + "encoding/binary" + "math/bits" +) // Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag // for a 64 bytes message is approximately @@ -114,13 +117,13 @@ type uint128 struct { } func mul64(a, b uint64) uint128 { - hi, lo := bitsMul64(a, b) + hi, lo := bits.Mul64(a, b) return uint128{lo, hi} } func add128(a, b uint128) uint128 { - lo, c := bitsAdd64(a.lo, b.lo, 0) - hi, c := bitsAdd64(a.hi, b.hi, c) + lo, c := bits.Add64(a.lo, b.lo, 0) + hi, c := bits.Add64(a.hi, b.hi, c) if c != 0 { panic("poly1305: unexpected overflow") } @@ -155,8 +158,8 @@ func updateGeneric(state *macState, msg []byte) { // hide leading zeroes. For full chunks, that's 1 << 128, so we can just // add 1 to the most significant (2¹²⁸) limb, h2. if len(msg) >= TagSize { - h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0) - h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c) + h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0) + h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(msg[8:16]), c) h2 += c + 1 msg = msg[TagSize:] @@ -165,8 +168,8 @@ func updateGeneric(state *macState, msg []byte) { copy(buf[:], msg) buf[len(msg)] = 1 - h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0) - h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c) + h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0) + h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(buf[8:16]), c) h2 += c msg = nil @@ -219,9 +222,9 @@ func updateGeneric(state *macState, msg []byte) { m3 := h2r1 t0 := m0.lo - t1, c := bitsAdd64(m1.lo, m0.hi, 0) - t2, c := bitsAdd64(m2.lo, m1.hi, c) - t3, _ := bitsAdd64(m3.lo, m2.hi, c) + t1, c := bits.Add64(m1.lo, m0.hi, 0) + t2, c := bits.Add64(m2.lo, m1.hi, c) + t3, _ := bits.Add64(m3.lo, m2.hi, c) // Now we have the result as 4 64-bit limbs, and we need to reduce it // modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do @@ -243,14 +246,14 @@ func updateGeneric(state *macState, msg []byte) { // To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c. - h0, c = bitsAdd64(h0, cc.lo, 0) - h1, c = bitsAdd64(h1, cc.hi, c) + h0, c = bits.Add64(h0, cc.lo, 0) + h1, c = bits.Add64(h1, cc.hi, c) h2 += c cc = shiftRightBy2(cc) - h0, c = bitsAdd64(h0, cc.lo, 0) - h1, c = bitsAdd64(h1, cc.hi, c) + h0, c = bits.Add64(h0, cc.lo, 0) + h1, c = bits.Add64(h1, cc.hi, c) h2 += c // h2 is at most 3 + 1 + 1 = 5, making the whole of h at most @@ -287,9 +290,9 @@ func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) { // in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the // result if the subtraction underflows, and t otherwise. - hMinusP0, b := bitsSub64(h0, p0, 0) - hMinusP1, b := bitsSub64(h1, p1, b) - _, b = bitsSub64(h2, p2, b) + hMinusP0, b := bits.Sub64(h0, p0, 0) + hMinusP1, b := bits.Sub64(h1, p1, b) + _, b = bits.Sub64(h2, p2, b) // h = h if h < p else h - p h0 = select64(b, h0, hMinusP0) @@ -301,8 +304,8 @@ func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) { // // by just doing a wide addition with the 128 low bits of h and discarding // the overflow. - h0, c := bitsAdd64(h0, s[0], 0) - h1, _ = bitsAdd64(h1, s[1], c) + h0, c := bits.Add64(h0, s[0], 0) + h1, _ = bits.Add64(h1, s[1], c) binary.LittleEndian.PutUint64(out[0:8], h0) binary.LittleEndian.PutUint64(out[8:16], h1) From 055043dfed6c4d9fcec3323f6c5e67b753f171e1 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Thu, 4 Jan 2024 15:17:54 +0000 Subject: [PATCH 27/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I2df91d0602cd1ef94370de4a60a935edeb441333 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/553996 Reviewed-by: Than McIntosh LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Auto-Submit: Gopher Robot Reviewed-by: Dmitri Shuralyov --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index d676a454ad..18cb00a76c 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.10.0 // tagx:ignore - golang.org/x/sys v0.15.0 - golang.org/x/term v0.15.0 + golang.org/x/sys v0.16.0 + golang.org/x/term v0.16.0 ) require golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index f9352ee97e..d89d678c13 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= -golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= +golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= +golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From 403f69900ed17a8c7dace8740fa0bed65ad19bbf Mon Sep 17 00:00:00 2001 From: "Bryan C. Mills" Date: Fri, 5 Jan 2024 15:08:10 -0500 Subject: [PATCH 28/95] ssh/test: avoid leaking a net.UnixConn in server.TryDialWithAddr For golang/go#64959. Change-Id: I2153166f4960058cdc2b82ae34ca250dcc6ba1c6 Cq-Include-Trybots: luci.golang.try:x_crypto-gotip-linux-amd64-longtest,x_crypto-gotip-windows-amd64-longtest Reviewed-on: https://go-review.googlesource.com/c/crypto/+/554062 Run-TryBot: Bryan Mills Auto-Submit: Bryan Mills Reviewed-by: Dmitri Shuralyov TryBot-Result: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- ssh/test/test_unix_test.go | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/ssh/test/test_unix_test.go b/ssh/test/test_unix_test.go index 8dbedb0b13..9695de7319 100644 --- a/ssh/test/test_unix_test.go +++ b/ssh/test/test_unix_test.go @@ -178,7 +178,7 @@ func (s *server) TryDial(config *ssh.ClientConfig) (*ssh.Client, error) { // addr is the user specified host:port. While we don't actually dial it, // we need to know this for host key matching -func (s *server) TryDialWithAddr(config *ssh.ClientConfig, addr string) (*ssh.Client, error) { +func (s *server) TryDialWithAddr(config *ssh.ClientConfig, addr string) (client *ssh.Client, err error) { sshd, err := exec.LookPath("sshd") if err != nil { s.t.Skipf("skipping test: %v", err) @@ -188,13 +188,26 @@ func (s *server) TryDialWithAddr(config *ssh.ClientConfig, addr string) (*ssh.Cl if err != nil { s.t.Fatalf("unixConnection: %v", err) } + defer func() { + // Close c2 after we've started the sshd command so that it won't prevent c1 + // from returning EOF when the sshd command exits. + c2.Close() + + // Leave c1 open if we're returning a client that wraps it. + // (The client is responsible for closing it.) + // Otherwise, close it to free up the socket. + if client == nil { + c1.Close() + } + }() - cmd := testenv.Command(s.t, sshd, "-f", s.configfile, "-i", "-e") f, err := c2.File() if err != nil { s.t.Fatalf("UnixConn.File: %v", err) } defer f.Close() + + cmd := testenv.Command(s.t, sshd, "-f", s.configfile, "-i", "-e") cmd.Stdin = f cmd.Stdout = f cmd.Stderr = new(bytes.Buffer) @@ -223,7 +236,7 @@ func (s *server) TryDialWithAddr(config *ssh.ClientConfig, addr string) (*ssh.Cl // processes are killed too. cmd.Process.Signal(os.Interrupt) cmd.Wait() - if s.t.Failed() { + if s.t.Failed() || testing.Verbose() { // log any output from sshd process s.t.Logf("sshd:\n%s", cmd.Stderr) } From dbb6ec16ecef7a66638d8514be54b13660551b0a Mon Sep 17 00:00:00 2001 From: "Bryan C. Mills" Date: Fri, 5 Jan 2024 18:21:50 -0500 Subject: [PATCH 29/95] ssh/test: skip tests on darwin that fail on the darwin-amd64-longtest LUCI builder We don't yet understand why these tests fail, but the Apple sshd seems to have some non-trivial vendor patches, so it is plausibly a platform-specific bug in the test. Let's skip that failure mode on the whole platform until/unless someone has time to reproduce and investigate the failure. For golang/go#64959. Cq-Include-Trybots: luci.golang.try:x_crypto-gotip-darwin-amd64-longtest,x_crypto-gotip-linux-amd64-longtest,x_crypto-gotip-windows-amd64-longtest Change-Id: I9e43579469de3fe9329c093b5916bbed0edd3751 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/554077 Reviewed-by: Dmitri Shuralyov Reviewed-by: Dmitri Shuralyov Reviewed-by: Nicola Murino LUCI-TryBot-Result: Go LUCI Auto-Submit: Bryan Mills --- ssh/test/agent_unix_test.go | 1 + ssh/test/dial_unix_test.go | 1 + ssh/test/forward_unix_test.go | 10 ++++++++++ ssh/test/session_test.go | 17 +++++++++++++++++ 4 files changed, 29 insertions(+) diff --git a/ssh/test/agent_unix_test.go b/ssh/test/agent_unix_test.go index aeecaebb1d..a9c4893f7d 100644 --- a/ssh/test/agent_unix_test.go +++ b/ssh/test/agent_unix_test.go @@ -34,6 +34,7 @@ func TestAgentForward(t *testing.T) { sess, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("NewSession: %v", err) } if err := agent.RequestAgentForwarding(sess); err != nil { diff --git a/ssh/test/dial_unix_test.go b/ssh/test/dial_unix_test.go index 8ec8d50a50..7d6fb2877f 100644 --- a/ssh/test/dial_unix_test.go +++ b/ssh/test/dial_unix_test.go @@ -53,6 +53,7 @@ func testDial(t *testing.T, n, listenAddr string, x dialTester) { // on the opened connection. cancel() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("Dial: %v", err) } x.TestClientConn(t, conn) diff --git a/ssh/test/forward_unix_test.go b/ssh/test/forward_unix_test.go index 611122b9e8..e32d732e9f 100644 --- a/ssh/test/forward_unix_test.go +++ b/ssh/test/forward_unix_test.go @@ -12,6 +12,7 @@ import ( "io" "math/rand" "net" + "runtime" "testing" "time" ) @@ -27,6 +28,9 @@ func testPortForward(t *testing.T, n, listenAddr string) { sshListener, err := conn.Listen(n, listenAddr) if err != nil { + if runtime.GOOS == "darwin" && err == io.EOF { + t.Skipf("skipping test broken on some versions of macOS; see https://go.dev/issue/64959") + } t.Fatal(err) } @@ -122,6 +126,9 @@ func testAcceptClose(t *testing.T, n, listenAddr string) { sshListener, err := conn.Listen(n, listenAddr) if err != nil { + if runtime.GOOS == "darwin" && err == io.EOF { + t.Skipf("skipping test broken on some versions of macOS; see https://go.dev/issue/64959") + } t.Fatal(err) } @@ -163,6 +170,9 @@ func testPortForwardConnectionClose(t *testing.T, n, listenAddr string) { sshListener, err := client.Listen(n, listenAddr) if err != nil { + if runtime.GOOS == "darwin" && err == io.EOF { + t.Skipf("skipping test broken on some versions of macOS; see https://go.dev/issue/64959") + } t.Fatal(err) } diff --git a/ssh/test/session_test.go b/ssh/test/session_test.go index 69c8b86dd5..c128b2ae51 100644 --- a/ssh/test/session_test.go +++ b/ssh/test/session_test.go @@ -22,6 +22,13 @@ import ( "golang.org/x/crypto/ssh" ) +func skipIfIssue64959(t *testing.T, err error) { + if err != nil && runtime.GOOS == "darwin" && strings.Contains(err.Error(), "ssh: unexpected packet in response to channel open: ") { + t.Helper() + t.Skipf("skipping test broken on some versions of macOS; see https://go.dev/issue/64959") + } +} + func TestRunCommandSuccess(t *testing.T) { server := newServer(t) conn := server.Dial(clientConfig()) @@ -29,6 +36,7 @@ func TestRunCommandSuccess(t *testing.T) { session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("session failed: %v", err) } defer session.Close() @@ -66,6 +74,7 @@ func TestRunCommandStdin(t *testing.T) { session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("session failed: %v", err) } defer session.Close() @@ -88,6 +97,7 @@ func TestRunCommandStdinError(t *testing.T) { session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("session failed: %v", err) } defer session.Close() @@ -111,6 +121,7 @@ func TestRunCommandFailed(t *testing.T) { session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("session failed: %v", err) } defer session.Close() @@ -127,6 +138,7 @@ func TestRunCommandWeClosed(t *testing.T) { session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("session failed: %v", err) } err = session.Shell() @@ -146,6 +158,7 @@ func TestFuncLargeRead(t *testing.T) { session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("unable to create new session: %s", err) } @@ -182,6 +195,7 @@ func TestKeyChange(t *testing.T) { for i := 0; i < 4; i++ { session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("unable to create new session: %s", err) } @@ -223,6 +237,7 @@ func TestValidTerminalMode(t *testing.T) { session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("session failed: %v", err) } defer session.Close() @@ -287,6 +302,7 @@ func TestWindowChange(t *testing.T) { session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("session failed: %v", err) } defer session.Close() @@ -349,6 +365,7 @@ func testOneCipher(t *testing.T, cipher string, cipherOrder []string) { // Exercise receiving data from the server session, err := conn.NewSession() if err != nil { + skipIfIssue64959(t, err) t.Fatalf("NewSession: %v", err) } From 913d3ae7415ace5edad6750b7c9192ac7518e9df Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 5 Feb 2024 16:01:22 +0000 Subject: [PATCH 30/95] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: I6d9163026799e5d134f6bb6819e22448d7ebe719 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/561395 Reviewed-by: Roland Shoemaker Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- x509roots/fallback/bundle.go | 74 +++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index ad9ca4db1e..a666f5f742 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -3162,6 +3162,58 @@ lhh9DrUUOYTxKOkto557HnpyWoOzeW/vtPzQCqVYT0bf+215WfKEIlKuD8z7fDvn aspHYcN6+NOSBB+4IIThNlQWx0DeO4pz3N/GCUzf7Nr/1FNCocnyYh0igzyXxfkZ YiesZSLX0zzG5Y6yU8xJzrww/nsOM5D77dIUkR8Hrw== -----END CERTIFICATE----- +# CN=Telekom Security TLS ECC Root 2020,O=Deutsche Telekom Security GmbH,C=DE +# 578af4ded0853f4e5998db4aeaf9cbea8d945f60b620a38d1a3c13b2bc7ba8e1 +-----BEGIN CERTIFICATE----- +MIICQjCCAcmgAwIBAgIQNjqWjMlcsljN0AFdxeVXADAKBggqhkjOPQQDAzBjMQsw +CQYDVQQGEwJERTEnMCUGA1UECgweRGV1dHNjaGUgVGVsZWtvbSBTZWN1cml0eSBH +bWJIMSswKQYDVQQDDCJUZWxla29tIFNlY3VyaXR5IFRMUyBFQ0MgUm9vdCAyMDIw +MB4XDTIwMDgyNTA3NDgyMFoXDTQ1MDgyNTIzNTk1OVowYzELMAkGA1UEBhMCREUx +JzAlBgNVBAoMHkRldXRzY2hlIFRlbGVrb20gU2VjdXJpdHkgR21iSDErMCkGA1UE +AwwiVGVsZWtvbSBTZWN1cml0eSBUTFMgRUNDIFJvb3QgMjAyMDB2MBAGByqGSM49 +AgEGBSuBBAAiA2IABM6//leov9Wq9xCazbzREaK9Z0LMkOsVGJDZos0MKiXrPk/O +tdKPD/M12kOLAoC+b1EkHQ9rK8qfwm9QMuU3ILYg/4gND21Ju9sGpIeQkpT0CdDP +f8iAC8GXs7s1J8nCG6NCMEAwHQYDVR0OBBYEFONyzG6VmUex5rNhTNHLq+O6zd6f +MA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoGCCqGSM49BAMDA2cA +MGQCMHVSi7ekEE+uShCLsoRbQuHmKjYC2qBuGT8lv9pZMo7k+5Dck2TOrbRBR2Di +z6fLHgIwN0GMZt9Ba9aDAEH9L1r3ULRn0SyocddDypwnJJGDSA3PzfdUga/sf+Rn +27iQ7t0l +-----END CERTIFICATE----- +# CN=Telekom Security TLS RSA Root 2023,O=Deutsche Telekom Security GmbH,C=DE +# efc65cadbb59adb6efe84da22311b35624b71b3b1ea0da8b6655174ec8978646 +-----BEGIN CERTIFICATE----- +MIIFszCCA5ugAwIBAgIQIZxULej27HF3+k7ow3BXlzANBgkqhkiG9w0BAQwFADBj +MQswCQYDVQQGEwJERTEnMCUGA1UECgweRGV1dHNjaGUgVGVsZWtvbSBTZWN1cml0 +eSBHbWJIMSswKQYDVQQDDCJUZWxla29tIFNlY3VyaXR5IFRMUyBSU0EgUm9vdCAy +MDIzMB4XDTIzMDMyODEyMTY0NVoXDTQ4MDMyNzIzNTk1OVowYzELMAkGA1UEBhMC +REUxJzAlBgNVBAoMHkRldXRzY2hlIFRlbGVrb20gU2VjdXJpdHkgR21iSDErMCkG +A1UEAwwiVGVsZWtvbSBTZWN1cml0eSBUTFMgUlNBIFJvb3QgMjAyMzCCAiIwDQYJ +KoZIhvcNAQEBBQADggIPADCCAgoCggIBAO01oYGA88tKaVvC+1GDrib94W7zgRJ9 +cUD/h3VCKSHtgVIs3xLBGYSJwb3FKNXVS2xE1kzbB5ZKVXrKNoIENqil/Cf2SfHV +cp6R+SPWcHu79ZvB7JPPGeplfohwoHP89v+1VmLhc2o0mD6CuKyVU/QBoCcHcqMA +U6DksquDOFczJZSfvkgdmOGjup5czQRxUX11eKvzWarE4GC+j4NSuHUaQTXtvPM6 +Y+mpFEXX5lLRbtLevOP1Czvm4MS9Q2QTps70mDdsipWol8hHD/BeEIvnHRz+sTug +BTNoBUGCwQMrAcjnj02r6LX2zWtEtefdi+zqJbQAIldNsLGyMcEWzv/9FIS3R/qy +8XDe24tsNlikfLMR0cN3f1+2JeANxdKz+bi4d9s3cXFH42AYTyS2dTd4uaNir73J +co4vzLuu2+QVUhkHM/tqty1LkCiCc/4YizWN26cEar7qwU02OxY2kTLvtkCJkUPg +8qKrBC7m8kwOFjQgrIfBLX7JZkcXFBGk8/ehJImr2BrIoVyxo/eMbcgByU/J7MT8 +rFEz0ciD0cmfHdRHNCk+y7AO+oMLKFjlKdw/fKifybYKu6boRhYPluV75Gp6SG12 +mAWl3G0eQh5C2hrgUve1g8Aae3g1LDj1H/1Joy7SWWO/gLCMk3PLNaaZlSJhZQNg ++y+TS/qanIA7AgMBAAGjYzBhMA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUtqeX +gj10hZv3PJ+TmpV5dVKMbUcwDwYDVR0TAQH/BAUwAwEB/zAfBgNVHSMEGDAWgBS2 +p5eCPXSFm/c8n5OalXl1UoxtRzANBgkqhkiG9w0BAQwFAAOCAgEAqMxhpr51nhVQ +pGv7qHBFfLp+sVr8WyP6Cnf4mHGCDG3gXkaqk/QeoMPhk9tLrbKmXauw1GLLXrtm +9S3ul0A8Yute1hTWjOKWi0FpkzXmuZlrYrShF2Y0pmtjxrlO8iLpWA1WQdH6DErw +M807u20hOq6OcrXDSvvpfeWxm4bu4uB9tPcy/SKE8YXJN3nptT+/XOR0so8RYgDd +GGah2XsjX/GO1WfoVNpbOms2b/mBsTNHM3dA+VKq3dSDz4V4mZqTuXNnQkYRIer+ +CqkbGmVps4+uFrb2S1ayLfmlyOw7YqPta9BO1UAJpB+Y1zqlklkg5LB9zVtzaL1t +xKITDmcZuI1CfmwMmm6gJC3VRRvcxAIU/oVbZZfKTpBQCHpCNfnqwmbU+AGuHrS+ +w6jv/naaoqYfRvaE7fzbzsQCzndILIyy7MMAo+wsVRjBfhnu4S/yrYObnqsZ38aK +L4x35bcF7DvB7L6Gs4a8wPfc5+pbrrLMtTWGS9DiP7bY+A4A7l3j941Y/8+LN+lj +X273CXE2whJdV/LItM3z7gLfEdxquVeEHVlNjM7IDiPCtyaaEBRx/pOyiriA8A4Q +ntOoUAw3gi/q4Iqd4Sw5/7W0cwDk90imc6y/st53BIe0o82bNSQ3+pCTE4FCxpgm +dTdmQRCsu/WU48IxK63nI1bMNSWSs1A= +-----END CERTIFICATE----- # CN=Telia Root CA v2,O=Telia Finland Oyj,C=FI # 242b69742fcb1e5b2abf98898b94572187544e5b4d9911786573621f6a74b82c -----BEGIN CERTIFICATE----- @@ -3740,28 +3792,6 @@ HmyW74cNxA9hi63ugyuV+I6ShHI56yDqg+2DzZduCLzrTia2cyvk0/ZM/iZx4mER dEr/VxqHD3VILs9RaRegAhJhldXRQLIQTO7ErBBDpqWeCtWVYpoNz4iCxTIM5Cuf ReYNnyicsbkqWletNw+vHX/bvZ8= -----END CERTIFICATE----- -# OU=Security Communication RootCA1,O=SECOM Trust.net,C=JP -# e75e72ed9f560eec6eb4800073a43fc3ad19195a392282017895974a99026b6c ------BEGIN CERTIFICATE----- -MIIDWjCCAkKgAwIBAgIBADANBgkqhkiG9w0BAQUFADBQMQswCQYDVQQGEwJKUDEY -MBYGA1UEChMPU0VDT00gVHJ1c3QubmV0MScwJQYDVQQLEx5TZWN1cml0eSBDb21t -dW5pY2F0aW9uIFJvb3RDQTEwHhcNMDMwOTMwMDQyMDQ5WhcNMjMwOTMwMDQyMDQ5 -WjBQMQswCQYDVQQGEwJKUDEYMBYGA1UEChMPU0VDT00gVHJ1c3QubmV0MScwJQYD -VQQLEx5TZWN1cml0eSBDb21tdW5pY2F0aW9uIFJvb3RDQTEwggEiMA0GCSqGSIb3 -DQEBAQUAA4IBDwAwggEKAoIBAQCzs/5/022x7xZ8V6UMbXaKL0u/ZPtM7orw8yl8 -9f/uKuDp6bpbZCKamm8sOiZpUQWZJtzVHGpxxpp9Hp3dfGzGjGdnSj74cbAZJ6kJ -DKaVv0uMDPpVmDvY6CKhS3E4eayXkmmziX7qIWgGmBSWh9JhNrxtJ1aeV+7AwFb9 -Ms+k2Y7CI9eNqPPYJayX5HA49LY6tJ07lyZDo6G8SVlyTCMwhwFY9k6+HGhWZq/N -QV3Is00qVUarH9oe4kA92819uZKAnDfdDJZkndwi92SL32HeFZRSFaB9UslLqCHJ -xrHty8OVYNEP8Ktw+N/LTX7s1vqr2b1/VPKl6Xn62dZ2JChzAgMBAAGjPzA9MB0G -A1UdDgQWBBSgc0mZaNyFW2XjmygvV5+9M7wHSDALBgNVHQ8EBAMCAQYwDwYDVR0T -AQH/BAUwAwEB/zANBgkqhkiG9w0BAQUFAAOCAQEAaECpqLvkT115swW1F7NgE+vG -kl3g0dNq/vu+m22/xwVtWSDEHPC32oRYAmP6SBbvT6UL90qY8j+eG61Ha2POCEfr -Uj94nK9NrvjVT8+amCoQQTlSxN3Zmw7vkwGusi7KaEIkQmywszo+zenaSMQVy+n5 -Bw+SUEmK3TGXX8npN6o7WWWXlDLJs58+OmJYxUmtYg5xpTKqL8aJdkNAExNnPaJU -JRDL8Try2frbSVa7pv6nQTXD4IhhyYjH3zYQIphZ6rBK+1YWc26sTfcioU+tHXot -RSflMMFe8toTyyVCUZVHA4xsIcx0Qu1T/zOLjw9XARYvz6buyXAiFL39vmwLAw== ------END CERTIFICATE----- # OU=Security Communication RootCA2,O=SECOM Trust Systems CO.\,LTD.,C=JP # 513b2cecb810d4cde5dd85391adfc6c2dd60d87bb736d2b521484aa47a0ebef6 -----BEGIN CERTIFICATE----- From 405cb3bdea78b1b48ee79096733841247a944de0 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Wed, 7 Feb 2024 18:30:40 +0000 Subject: [PATCH 31/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I87d766a485104662172bf983d2a9deb085eddd99 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/562417 Reviewed-by: Than McIntosh Reviewed-by: Dmitri Shuralyov Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 18cb00a76c..9cde59c44e 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.10.0 // tagx:ignore - golang.org/x/sys v0.16.0 - golang.org/x/term v0.16.0 + golang.org/x/sys v0.17.0 + golang.org/x/term v0.17.0 ) require golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index d89d678c13..55835d583b 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= -golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= -golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= +golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.17.0 h1:mkTF7LCd6WGJNL3K1Ad7kwxNfYAW6a8a8QqtMblp/4U= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From 62c9f1799c91aef0744179032e30fda3761e79f7 Mon Sep 17 00:00:00 2001 From: Roland Shoemaker Date: Wed, 7 Feb 2024 10:41:04 -0800 Subject: [PATCH 32/95] x509roots/nss: manually exclude a confusingly constrained root Fixes golang/go#61963 Change-Id: I16920d160af74772ef5aa650d1274e07c3ca9adc Reviewed-on: https://go-review.googlesource.com/c/crypto/+/562475 Reviewed-by: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- x509roots/fallback/bundle.go | 28 ---------------------------- x509roots/nss/parser.go | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index a666f5f742..460c57b4d8 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -3078,34 +3078,6 @@ WL6ukK2YJ5f+AbGwUgC4TeQbIXQbfsDuXmkqJa9c1h3a0nnJ85cp4IaH3gRZD/FZ e9eiPZaGzPImNC1qkp2aGtAw4l1OBLBfiyB+d8E9lYLRRpo7PHi4b6HQDWSieB4p TpPDpFQUWw== -----END CERTIFICATE----- -# CN=TUBITAK Kamu SM SSL Kok Sertifikasi - Surum 1,OU=Kamu Sertifikasyon Merkezi - Kamu SM,O=Turkiye Bilimsel ve Teknolojik Arastirma Kurumu - TUBITAK,L=Gebze - Kocaeli,C=TR -# 46edc3689046d53a453fb3104ab80dcaec658b2660ea1629dd7e867990648716 ------BEGIN CERTIFICATE----- -MIIEYzCCA0ugAwIBAgIBATANBgkqhkiG9w0BAQsFADCB0jELMAkGA1UEBhMCVFIx -GDAWBgNVBAcTD0dlYnplIC0gS29jYWVsaTFCMEAGA1UEChM5VHVya2l5ZSBCaWxp -bXNlbCB2ZSBUZWtub2xvamlrIEFyYXN0aXJtYSBLdXJ1bXUgLSBUVUJJVEFLMS0w -KwYDVQQLEyRLYW11IFNlcnRpZmlrYXN5b24gTWVya2V6aSAtIEthbXUgU00xNjA0 -BgNVBAMTLVRVQklUQUsgS2FtdSBTTSBTU0wgS29rIFNlcnRpZmlrYXNpIC0gU3Vy -dW0gMTAeFw0xMzExMjUwODI1NTVaFw00MzEwMjUwODI1NTVaMIHSMQswCQYDVQQG -EwJUUjEYMBYGA1UEBxMPR2ViemUgLSBLb2NhZWxpMUIwQAYDVQQKEzlUdXJraXll -IEJpbGltc2VsIHZlIFRla25vbG9qaWsgQXJhc3Rpcm1hIEt1cnVtdSAtIFRVQklU -QUsxLTArBgNVBAsTJEthbXUgU2VydGlmaWthc3lvbiBNZXJrZXppIC0gS2FtdSBT -TTE2MDQGA1UEAxMtVFVCSVRBSyBLYW11IFNNIFNTTCBLb2sgU2VydGlmaWthc2kg -LSBTdXJ1bSAxMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAr3UwM6q7 -a9OZLBI3hNmNe5eA027n/5tQlT6QlVZC1xl8JoSNkvoBHToP4mQ4t4y86Ij5iySr -LqP1N+RAjhgleYN1Hzv/bKjFxlb4tO2KRKOrbEz8HdDc72i9z+SqzvBV96I01INr -N3wcwv61A+xXzry0tcXtAA9TNypN9E8Mg/uGz8v+jE69h/mniyFXnHrfA2eJLJ2X -YacQuFWQfw4tJzh03+f92k4S400VIgLI4OD8D62K18lUUMw7D8oWgITQUVbDjlZ/ -iSIzL+aFCr2lqBs23tPcLG07xxO9WSMs5uWk99gL7eqQQESolbuT1dCANLZGeA4f -AJNG4e7p+exPFwIDAQABo0IwQDAdBgNVHQ4EFgQUZT/HiobGPN08VFw1+DrtUgxH -V8gwDgYDVR0PAQH/BAQDAgEGMA8GA1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEL -BQADggEBACo/4fEyjq7hmFxLXs9rHmoJ0iKpEsdeV31zVmSAhHqT5Am5EM2fKifh -AHe+SMg1qIGf5LgsyX8OsNJLN13qudULXjS99HMpw+0mFZx+CFOKWI3QSyjfwbPf -IPP54+M638yclNhOT8NrF7f3cuitZjO1JVOr4PhMqZ398g26rrnZqsZr+ZO7rqu4 -lzwDGrpDxpa5RXI4s6ehlj2Re37AIVNMh+3yC1SVUZPVIqUNivGTDj5UDrDYyU7c -8jEyVupk+eq1nRZmQnLzf9OxMUP8pI4X8W0jq5Rm+K37DwhuJi1/FwcJsoz7UMCf -lo3Ptv0AnVoUmr8CRPXBwp8iXqIPoeM= ------END CERTIFICATE----- # CN=TWCA Global Root CA,OU=Root CA,O=TAIWAN-CA,C=TW # 59769007f7685d0fcd50872f9f95d5755a5b2b457d81f3692b610a98672f0e1b -----BEGIN CERTIFICATE----- diff --git a/x509roots/nss/parser.go b/x509roots/nss/parser.go index 1af3e0ae46..feca766e18 100644 --- a/x509roots/nss/parser.go +++ b/x509roots/nss/parser.go @@ -147,6 +147,20 @@ func parseTrustClass(s *bufio.Scanner) ([sha1.Size]byte, *trustObj, error) { return h, to, nil } +// manualExclusions contains a map of SHA1 fingerprints of roots that we manually exclude +// from the bundle for various reasons. +var manualExclusions = map[string]bool{ + // TUBITAK Kamu SM SSL Kok Sertifikasi - Surum 1 + // We exclude this root because mozilla manually constrains this root to + // issue names under .tr, but this information is only encoded in the CCADB + // IncludedCACertificateReport, in a field the format of which is + // undocumented, and is only used for this particular certificate. Rather + // than adding special parsing for this, we skip it. When code constraint + // support is available, we may also want to simply add a manual constraint, + // rather than a manual exclusion. + "3143649becce27eced3a3f0b8f0de4e891ddeeca": true, +} + // Parse parses a NSS certdata.txt formatted file, returning only // trusted serverAuth roots, as well as any additional constraints. This parser // is very opinionated, only returning roots that are currently trusted for @@ -248,6 +262,9 @@ func Parse(r io.Reader) ([]*Certificate, error) { if !e.trust.trusted { continue } + if manualExclusions[fmt.Sprintf("%x", h)] { + continue + } nssCert := &Certificate{X509: e.cert.c} if e.cert.DistrustAfter != nil { nssCert.Constraints = append(nssCert.Constraints, DistrustAfter(*e.cert.DistrustAfter)) From 1c981e6f81fc4d43487d6ad84d6b499fa55099e8 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Thu, 8 Feb 2024 18:59:26 +0100 Subject: [PATCH 33/95] ssh/test: don't use DSA keys in integrations tests, update test RSA key on RHEL 9 OpenSSH does not support DSA keys and RSA keys with size less than 2048 bits, furthermore signing with ssh-rsa (SHA-1 signatures) and ssh-dss is not allowed, therefore: 1) replaced the 1024-bit RSA key used in the test with a new 2048-bit one 2) removed DSA key from itegration tests 3) allowed signature errors using ssh-rsa in agent integration tests, we also check SHA-2 variants that are not skipped Fixes golang/go#65581 Change-Id: I54bf997b61ef4d91d38eb624275737ba7291bb20 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/562755 Reviewed-by: Roland Shoemaker Reviewed-by: David Chase Reviewed-by: Filippo Valsorda Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI --- ssh/agent/client_test.go | 21 ++++++++++++------ ssh/keys_test.go | 4 ++-- ssh/test/cert_test.go | 2 +- ssh/test/session_test.go | 1 - ssh/testdata/keys.go | 46 +++++++++++++++++++++++++--------------- 5 files changed, 46 insertions(+), 28 deletions(-) diff --git a/ssh/agent/client_test.go b/ssh/agent/client_test.go index fdc8000654..ae03df1a63 100644 --- a/ssh/agent/client_test.go +++ b/ssh/agent/client_test.go @@ -164,15 +164,23 @@ func testAgentInterface(t *testing.T, agent ExtendedAgent, key interface{}, cert data := []byte("hello") sig, err := agent.Sign(pubKey, data) if err != nil { - t.Fatalf("Sign(%s): %v", pubKey.Type(), err) - } - - if err := pubKey.Verify(data, sig); err != nil { - t.Fatalf("Verify(%s): %v", pubKey.Type(), err) + t.Logf("sign failed with key type %q", pubKey.Type()) + // In integration tests ssh-dss and ssh-rsa (SHA1 signatures) may be + // disabled for security reasons, we check SHA-2 variants later. + if pubKey.Type() != ssh.KeyAlgoDSA && pubKey.Type() != ssh.KeyAlgoRSA && pubKey.Type() != ssh.CertAlgoRSAv01 { + t.Fatalf("Sign(%s): %v", pubKey.Type(), err) + } + } else { + if err := pubKey.Verify(data, sig); err != nil { + t.Logf("verify failed with key type %q", pubKey.Type()) + if pubKey.Type() != ssh.KeyAlgoRSA { + t.Fatalf("Verify(%s): %v", pubKey.Type(), err) + } + } } // For tests on RSA keys, try signing with SHA-256 and SHA-512 flags - if pubKey.Type() == "ssh-rsa" { + if pubKey.Type() == ssh.KeyAlgoRSA { sshFlagTest := func(flag SignatureFlags, expectedSigFormat string) { sig, err = agent.SignWithFlags(pubKey, data, flag) if err != nil { @@ -185,7 +193,6 @@ func testAgentInterface(t *testing.T, agent ExtendedAgent, key interface{}, cert t.Fatalf("Verify(%s): %v", pubKey.Type(), err) } } - sshFlagTest(0, ssh.KeyAlgoRSA) sshFlagTest(SignatureFlagRsaSha256, ssh.KeyAlgoRSASHA256) sshFlagTest(SignatureFlagRsaSha512, ssh.KeyAlgoRSASHA512) } diff --git a/ssh/keys_test.go b/ssh/keys_test.go index 3e18488f34..36e1857039 100644 --- a/ssh/keys_test.go +++ b/ssh/keys_test.go @@ -610,7 +610,7 @@ func TestKnownHostsParsing(t *testing.T) { func TestFingerprintLegacyMD5(t *testing.T) { pub, _ := getTestKey() fingerprint := FingerprintLegacyMD5(pub) - want := "fb:61:6d:1a:e3:f0:95:45:3c:a0:79:be:4a:93:63:66" // ssh-keygen -lf -E md5 rsa + want := "b7:ef:d3:d5:89:29:52:96:9f:df:47:41:4d:15:37:f4" // ssh-keygen -lf -E md5 rsa if fingerprint != want { t.Errorf("got fingerprint %q want %q", fingerprint, want) } @@ -619,7 +619,7 @@ func TestFingerprintLegacyMD5(t *testing.T) { func TestFingerprintSHA256(t *testing.T) { pub, _ := getTestKey() fingerprint := FingerprintSHA256(pub) - want := "SHA256:Anr3LjZK8YVpjrxu79myrW9Hrb/wpcMNpVvTq/RcBm8" // ssh-keygen -lf rsa + want := "SHA256:fi5+D7UmDZDE9Q2sAVvvlpcQSIakN4DERdINgXd2AnE" // ssh-keygen -lf rsa if fingerprint != want { t.Errorf("got fingerprint %q want %q", fingerprint, want) } diff --git a/ssh/test/cert_test.go b/ssh/test/cert_test.go index 15c44fb58e..9555a097b0 100644 --- a/ssh/test/cert_test.go +++ b/ssh/test/cert_test.go @@ -19,7 +19,7 @@ func TestCertLogin(t *testing.T) { s := newServer(t) // Use a key different from the default. - clientKey := testSigners["dsa"] + clientKey := testSigners["ed25519"] caAuthKey := testSigners["ecdsa"] cert := &ssh.Certificate{ Key: clientKey.PublicKey(), diff --git a/ssh/test/session_test.go b/ssh/test/session_test.go index c128b2ae51..7925e95278 100644 --- a/ssh/test/session_test.go +++ b/ssh/test/session_test.go @@ -448,7 +448,6 @@ func TestKeyExchanges(t *testing.T) { func TestClientAuthAlgorithms(t *testing.T) { for _, key := range []string{ "rsa", - "dsa", "ecdsa", "ed25519", } { diff --git a/ssh/testdata/keys.go b/ssh/testdata/keys.go index 69f9eef4de..6e48841b67 100644 --- a/ssh/testdata/keys.go +++ b/ssh/testdata/keys.go @@ -46,19 +46,31 @@ FFlRjzoB3Oxu8UQgb+MWPedtH9XYBbg9biz4jJLkXQ== -----END EC PRIVATE KEY----- `), "rsa": []byte(`-----BEGIN RSA PRIVATE KEY----- -MIICXAIBAAKBgQC8A6FGHDiWCSREAXCq6yBfNVr0xCVG2CzvktFNRpue+RXrGs/2 -a6ySEJQb3IYquw7HlJgu6fg3WIWhOmHCjfpG0PrL4CRwbqQ2LaPPXhJErWYejcD8 -Di00cF3677+G10KMZk9RXbmHtuBFZT98wxg8j+ZsBMqGM1+7yrWUvynswQIDAQAB -AoGAJMCk5vqfSRzyXOTXLGIYCuR4Kj6pdsbNSeuuRGfYBeR1F2c/XdFAg7D/8s5R -38p/Ih52/Ty5S8BfJtwtvgVY9ecf/JlU/rl/QzhG8/8KC0NG7KsyXklbQ7gJT8UT -Ojmw5QpMk+rKv17ipDVkQQmPaj+gJXYNAHqImke5mm/K/h0CQQDciPmviQ+DOhOq -2ZBqUfH8oXHgFmp7/6pXw80DpMIxgV3CwkxxIVx6a8lVH9bT/AFySJ6vXq4zTuV9 -6QmZcZzDAkEA2j/UXJPIs1fQ8z/6sONOkU/BjtoePFIWJlRxdN35cZjXnBraX5UR -fFHkePv4YwqmXNqrBOvSu+w2WdSDci+IKwJAcsPRc/jWmsrJW1q3Ha0hSf/WG/Bu -X7MPuXaKpP/DkzGoUmb8ks7yqj6XWnYkPNLjCc8izU5vRwIiyWBRf4mxMwJBAILa -NDvRS0rjwt6lJGv7zPZoqDc65VfrK2aNyHx2PgFyzwrEOtuF57bu7pnvEIxpLTeM -z26i6XVMeYXAWZMTloMCQBbpGgEERQpeUknLBqUHhg/wXF6+lFA+vEGnkY+Dwab2 -KCXFGd+SQ5GdUcEMe9isUH6DYj/6/yCDoFrXXmpQb+M= +MIIEpQIBAAKCAQEAnuozKMtcQkIImZGSe4IujS4+Lkas9jmlBivziWGU3waivkpU +vYspgJbh7vSvnHOPtKscdIJ+3UUyViDUoM73GumsmHvfeRCoA9YROZK4fQR9G0a1 +wfoRqsrJXGToCzTvr/I2KIwpUG0bRE9rUvsW+JN9xgri+cIJWtu/dGYDkILO4bkF +IxtEvHNVvhGLenyOHFhPw3hAZ7/bKq8kvKzm9D2zOllHe1wWncMkhVmEFF9Houeh +jbddmeIAAxBpRUFfzp1dD7503ADBlJdK306D4CeI4KIFiqE1VrmfcMgP8fti0S0b +4JtmvevYoPd+/wB9ItFqvhc6nyuxF0PfWH+SvwIDAQABAoIBAQCCcpNeSFjKVvRC +Q1nwIrPd1njaec+fK0CIqWl3e2++B+9trwySrvp5gOGjyp2hGsd7Mf7gsQI81oF0 +a+y+uEXlhK3WWdDey0pwI7ft/7+LeDTOQCQRQBpijaXvPzGviVu7nWLRtARx7a41 +S9A4xL5dfI0BFYyuIpaVS8+EV/1TEJIbceZ5q5RBlARA1rc+nBvjygNzYdRq9Rao +yyehvnXZ7pQrATnwofPolbZNseW2Q9sRMmtm1E60XJJ433P7nFbxXtsMAYgxWSQc +V/92iRPYeD7sN/b7qulLEgC6e8el2gLGIB9aQyG7B6KFqloqvx/ymYs07+bWIQCU +6i9y7LABAoGBANKB2Rs0lF5c+gpEE3w0AWoxGyL04TZECtl2hQ/b2jc2n8hLqLhv +zNIKN+xzEP6j0ijXajjEMLiQH10qQ6+Plv8C7o8GJC1V/Oj4u4kbPqfVV1kSxuWm +FBjz6+c8VPbEBgXq5lCMEgC2Ii8XVRoyd3iSSOh+LIMBu/Br3JEsjJq/AoGBAMFC +CvODTiThrZo8v765dRSHrLOvB4jZOPrEKLWECLaQDQpuhgzZhyWm9zFfxGB+LWE9 +R9pU6ZCFPtPfd4cRZ9cezrp+lgdrqjcUX/2ZLLMk21WXFuRaw/4KTlKNsMY6lbAK +qVkUFWIZWaCQ8bdVCP48polipRNmN9mAHsIhIwgBAoGBAM6yn1qeS11I0GAKLlPT +wNvjsfCmIQmm0DxtqwRCbUevxD7pQ5cueCB51iW/ap2OgEqIEo4A3pIrOhDB8kpN +pQdrepFHh3hYqYicy5A6B1DHJAibbl+Krss9n5KjZA4VtpBS8al/kCHQtUomD/M0 +QKlMgnh/g/dzWXYegyqtYraDAoGAGbeBH5B8iJnjcR/eYDHrq5S2XZ7QANzvISeT +RzxPsIOQyK+WdQVJX7BNOqvExRZlUYhHFH2yKwIgLy+Qh0/Aora9ycFok4o3N2cl +suh8M0aXTVdyu2Z8qESU0ZV7TZWkL63rhSgQBGLdM2m2ULAnJzXI74VJ9D/o9K+A +6FJiiAECgYEAujJ/hKxVKEUvxwloGSDhKCUH86+7UOkb/EM2zZFrlPYAz1VcCwr3 +K14r8BtmLFXuLXOlACpoH0Wf4uia+t6n8m9JK3mvpJ7fempAsptP3AdZMQFe7xUm +SXEGQBYxcyS5Q+ncwWZuPgby5wJ9D4Fd6TQH+wwG52sFugt/fGxbPug= -----END RSA PRIVATE KEY----- `), "pkcs8": []byte(`-----BEGIN PRIVATE KEY----- @@ -190,17 +202,17 @@ var SSHCertificates = map[string][]byte{ // // 2. Assumes "ca" key above in file named "ca", sign a cert for "rsa.pub": // ssh-keygen -s ca -h -n host.example.com -V +500w -I host.example.com-key rsa.pub - "rsa": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgLjYqmmuTSEmjVhSfLQphBSTJMLwIZhRgmpn8FHKLiEIAAAADAQABAAAAgQC8A6FGHDiWCSREAXCq6yBfNVr0xCVG2CzvktFNRpue+RXrGs/2a6ySEJQb3IYquw7HlJgu6fg3WIWhOmHCjfpG0PrL4CRwbqQ2LaPPXhJErWYejcD8Di00cF3677+G10KMZk9RXbmHtuBFZT98wxg8j+ZsBMqGM1+7yrWUvynswQAAAAAAAAAAAAAAAgAAABRob3N0LmV4YW1wbGUuY29tLWtleQAAABQAAAAQaG9zdC5leGFtcGxlLmNvbQAAAABZHN8UAAAAAGsjIYUAAAAAAAAAAAAAAAAAAAEXAAAAB3NzaC1yc2EAAAADAQABAAABAQC+D11D0hEbn2Vglv4YRJ8pZNyHjIGmvth3DWOQrq++2vH2MujmGQDxfr4SVE9GpMBlKU3lwGbpgIBxAg6yZcNSfo6PWVU9ACg6NMFO+yMzc2MaG+/naQdNjSewywF5j2rkNO2XOaViRVSrZroe2B/aY2LTV0jDl8nu5NOjwRs1/s7SLe5z1rw/X0dpmXk0qJY3gQhmR8HZZ1dhEkJUGwaBCPd0T8asSYf1Ag2rUD4aQ28r3q69mbwfWOOa6rMemVZruUV5dzHwVNVNtVv+ImtnYtz8m8g+K0plaGptHn3KsaOnASkh3tujhaE7kvc4HR9Igli9+76jhZie3h/dTN5zAAABDwAAAAdzc2gtcnNhAAABALeDea+60H6xJGhktAyosHaSY7AYzLocaqd8hJQjEIDifBwzoTlnBmcK9CxGhKuaoJFThdCLdaevCeOSuquh8HTkf+2ebZZc/G5T+2thPvPqmcuEcmMosWo+SIjYhbP3S6KD49aLC1X0kz8IBQeauFvURhkZ5ZjhA1L4aQYt9NjL73nqOl8PplRui+Ov5w8b4ldul4zOvYAFrzfcP6wnnXk3c1Zzwwf5wynD5jakO8GpYKBuhM7Z4crzkKSQjU3hla7xqgfomC5Gz4XbR2TNjcQiRrJQ0UlKtX3X3ObRCEhuvG0Kzjklhv+Ddw6txrhKjMjiSi/Yyius/AE8TmC1p4U= host.example.com + "rsa": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgWcP1x+t9PQTxkt/fRa8v5HXz0FFW/fwN3mpR5jdGo3UAAAADAQABAAABAQCe6jMoy1xCQgiZkZJ7gi6NLj4uRqz2OaUGK/OJYZTfBqK+SlS9iymAluHu9K+cc4+0qxx0gn7dRTJWINSgzvca6ayYe995EKgD1hE5krh9BH0bRrXB+hGqyslcZOgLNO+v8jYojClQbRtET2tS+xb4k33GCuL5wgla2790ZgOQgs7huQUjG0S8c1W+EYt6fI4cWE/DeEBnv9sqryS8rOb0PbM6WUd7XBadwySFWYQUX0ei56GNt12Z4gADEGlFQV/OnV0PvnTcAMGUl0rfToPgJ4jgogWKoTVWuZ9wyA/x+2LRLRvgm2a969ig937/AH0i0Wq+FzqfK7EXQ99Yf5K/AAAAAAAAAAAAAAACAAAAFGhvc3QuZXhhbXBsZS5jb20ta2V5AAAAFAAAABBob3N0LmV4YW1wbGUuY29tAAAAAGXEzUQAAAAAd8sPtQAAAAAAAAAAAAAAAAAAARcAAAAHc3NoLXJzYQAAAAMBAAEAAAEBAL4PXUPSERufZWCW/hhEnylk3IeMgaa+2HcNY5Cur77a8fYy6OYZAPF+vhJUT0akwGUpTeXAZumAgHECDrJlw1J+jo9ZVT0AKDo0wU77IzNzYxob7+dpB02NJ7DLAXmPauQ07Zc5pWJFVKtmuh7YH9pjYtNXSMOXye7k06PBGzX+ztIt7nPWvD9fR2mZeTSoljeBCGZHwdlnV2ESQlQbBoEI93RPxqxJh/UCDatQPhpDbyverr2ZvB9Y45rqsx6ZVmu5RXl3MfBU1U21W/4ia2di3PybyD4rSmVoam0efcqxo6cBKSHe26OFoTuS9zgdH0iCWL37vqOFmJ7eH91M3nMAAAEPAAAAB3NzaC1yc2EAAAEAs8/78EsftS/0+6CfUAtRhmgpOPZVH8XJc2o4Qx/lQCBg/uB2sucGGx/5BuFUwAjhMNQ1UejJ6+AbNgZNlH7LIssGxpcW+eKQXLfd9KwWNPCU3DqZGiMBsYESNtVLjKAMuvJmxKFFu4rceyya+GZfsQFJR7G++XK0cQQ4rgDSnQo3pxQnHwguFWeHtwjdOj0helYER4XDKjKQlo0SxI1nVo7n1jkl8QghdxX6HKia6MP7MqstgNujTvrCEUJ6L3YflD/SJ00Y3ySD9+5hqvfqrDh4c4rRuc4+k0e3fYZwyWO2NNtKHRBOZAgW8zzfR3G99YdTw8N8+2oXeAdvB9k0tA== host.example.com `), - "rsa-sha2-256": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgOyK28gunJkM60qp4EbsYAjgbUsyjS8u742OLjipIgc0AAAADAQABAAAAgQC8A6FGHDiWCSREAXCq6yBfNVr0xCVG2CzvktFNRpue+RXrGs/2a6ySEJQb3IYquw7HlJgu6fg3WIWhOmHCjfpG0PrL4CRwbqQ2LaPPXhJErWYejcD8Di00cF3677+G10KMZk9RXbmHtuBFZT98wxg8j+ZsBMqGM1+7yrWUvynswQAAAAAAAAAAAAAAAgAAABRob3N0LmV4YW1wbGUuY29tLWtleQAAABQAAAAQaG9zdC5leGFtcGxlLmNvbQAAAABeSMJ4AAAAAHBPBLwAAAAAAAAAAAAAAAAAAAEXAAAAB3NzaC1yc2EAAAADAQABAAABAQC+D11D0hEbn2Vglv4YRJ8pZNyHjIGmvth3DWOQrq++2vH2MujmGQDxfr4SVE9GpMBlKU3lwGbpgIBxAg6yZcNSfo6PWVU9ACg6NMFO+yMzc2MaG+/naQdNjSewywF5j2rkNO2XOaViRVSrZroe2B/aY2LTV0jDl8nu5NOjwRs1/s7SLe5z1rw/X0dpmXk0qJY3gQhmR8HZZ1dhEkJUGwaBCPd0T8asSYf1Ag2rUD4aQ28r3q69mbwfWOOa6rMemVZruUV5dzHwVNVNtVv+ImtnYtz8m8g+K0plaGptHn3KsaOnASkh3tujhaE7kvc4HR9Igli9+76jhZie3h/dTN5zAAABFAAAAAxyc2Etc2hhMi0yNTYAAAEAbG4De/+QiqopPS3O1H7ySeEUCY56qmdgr02sFErnihdXPDaWXUXxacvJHaEtLrSTSaPL/3v3iKvjLWDOHaQ5c+cN9J7Tqzso7RQCXZD2nK9bwCUyBoiDyBCRe8w4DQEtfL5okpVzQsSAiojQ8hBohMOpy3gFfXrdm4PVC1ZKqlZh4fAc7ajieRq/Tpq2xOLdHwxkcgPNR83WVHva6K9/xjev/5n227/gkHo0qbGs8YYDOFXIEhENi+B23IzxdNVieWdyQpYpe0C2i95Jhyo0wJmaFY2ArruTS+D1jGQQpMPvAQRy26/A5hI83GLhpwyhrN/M8wCxzAhyPL6Ieuh5tQ== host.example.com + "rsa-sha2-256": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAg4+hKHVPKv183MU/Q7XD/mzDBFSc2YY3eraltxLMGJo0AAAADAQABAAABAQCe6jMoy1xCQgiZkZJ7gi6NLj4uRqz2OaUGK/OJYZTfBqK+SlS9iymAluHu9K+cc4+0qxx0gn7dRTJWINSgzvca6ayYe995EKgD1hE5krh9BH0bRrXB+hGqyslcZOgLNO+v8jYojClQbRtET2tS+xb4k33GCuL5wgla2790ZgOQgs7huQUjG0S8c1W+EYt6fI4cWE/DeEBnv9sqryS8rOb0PbM6WUd7XBadwySFWYQUX0ei56GNt12Z4gADEGlFQV/OnV0PvnTcAMGUl0rfToPgJ4jgogWKoTVWuZ9wyA/x+2LRLRvgm2a969ig937/AH0i0Wq+FzqfK7EXQ99Yf5K/AAAAAAAAAAAAAAACAAAAFGhvc3QuZXhhbXBsZS5jb20ta2V5AAAAFAAAABBob3N0LmV4YW1wbGUuY29tAAAAAGXEzYAAAAAAd8sP4wAAAAAAAAAAAAAAAAAAARcAAAAHc3NoLXJzYQAAAAMBAAEAAAEBAL4PXUPSERufZWCW/hhEnylk3IeMgaa+2HcNY5Cur77a8fYy6OYZAPF+vhJUT0akwGUpTeXAZumAgHECDrJlw1J+jo9ZVT0AKDo0wU77IzNzYxob7+dpB02NJ7DLAXmPauQ07Zc5pWJFVKtmuh7YH9pjYtNXSMOXye7k06PBGzX+ztIt7nPWvD9fR2mZeTSoljeBCGZHwdlnV2ESQlQbBoEI93RPxqxJh/UCDatQPhpDbyverr2ZvB9Y45rqsx6ZVmu5RXl3MfBU1U21W/4ia2di3PybyD4rSmVoam0efcqxo6cBKSHe26OFoTuS9zgdH0iCWL37vqOFmJ7eH91M3nMAAAEUAAAADHJzYS1zaGEyLTI1NgAAAQA/ByIegNZYJRRl413S/8LxGvTZnbxsPwaluoJ/54niGZV9P28THz7d9jXfSHPjalhH93jNPfTYXvI4opnDC37ua1Nu8KKfk40IWXnnDdZLWraUxEidIzhmfVtz8kGdGoFQ8H0EzubL7zKNOTlfSfOoDlmQVOuxT/+eh2mEp4ri0/+8J1mLfLBr8tREX0/iaNjK+RKdcyTMicKursAYMCDdu8vlaphxea+ocyHM9izSX/l33t44V13ueTqIOh2Zbl2UE2k+jk+0dc1CmV0SEoiWiIyt8TRM4yQry1vPlQLsrf28sYM/QMwnhCVhyZO3vs5F25aQWrB9d51VEzBW9/fd host.example.com `), - "rsa-sha2-512": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgFGv4IpXfs4L/Y0b3rmUdPFhWoUrVnXuPxXr6aHGs7wgAAAADAQABAAAAgQC8A6FGHDiWCSREAXCq6yBfNVr0xCVG2CzvktFNRpue+RXrGs/2a6ySEJQb3IYquw7HlJgu6fg3WIWhOmHCjfpG0PrL4CRwbqQ2LaPPXhJErWYejcD8Di00cF3677+G10KMZk9RXbmHtuBFZT98wxg8j+ZsBMqGM1+7yrWUvynswQAAAAAAAAAAAAAAAgAAABRob3N0LmV4YW1wbGUuY29tLWtleQAAABQAAAAQaG9zdC5leGFtcGxlLmNvbQAAAABeSMRYAAAAAHBPBp4AAAAAAAAAAAAAAAAAAAEXAAAAB3NzaC1yc2EAAAADAQABAAABAQC+D11D0hEbn2Vglv4YRJ8pZNyHjIGmvth3DWOQrq++2vH2MujmGQDxfr4SVE9GpMBlKU3lwGbpgIBxAg6yZcNSfo6PWVU9ACg6NMFO+yMzc2MaG+/naQdNjSewywF5j2rkNO2XOaViRVSrZroe2B/aY2LTV0jDl8nu5NOjwRs1/s7SLe5z1rw/X0dpmXk0qJY3gQhmR8HZZ1dhEkJUGwaBCPd0T8asSYf1Ag2rUD4aQ28r3q69mbwfWOOa6rMemVZruUV5dzHwVNVNtVv+ImtnYtz8m8g+K0plaGptHn3KsaOnASkh3tujhaE7kvc4HR9Igli9+76jhZie3h/dTN5zAAABFAAAAAxyc2Etc2hhMi01MTIAAAEAnF4fVj6mm+UFeNCIf9AKJCv9WzymjjPvzzmaMWWkPWqoV0P0m5SiYfvbY9SbA73Blpv8SOr0DmpublF183kodREia4KyVuC8hLhSCV2Y16hy9MBegOZMepn80w+apj7Rn9QCz5OfEakDdztp6OWTBtqxnZFcTQ4XrgFkNWeWRElGdEvAVNn2WHwHi4EIdz0mdv48Imv5SPlOuW862ZdFG4Do1dUfDIiGsBofLlgcyIYlf+eNHul6sBeUkuwFxisMpI5DQzNp8PX1g/QJA2wzwT674PTqDXNttKjyh50Fdr4sXxm9Gz1+jVLoESvFNa55ERdSyAqNu4wTy11MZsWwSA== host.example.com + "rsa-sha2-512": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgylCoUX+0nYXGG9k/KEepcgEd22921eUwSQsYuQXKOswAAAADAQABAAABAQCe6jMoy1xCQgiZkZJ7gi6NLj4uRqz2OaUGK/OJYZTfBqK+SlS9iymAluHu9K+cc4+0qxx0gn7dRTJWINSgzvca6ayYe995EKgD1hE5krh9BH0bRrXB+hGqyslcZOgLNO+v8jYojClQbRtET2tS+xb4k33GCuL5wgla2790ZgOQgs7huQUjG0S8c1W+EYt6fI4cWE/DeEBnv9sqryS8rOb0PbM6WUd7XBadwySFWYQUX0ei56GNt12Z4gADEGlFQV/OnV0PvnTcAMGUl0rfToPgJ4jgogWKoTVWuZ9wyA/x+2LRLRvgm2a969ig937/AH0i0Wq+FzqfK7EXQ99Yf5K/AAAAAAAAAAAAAAACAAAAFGhvc3QuZXhhbXBsZS5jb20ta2V5AAAAFAAAABBob3N0LmV4YW1wbGUuY29tAAAAAGXEzbwAAAAAd8sQBAAAAAAAAAAAAAAAAAAAARcAAAAHc3NoLXJzYQAAAAMBAAEAAAEBAL4PXUPSERufZWCW/hhEnylk3IeMgaa+2HcNY5Cur77a8fYy6OYZAPF+vhJUT0akwGUpTeXAZumAgHECDrJlw1J+jo9ZVT0AKDo0wU77IzNzYxob7+dpB02NJ7DLAXmPauQ07Zc5pWJFVKtmuh7YH9pjYtNXSMOXye7k06PBGzX+ztIt7nPWvD9fR2mZeTSoljeBCGZHwdlnV2ESQlQbBoEI93RPxqxJh/UCDatQPhpDbyverr2ZvB9Y45rqsx6ZVmu5RXl3MfBU1U21W/4ia2di3PybyD4rSmVoam0efcqxo6cBKSHe26OFoTuS9zgdH0iCWL37vqOFmJ7eH91M3nMAAAEUAAAADHJzYS1zaGEyLTUxMgAAAQADOjYUNxzwYZ9O3zjjZSKhX7Ix/vUBq8lIltBRckbKJtcjn2/qqtaeZK2ijkbMlnPFCJ59U+Z6m4DMU6gxYF8R9/WJrANlWYNQ4+52fXjE/FPDdJkwB/kPWABt+ffEiM1HfzP4/zXgTtOJ0GogEzTYoMSrhAlpdKACUaC9nCQ7gjmO+owGkrB3ZbyQS4gltqQZjiBNqF0P/vxmXP+0Kx66/ei8JNYUpEPCXT4ZhLZmsVptaYD1gpvc2i5T5LnjRrsvf4Q6EKR+gwJEjlwnylhH5+h+ZU/KXydkw1hhEVP+ZiIBkVo9nN78Qb/VMJum4Fdoltuyh/9k2lMDeyDuRVh7 host.example.com `), // Assumes "ca" key above in file named "ca", sign a user cert for "rsa.pub" // using "testcertificate" as principal: // // ssh-keygen -s ca -I username -n testcertificate rsa.pub - "rsa-user-testcertificate": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgr0MnhSf+KkQWEQmweJOGsJfOrUt80pQZDaI9YiwSfDUAAAADAQABAAAAgQC8A6FGHDiWCSREAXCq6yBfNVr0xCVG2CzvktFNRpue+RXrGs/2a6ySEJQb3IYquw7HlJgu6fg3WIWhOmHCjfpG0PrL4CRwbqQ2LaPPXhJErWYejcD8Di00cF3677+G10KMZk9RXbmHtuBFZT98wxg8j+ZsBMqGM1+7yrWUvynswQAAAAAAAAAAAAAAAQAAAAh1c2VybmFtZQAAABMAAAAPdGVzdGNlcnRpZmljYXRlAAAAAAAAAAD//////////wAAAAAAAACCAAAAFXBlcm1pdC1YMTEtZm9yd2FyZGluZwAAAAAAAAAXcGVybWl0LWFnZW50LWZvcndhcmRpbmcAAAAAAAAAFnBlcm1pdC1wb3J0LWZvcndhcmRpbmcAAAAAAAAACnBlcm1pdC1wdHkAAAAAAAAADnBlcm1pdC11c2VyLXJjAAAAAAAAAAAAAAEXAAAAB3NzaC1yc2EAAAADAQABAAABAQC+D11D0hEbn2Vglv4YRJ8pZNyHjIGmvth3DWOQrq++2vH2MujmGQDxfr4SVE9GpMBlKU3lwGbpgIBxAg6yZcNSfo6PWVU9ACg6NMFO+yMzc2MaG+/naQdNjSewywF5j2rkNO2XOaViRVSrZroe2B/aY2LTV0jDl8nu5NOjwRs1/s7SLe5z1rw/X0dpmXk0qJY3gQhmR8HZZ1dhEkJUGwaBCPd0T8asSYf1Ag2rUD4aQ28r3q69mbwfWOOa6rMemVZruUV5dzHwVNVNtVv+ImtnYtz8m8g+K0plaGptHn3KsaOnASkh3tujhaE7kvc4HR9Igli9+76jhZie3h/dTN5zAAABFAAAAAxyc2Etc2hhMi01MTIAAAEAFuA+67KvnlmcodIp0Lv4mR9UW/CHghAaN1csBJTkI8mx3wXKyIPTsS2uXboEhWD0a7S9gps2SEwC5m6E3kV2Rzg7aH1S03GZqMvVlK2VHe7fzuoW2yOKk6yEPjeTF0pKCFbUQ6mce8pRpD/zdvjG0Z287XM3c3Axlrn7qq7TS0MDTjEZ/dsUNFHxep3co/HuAsWVWPVDItr/FPnvZ6WVH1yc8N/AJn0gLHobkGgug22vI9rNIge1wrnXxU9BUSouzkau/PQsrCQapnn+I1H7HaQt0wdk45nxMP+ags+HRI9qpX/p8WDn6+zpqYqN/nfw2aoytyaJqhsV32Teuqtrgg== rsa.pub + "rsa-user-testcertificate": []byte(`ssh-rsa-cert-v01@openssh.com AAAAHHNzaC1yc2EtY2VydC12MDFAb3BlbnNzaC5jb20AAAAgEKR9xnhXkbi/pgP669VjBH6XVTYR0yx1wunCtOIjzjUAAAADAQABAAABAQCe6jMoy1xCQgiZkZJ7gi6NLj4uRqz2OaUGK/OJYZTfBqK+SlS9iymAluHu9K+cc4+0qxx0gn7dRTJWINSgzvca6ayYe995EKgD1hE5krh9BH0bRrXB+hGqyslcZOgLNO+v8jYojClQbRtET2tS+xb4k33GCuL5wgla2790ZgOQgs7huQUjG0S8c1W+EYt6fI4cWE/DeEBnv9sqryS8rOb0PbM6WUd7XBadwySFWYQUX0ei56GNt12Z4gADEGlFQV/OnV0PvnTcAMGUl0rfToPgJ4jgogWKoTVWuZ9wyA/x+2LRLRvgm2a969ig937/AH0i0Wq+FzqfK7EXQ99Yf5K/AAAAAAAAAAAAAAABAAAACHVzZXJuYW1lAAAAEwAAAA90ZXN0Y2VydGlmaWNhdGUAAAAAAAAAAP//////////AAAAAAAAAIIAAAAVcGVybWl0LVgxMS1mb3J3YXJkaW5nAAAAAAAAABdwZXJtaXQtYWdlbnQtZm9yd2FyZGluZwAAAAAAAAAWcGVybWl0LXBvcnQtZm9yd2FyZGluZwAAAAAAAAAKcGVybWl0LXB0eQAAAAAAAAAOcGVybWl0LXVzZXItcmMAAAAAAAAAAAAAARcAAAAHc3NoLXJzYQAAAAMBAAEAAAEBAL4PXUPSERufZWCW/hhEnylk3IeMgaa+2HcNY5Cur77a8fYy6OYZAPF+vhJUT0akwGUpTeXAZumAgHECDrJlw1J+jo9ZVT0AKDo0wU77IzNzYxob7+dpB02NJ7DLAXmPauQ07Zc5pWJFVKtmuh7YH9pjYtNXSMOXye7k06PBGzX+ztIt7nPWvD9fR2mZeTSoljeBCGZHwdlnV2ESQlQbBoEI93RPxqxJh/UCDatQPhpDbyverr2ZvB9Y45rqsx6ZVmu5RXl3MfBU1U21W/4ia2di3PybyD4rSmVoam0efcqxo6cBKSHe26OFoTuS9zgdH0iCWL37vqOFmJ7eH91M3nMAAAEUAAAADHJzYS1zaGEyLTUxMgAAAQCKVn2S7FJYhXTRVbcz1Di1HLz1g5Yae5WBhd0Tg471XkNw7ylcCK23Wnrzj1GxrW0oWCCGHROtUnxQXei1xNWt8HONN+eeafrJSZJR6ald3Yd4OveXlHNT6mEDPgqRj4B56OPoY33LzpaFlQZlZ6U9KXySshNaCTjVp3ojTj6uPNxcuOnG9O5emEPC2eaM4QYsz4cqHNJ9SWWEu+HIQgpx5SM12qcrq/KhN6WJhG3edL1YAsxkf1/THEcfi6wvsHi+DKewJ5hZ876//japjHBKA9SB7gsCrLx3m+0XI8TkWUZTZJHETJHHVjJN8xjRvMv5gWKLvRsz7ScmnN1+ckL6 rsa.pub `), } From 1a865804d5a867febcb0162512f4e5d947ba641e Mon Sep 17 00:00:00 2001 From: Lynn Boger Date: Fri, 19 Jan 2024 15:50:36 -0600 Subject: [PATCH 34/95] x/crypto/internal/poly1305: improve sum_ppc64le.s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This contains a few minor improvements to sum_ppc64le.s which result in up to 10% performance improvement for some of the benchmarks in this directory. - ADDZE followed by ADD can be combined into ADDE - PCALIGN added to the loop - Eliminate a few unnecessary register moves goos: linux goarch: ppc64le pkg: golang.org/x/crypto/internal/poly1305 cpu: POWER10 │ poly.orig.out │ poly.out │ │ sec/op │ sec/op vs base │ 64 40.34n ± 0% 38.13n ± 0% -5.47% (p=0.002 n=6) 1K 482.2n ± 0% 444.6n ± 0% -7.81% (p=0.002 n=6) 2M 978.4µ ± 0% 879.3µ ± 0% -10.12% (p=0.002 n=6) 64Unaligned 40.35n ± 0% 38.16n ± 0% -5.42% (p=0.002 n=6) 1KUnaligned 482.0n ± 0% 444.2n ± 0% -7.84% (p=0.002 n=6) 2MUnaligned 978.4µ ± 0% 879.4µ ± 0% -10.12% (p=0.002 n=6) Write64 32.69n ± 0% 30.71n ± 0% -6.04% (p=0.002 n=6) Write1K 472.4n ± 0% 436.5n ± 0% -7.60% (p=0.002 n=6) Write2M 978.3µ ± 0% 879.4µ ± 0% -10.11% (p=0.002 n=6) Write64Unaligned 32.67n ± 0% 30.71n ± 0% -6.00% (p=0.002 n=6) Write1KUnaligned 472.6n ± 0% 436.4n ± 0% -7.66% (p=0.002 n=6) Write2MUnaligned 978.5µ ± 0% 879.6µ ± 0% -10.10% (p=0.002 n=6) geomean 2.569µ 2.367µ -7.87% Change-Id: I63314e7252ef10fb2d157f623c4bc2e31a63ae32 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/558775 Reviewed-by: David Chase Reviewed-by: Michael Knyszek LUCI-TryBot-Result: Go LUCI Reviewed-by: Paul Murphy Run-TryBot: Lynn Boger TryBot-Result: Gopher Robot Reviewed-by: Than McIntosh --- internal/poly1305/sum_ppc64le.s | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/internal/poly1305/sum_ppc64le.s b/internal/poly1305/sum_ppc64le.s index d2ca5deeb9..b3c1699bff 100644 --- a/internal/poly1305/sum_ppc64le.s +++ b/internal/poly1305/sum_ppc64le.s @@ -19,15 +19,14 @@ #define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \ MULLD r0, h0, t0; \ - MULLD r0, h1, t4; \ MULHDU r0, h0, t1; \ + MULLD r0, h1, t4; \ MULHDU r0, h1, t5; \ ADDC t4, t1, t1; \ MULLD r0, h2, t2; \ - ADDZE t5; \ MULHDU r1, h0, t4; \ MULLD r1, h0, h0; \ - ADD t5, t2, t2; \ + ADDE t5, t2, t2; \ ADDC h0, t1, t1; \ MULLD h2, r1, t3; \ ADDZE t4, h0; \ @@ -37,13 +36,11 @@ ADDE t5, t3, t3; \ ADDC h0, t2, t2; \ MOVD $-4, t4; \ - MOVD t0, h0; \ - MOVD t1, h1; \ ADDZE t3; \ - ANDCC $3, t2, h2; \ - AND t2, t4, t0; \ + RLDICL $0, t2, $62, h2; \ + AND t2, t4, h0; \ ADDC t0, h0, h0; \ - ADDE t3, h1, h1; \ + ADDE t3, t1, h1; \ SLD $62, t3, t4; \ SRD $2, t2; \ ADDZE h2; \ @@ -75,6 +72,7 @@ TEXT ·update(SB), $0-32 loop: POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22) + PCALIGN $16 multiply: POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21) ADD $-16, R5 From 5bead598a0d4042616ba0daf31d8398ff6c06eaf Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Thu, 15 Feb 2024 19:55:13 -0800 Subject: [PATCH 35/95] ocsp: don't use iota for externally defined constants Style fix: iota shouldn't be used for values that come from RFCs or things that go over the wire or to disk. Change-Id: Ib903ec1bce7e71ff81094db3de8d0b71e16f52f3 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/564695 Reviewed-by: Dmitri Shuralyov Auto-Submit: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker --- ocsp/ocsp.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ocsp/ocsp.go b/ocsp/ocsp.go index 4269ed113b..bf2259537d 100644 --- a/ocsp/ocsp.go +++ b/ocsp/ocsp.go @@ -279,21 +279,22 @@ func getOIDFromHashAlgorithm(target crypto.Hash) asn1.ObjectIdentifier { // This is the exposed reflection of the internal OCSP structures. -// The status values that can be expressed in OCSP. See RFC 6960. +// The status values that can be expressed in OCSP. See RFC 6960. +// These are used for the Response.Status field. const ( // Good means that the certificate is valid. - Good = iota + Good = 0 // Revoked means that the certificate has been deliberately revoked. - Revoked + Revoked = 1 // Unknown means that the OCSP responder doesn't know about the certificate. - Unknown + Unknown = 2 // ServerFailed is unused and was never used (see // https://go-review.googlesource.com/#/c/18944). ParseResponse will // return a ResponseError when an error response is parsed. - ServerFailed + ServerFailed = 3 ) -// The enumerated reasons for revoking a certificate. See RFC 5280. +// The enumerated reasons for revoking a certificate. See RFC 5280. const ( Unspecified = 0 KeyCompromise = 1 From 0aab8d07aefab378c763e8f36aa007544a862aa9 Mon Sep 17 00:00:00 2001 From: Roland Shoemaker Date: Mon, 26 Feb 2024 09:29:18 -0800 Subject: [PATCH 36/95] all: update go.mod x/net dependency Not necessary for anything, but need to test something harmless. Change-Id: I4faa64ef2c8f1f14df5ccaa37836fc56beed4f50 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/566917 LUCI-TryBot-Result: Go LUCI Auto-Submit: Gopher Robot Reviewed-by: Dmitri Shuralyov Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 9cde59c44e..5954e5dbec 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module golang.org/x/crypto go 1.18 require ( - golang.org/x/net v0.10.0 // tagx:ignore + golang.org/x/net v0.21.0 // tagx:ignore golang.org/x/sys v0.17.0 golang.org/x/term v0.17.0 ) diff --git a/go.sum b/go.sum index 55835d583b..70b680b428 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.17.0 h1:mkTF7LCd6WGJNL3K1Ad7kwxNfYAW6a8a8QqtMblp/4U= From 0d2316b26734542fda0a2df69f586b811740edf4 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sun, 18 Feb 2024 15:50:20 +0100 Subject: [PATCH 37/95] ssh/test: work around for TestCiphers failures on macOS TestCiphers is already skipped on macOS when testing data received from the server, so move the test for sending data after the receiving one to work around this new integration test failure. Fixes golang/go#65732 Change-Id: Ie0c614c5373735ae8aefdd7ded643579b130f4b3 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/564457 Reviewed-by: Bryan Mills Reviewed-by: Carlos Amedee LUCI-TryBot-Result: Go LUCI Auto-Submit: Nicola Murino Commit-Queue: Nicola Murino --- ssh/test/session_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ssh/test/session_test.go b/ssh/test/session_test.go index 7925e95278..e9bfa9ec27 100644 --- a/ssh/test/session_test.go +++ b/ssh/test/session_test.go @@ -357,11 +357,6 @@ func testOneCipher(t *testing.T, cipher string, cipherOrder []string) { numBytes := 4096 - // Exercise sending data to the server - if _, _, err := conn.Conn.SendRequest("drop-me", false, make([]byte, numBytes)); err != nil { - t.Fatalf("SendRequest: %v", err) - } - // Exercise receiving data from the server session, err := conn.NewSession() if err != nil { @@ -377,6 +372,11 @@ func testOneCipher(t *testing.T, cipher string, cipherOrder []string) { if len(out) != numBytes { t.Fatalf("got %d bytes, want %d bytes", len(out), numBytes) } + + // Exercise sending data to the server + if _, _, err := conn.Conn.SendRequest("drop-me", false, make([]byte, numBytes)); err != nil { + t.Fatalf("SendRequest: %v", err) + } } var deprecatedCiphers = []string{ From 7067223927c4e3f3bb91a5c6e0d2aae83df74e7a Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 4 Mar 2024 17:42:56 +0000 Subject: [PATCH 38/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I54256afe881714f60fc37c471cd65b060090d9d6 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/568816 Reviewed-by: Than McIntosh Reviewed-by: Michael Knyszek LUCI-TryBot-Result: Go LUCI Auto-Submit: Gopher Robot --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 5954e5dbec..9dee0fe087 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.17.0 - golang.org/x/term v0.17.0 + golang.org/x/sys v0.18.0 + golang.org/x/term v0.18.0 ) require golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index 70b680b428..e8784c3e2e 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= -golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.17.0 h1:mkTF7LCd6WGJNL3K1Ad7kwxNfYAW6a8a8QqtMblp/4U= -golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= +golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= +golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From b91329d961d4ba7270e79ed661eb324c299ab812 Mon Sep 17 00:00:00 2001 From: cui fliter Date: Tue, 5 Mar 2024 13:01:48 +0800 Subject: [PATCH 39/95] all: remove redundant words in comments and fix some typos Change-Id: I3078492dc020770aca630e0b362df0212bd41e32 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/569156 Auto-Submit: Ian Lance Taylor Reviewed-by: David Chase Reviewed-by: Nicola Murino Reviewed-by: Ian Lance Taylor Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI --- internal/testenv/exec.go | 4 ++-- md4/md4.go | 2 +- ssh/certs_test.go | 18 +++++++++--------- ssh/example_test.go | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/internal/testenv/exec.go b/internal/testenv/exec.go index 4bacdc3ce8..df5d3c20df 100644 --- a/internal/testenv/exec.go +++ b/internal/testenv/exec.go @@ -57,8 +57,8 @@ func CommandContext(t testing.TB, ctx context.Context, name string, args ...stri // grace periods to clean up: one for the delay between the first // termination signal being sent (via the Cancel callback when the Context // expires) and the process being forcibly terminated (via the WaitDelay - // field), and a second one for the delay becween the process being - // terminated and and the test logging its output for debugging. + // field), and a second one for the delay between the process being + // terminated and the test logging its output for debugging. // // (We want to ensure that the test process itself has enough time to // log the output before it is also terminated.) diff --git a/md4/md4.go b/md4/md4.go index 59d3480693..d1911c2e86 100644 --- a/md4/md4.go +++ b/md4/md4.go @@ -4,7 +4,7 @@ // Package md4 implements the MD4 hash algorithm as defined in RFC 1320. // -// Deprecated: MD4 is cryptographically broken and should should only be used +// Deprecated: MD4 is cryptographically broken and should only be used // where compatibility with legacy systems, not security, is the goal. Instead, // use a secure hash like SHA-256 (from crypto/sha256). package md4 // import "golang.org/x/crypto/md4" diff --git a/ssh/certs_test.go b/ssh/certs_test.go index 97b7486d0c..66000f19a2 100644 --- a/ssh/certs_test.go +++ b/ssh/certs_test.go @@ -367,21 +367,21 @@ func TestCertTypes(t *testing.T) { func TestCertSignWithMultiAlgorithmSigner(t *testing.T) { type testcase struct { - sigAlgo string - algoritms []string + sigAlgo string + algorithms []string } cases := []testcase{ { - sigAlgo: KeyAlgoRSA, - algoritms: []string{KeyAlgoRSA, KeyAlgoRSASHA512}, + sigAlgo: KeyAlgoRSA, + algorithms: []string{KeyAlgoRSA, KeyAlgoRSASHA512}, }, { - sigAlgo: KeyAlgoRSASHA256, - algoritms: []string{KeyAlgoRSASHA256, KeyAlgoRSA, KeyAlgoRSASHA512}, + sigAlgo: KeyAlgoRSASHA256, + algorithms: []string{KeyAlgoRSASHA256, KeyAlgoRSA, KeyAlgoRSASHA512}, }, { - sigAlgo: KeyAlgoRSASHA512, - algoritms: []string{KeyAlgoRSASHA512, KeyAlgoRSASHA256}, + sigAlgo: KeyAlgoRSASHA512, + algorithms: []string{KeyAlgoRSASHA512, KeyAlgoRSASHA256}, }, } @@ -393,7 +393,7 @@ func TestCertSignWithMultiAlgorithmSigner(t *testing.T) { for _, c := range cases { t.Run(c.sigAlgo, func(t *testing.T) { - signer, err := NewSignerWithAlgorithms(testSigners["rsa"].(AlgorithmSigner), c.algoritms) + signer, err := NewSignerWithAlgorithms(testSigners["rsa"].(AlgorithmSigner), c.algorithms) if err != nil { t.Fatalf("NewSignerWithAlgorithms error: %v", err) } diff --git a/ssh/example_test.go b/ssh/example_test.go index 3920832c1a..97b3b6aba6 100644 --- a/ssh/example_test.go +++ b/ssh/example_test.go @@ -384,7 +384,7 @@ func ExampleCertificate_SignCert() { } mas, err := ssh.NewSignerWithAlgorithms(signer.(ssh.AlgorithmSigner), []string{ssh.KeyAlgoRSASHA256}) if err != nil { - log.Fatal("unable to create signer with algoritms: ", err) + log.Fatal("unable to create signer with algorithms: ", err) } certificate := ssh.Certificate{ Key: publicKey, From 8d0d405eedb9e8f9d0d381a5e85235684df0a868 Mon Sep 17 00:00:00 2001 From: Jayanth Krishnamurthy Date: Mon, 19 Feb 2024 13:17:46 -0600 Subject: [PATCH 40/95] x/crypto/chacha20: cleanup chacha_ppc64le.s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Adding PCALIGN before the loops - Changing WORD directive with corresponding Vector Merge EVEN/ODD word instructions - Replacing Branch Conditional (BC) with its extended mnemonic form BDNZ - VPERMXOR instruction usage in place of VXOR instructions followed by VRLW (rotate left) for cases of rotating in multiples of 8. This replacements give performace improvement both in time and space of around 7%-8% as listed below using benchstat tool. goos: linux goarch: ppc64le pkg: golang.org/x/crypto/chacha20 cpu: POWER10 | chacha20.prev.out | chacha20.new.out | | sec/op | sec/op vs base | ChaCha20/64 171.9n ± 0% 156.6n ± 1% -8.90% (p=0.002 n=6) ChaCha20/256 165.5n ± 0% 152.4n ± 0% -7.92% (p=0.002 n=6) ChaCha20/10x25 505.8n ± 0% 504.3n ± 2% -0.32% (p=0.589 n=6) ChaCha20/4096 2.265µ ± 0% 2.052µ ± 0% -9.40% (p=0.002 n=6) ChaCha20/100x40 5.359µ ± 3% 5.018µ ± 2% -6.37% (p=0.002 n=6) ChaCha20/65536 35.71µ ± 0% 32.29µ ± 0% -9.57% (p=0.002 n=6) ChaCha20/1000x65 44.63µ ± 0% 41.05µ ± 0% -8.02% (p=0.002 n=6) geomean 2.235µ 2.073µ -7.26% | chacha20.prev.out | chacha20.new.out | | B/s | B/s vs base | ChaCha20/64 355.1Mi ± 0% 389.8Mi ± 1% +9.78% (p=0.002 n=6) ChaCha20/256 1.440Gi ± 0% 1.565Gi ± 0% +8.62% (p=0.002 n=6) ChaCha20/10x25 471.3Mi ± 0% 472.8Mi ± 2% +0.31% (p=0.589 n=6) ChaCha20/4096 1.684Gi ± 0% 1.859Gi ± 0% +10.38% (p=0.002 n=6) ChaCha20/100x40 711.8Mi ± 3% 760.3Mi ± 2% +6.80% (p=0.002 n=6) ChaCha20/65536 1.709Gi ± 0% 1.890Gi ± 0% +10.59% (p=0.002 n=6) ChaCha20/1000x65 1.356Gi ± 0% 1.475Gi ± 0% +8.72% (p=0.002 n=6) geomean 957.3Mi 1.008Gi +7.83% Change-Id: Ib31cb10a2a11eacdacf0272fbfd887eb5ccd8bcb Reviewed-on: https://go-review.googlesource.com/c/crypto/+/564797 Reviewed-by: Lynn Boger Run-TryBot: Paul Murphy TryBot-Result: Gopher Robot Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Run-TryBot: Lynn Boger Reviewed-by: Cherry Mui --- chacha20/chacha_ppc64le.s | 110 ++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 58 deletions(-) diff --git a/chacha20/chacha_ppc64le.s b/chacha20/chacha_ppc64le.s index 66aebae258..c672ccf698 100644 --- a/chacha20/chacha_ppc64le.s +++ b/chacha20/chacha_ppc64le.s @@ -33,6 +33,9 @@ #define CONSTBASE R16 #define BLOCKS R17 +// for VPERMXOR +#define MASK R18 + DATA consts<>+0x00(SB)/8, $0x3320646e61707865 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 DATA consts<>+0x10(SB)/8, $0x0000000000000001 @@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 DATA consts<>+0x90(SB)/8, $0x0000000100000000 DATA consts<>+0x98(SB)/8, $0x0000000300000002 -GLOBL consts<>(SB), RODATA, $0xa0 +DATA consts<>+0xa0(SB)/8, $0x5566774411223300 +DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88 +DATA consts<>+0xb0(SB)/8, $0x6677445522330011 +DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899 +GLOBL consts<>(SB), RODATA, $0xc0 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 @@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $48, R10 MOVD $64, R11 SRD $6, LEN, BLOCKS + // for VPERMXOR + MOVD $consts<>+0xa0(SB), MASK + MOVD $16, R20 // V16 LXVW4X (CONSTBASE)(R0), VS48 ADD $80,CONSTBASE @@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 // V28 LXVW4X (CONSTBASE)(R11), VS60 + // Load mask constants for VPERMXOR + LXVW4X (MASK)(R0), V20 + LXVW4X (MASK)(R20), V21 + // splat slot from V19 -> V26 VSPLTW $0, V19, V26 @@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $10, R14 MOVD R14, CTR - + PCALIGN $16 loop_outer_vsx: // V0, V1, V2, V3 LXVW4X (R0)(CONSTBASE), VS32 @@ -128,22 +142,17 @@ loop_outer_vsx: VSPLTISW $12, V28 VSPLTISW $8, V29 VSPLTISW $7, V30 - + PCALIGN $16 loop_vsx: VADDUWM V0, V4, V0 VADDUWM V1, V5, V1 VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 - VRLW V15, V27, V15 + VPERMXOR V12, V0, V21, V12 + VPERMXOR V13, V1, V21, V13 + VPERMXOR V14, V2, V21, V14 + VPERMXOR V15, V3, V21, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -165,15 +174,10 @@ loop_vsx: VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 - VRLW V15, V29, V15 + VPERMXOR V12, V0, V20, V12 + VPERMXOR V13, V1, V20, V13 + VPERMXOR V14, V2, V20, V14 + VPERMXOR V15, V3, V20, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -195,15 +199,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V27, V15 - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 + VPERMXOR V15, V0, V21, V15 + VPERMXOR V12, V1, V21, V12 + VPERMXOR V13, V2, V21, V13 + VPERMXOR V14, V3, V21, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -225,15 +224,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V29, V15 - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 + VPERMXOR V15, V0, V20, V15 + VPERMXOR V12, V1, V20, V12 + VPERMXOR V13, V2, V20, V13 + VPERMXOR V14, V3, V20, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -249,48 +243,48 @@ loop_vsx: VRLW V6, V30, V6 VRLW V7, V30, V7 VRLW V4, V30, V4 - BC 16, LT, loop_vsx + BDNZ loop_vsx VADDUWM V12, V26, V12 - WORD $0x13600F8C // VMRGEW V0, V1, V27 - WORD $0x13821F8C // VMRGEW V2, V3, V28 + VMRGEW V0, V1, V27 + VMRGEW V2, V3, V28 - WORD $0x10000E8C // VMRGOW V0, V1, V0 - WORD $0x10421E8C // VMRGOW V2, V3, V2 + VMRGOW V0, V1, V0 + VMRGOW V2, V3, V2 - WORD $0x13A42F8C // VMRGEW V4, V5, V29 - WORD $0x13C63F8C // VMRGEW V6, V7, V30 + VMRGEW V4, V5, V29 + VMRGEW V6, V7, V30 XXPERMDI VS32, VS34, $0, VS33 XXPERMDI VS32, VS34, $3, VS35 XXPERMDI VS59, VS60, $0, VS32 XXPERMDI VS59, VS60, $3, VS34 - WORD $0x10842E8C // VMRGOW V4, V5, V4 - WORD $0x10C63E8C // VMRGOW V6, V7, V6 + VMRGOW V4, V5, V4 + VMRGOW V6, V7, V6 - WORD $0x13684F8C // VMRGEW V8, V9, V27 - WORD $0x138A5F8C // VMRGEW V10, V11, V28 + VMRGEW V8, V9, V27 + VMRGEW V10, V11, V28 XXPERMDI VS36, VS38, $0, VS37 XXPERMDI VS36, VS38, $3, VS39 XXPERMDI VS61, VS62, $0, VS36 XXPERMDI VS61, VS62, $3, VS38 - WORD $0x11084E8C // VMRGOW V8, V9, V8 - WORD $0x114A5E8C // VMRGOW V10, V11, V10 + VMRGOW V8, V9, V8 + VMRGOW V10, V11, V10 - WORD $0x13AC6F8C // VMRGEW V12, V13, V29 - WORD $0x13CE7F8C // VMRGEW V14, V15, V30 + VMRGEW V12, V13, V29 + VMRGEW V14, V15, V30 XXPERMDI VS40, VS42, $0, VS41 XXPERMDI VS40, VS42, $3, VS43 XXPERMDI VS59, VS60, $0, VS40 XXPERMDI VS59, VS60, $3, VS42 - WORD $0x118C6E8C // VMRGOW V12, V13, V12 - WORD $0x11CE7E8C // VMRGOW V14, V15, V14 + VMRGOW V12, V13, V12 + VMRGOW V14, V15, V14 VSPLTISW $4, V27 VADDUWM V26, V27, V26 @@ -431,7 +425,7 @@ tail_vsx: ADD $-1, R11, R12 ADD $-1, INP ADD $-1, OUT - + PCALIGN $16 looptail_vsx: // Copying the result to OUT // in bytes. @@ -439,7 +433,7 @@ looptail_vsx: MOVBZU 1(INP), TMP XOR KEY, TMP, KEY MOVBU KEY, 1(OUT) - BC 16, LT, looptail_vsx + BDNZ looptail_vsx // Clear the stack values STXVW4X VS48, (R11)(R0) From 6f79b5a20cd106b083a02ec17482450d25a1a8f3 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sat, 5 Aug 2023 15:38:29 +0200 Subject: [PATCH 41/95] ssh: add server side multi-step authentication Add support for sending back partial success to the client while handling authentication in the server. This is implemented by a special error that can be returned by any of the authentication methods, which contains the authentication methods to offer next. This patch is based on CL 399075 with some minor changes and the addition of test cases. Fixes golang/go#17889 Fixes golang/go#61447 Fixes golang/go#64974 Co-authored-by: Peter Verraedt Change-Id: I05c8f913bb407d22c2e41c4cbe965e36ab4739b0 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/516355 Reviewed-by: Andrew Lytvynov Reviewed-by: Than McIntosh Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda Auto-Submit: Filippo Valsorda --- ssh/server.go | 168 +++++++++----- ssh/server_multi_auth_test.go | 412 ++++++++++++++++++++++++++++++++++ 2 files changed, 530 insertions(+), 50 deletions(-) create mode 100644 ssh/server_multi_auth_test.go diff --git a/ssh/server.go b/ssh/server.go index c2dfe3268c..92d73233ba 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -426,6 +426,35 @@ func (l ServerAuthError) Error() string { return "[" + strings.Join(errs, ", ") + "]" } +// ServerAuthCallbacks defines server-side authentication callbacks. +type ServerAuthCallbacks struct { + // PasswordCallback behaves like [ServerConfig.PasswordCallback]. + PasswordCallback func(conn ConnMetadata, password []byte) (*Permissions, error) + + // PublicKeyCallback behaves like [ServerConfig.PublicKeyCallback]. + PublicKeyCallback func(conn ConnMetadata, key PublicKey) (*Permissions, error) + + // KeyboardInteractiveCallback behaves like [ServerConfig.KeyboardInteractiveCallback]. + KeyboardInteractiveCallback func(conn ConnMetadata, client KeyboardInteractiveChallenge) (*Permissions, error) + + // GSSAPIWithMICConfig behaves like [ServerConfig.GSSAPIWithMICConfig]. + GSSAPIWithMICConfig *GSSAPIWithMICConfig +} + +// PartialSuccessError can be returned by any of the [ServerConfig] +// authentication callbacks to indicate to the client that authentication has +// partially succeeded, but further steps are required. +type PartialSuccessError struct { + // Next defines the authentication callbacks to apply to further steps. The + // available methods communicated to the client are based on the non-nil + // ServerAuthCallbacks fields. + Next ServerAuthCallbacks +} + +func (p *PartialSuccessError) Error() string { + return "ssh: authenticated with partial success" +} + // ErrNoAuth is the error value returned if no // authentication method has been passed yet. This happens as a normal // part of the authentication loop, since the client first tries @@ -441,6 +470,15 @@ func (s *connection) serverAuthenticate(config *ServerConfig) (*Permissions, err authFailures := 0 var authErrs []error var displayedBanner bool + partialSuccessReturned := false + // Set the initial authentication callbacks from the config. They can be + // changed if a PartialSuccessError is returned. + authConfig := ServerAuthCallbacks{ + PasswordCallback: config.PasswordCallback, + PublicKeyCallback: config.PublicKeyCallback, + KeyboardInteractiveCallback: config.KeyboardInteractiveCallback, + GSSAPIWithMICConfig: config.GSSAPIWithMICConfig, + } userAuthLoop: for { @@ -471,6 +509,11 @@ userAuthLoop: return nil, errors.New("ssh: client attempted to negotiate for unknown service: " + userAuthReq.Service) } + if s.user != userAuthReq.User && partialSuccessReturned { + return nil, fmt.Errorf("ssh: client changed the user after a partial success authentication, previous user %q, current user %q", + s.user, userAuthReq.User) + } + s.user = userAuthReq.User if !displayedBanner && config.BannerCallback != nil { @@ -491,20 +534,17 @@ userAuthLoop: switch userAuthReq.Method { case "none": - if config.NoClientAuth { + // We don't allow none authentication after a partial success + // response. + if config.NoClientAuth && !partialSuccessReturned { if config.NoClientAuthCallback != nil { perms, authErr = config.NoClientAuthCallback(s) } else { authErr = nil } } - - // allow initial attempt of 'none' without penalty - if authFailures == 0 { - authFailures-- - } case "password": - if config.PasswordCallback == nil { + if authConfig.PasswordCallback == nil { authErr = errors.New("ssh: password auth not configured") break } @@ -518,17 +558,17 @@ userAuthLoop: return nil, parseError(msgUserAuthRequest) } - perms, authErr = config.PasswordCallback(s, password) + perms, authErr = authConfig.PasswordCallback(s, password) case "keyboard-interactive": - if config.KeyboardInteractiveCallback == nil { + if authConfig.KeyboardInteractiveCallback == nil { authErr = errors.New("ssh: keyboard-interactive auth not configured") break } prompter := &sshClientKeyboardInteractive{s} - perms, authErr = config.KeyboardInteractiveCallback(s, prompter.Challenge) + perms, authErr = authConfig.KeyboardInteractiveCallback(s, prompter.Challenge) case "publickey": - if config.PublicKeyCallback == nil { + if authConfig.PublicKeyCallback == nil { authErr = errors.New("ssh: publickey auth not configured") break } @@ -562,11 +602,18 @@ userAuthLoop: if !ok { candidate.user = s.user candidate.pubKeyData = pubKeyData - candidate.perms, candidate.result = config.PublicKeyCallback(s, pubKey) - if candidate.result == nil && candidate.perms != nil && candidate.perms.CriticalOptions != nil && candidate.perms.CriticalOptions[sourceAddressCriticalOption] != "" { - candidate.result = checkSourceAddress( + candidate.perms, candidate.result = authConfig.PublicKeyCallback(s, pubKey) + _, isPartialSuccessError := candidate.result.(*PartialSuccessError) + + if (candidate.result == nil || isPartialSuccessError) && + candidate.perms != nil && + candidate.perms.CriticalOptions != nil && + candidate.perms.CriticalOptions[sourceAddressCriticalOption] != "" { + if err := checkSourceAddress( s.RemoteAddr(), - candidate.perms.CriticalOptions[sourceAddressCriticalOption]) + candidate.perms.CriticalOptions[sourceAddressCriticalOption]); err != nil { + candidate.result = err + } } cache.add(candidate) } @@ -578,8 +625,8 @@ userAuthLoop: if len(payload) > 0 { return nil, parseError(msgUserAuthRequest) } - - if candidate.result == nil { + _, isPartialSuccessError := candidate.result.(*PartialSuccessError) + if candidate.result == nil || isPartialSuccessError { okMsg := userAuthPubKeyOkMsg{ Algo: algo, PubKey: pubKeyData, @@ -629,11 +676,11 @@ userAuthLoop: perms = candidate.perms } case "gssapi-with-mic": - if config.GSSAPIWithMICConfig == nil { + if authConfig.GSSAPIWithMICConfig == nil { authErr = errors.New("ssh: gssapi-with-mic auth not configured") break } - gssapiConfig := config.GSSAPIWithMICConfig + gssapiConfig := authConfig.GSSAPIWithMICConfig userAuthRequestGSSAPI, err := parseGSSAPIPayload(userAuthReq.Payload) if err != nil { return nil, parseError(msgUserAuthRequest) @@ -689,49 +736,70 @@ userAuthLoop: break userAuthLoop } - authFailures++ - if config.MaxAuthTries > 0 && authFailures >= config.MaxAuthTries { - // If we have hit the max attempts, don't bother sending the - // final SSH_MSG_USERAUTH_FAILURE message, since there are - // no more authentication methods which can be attempted, - // and this message may cause the client to re-attempt - // authentication while we send the disconnect message. - // Continue, and trigger the disconnect at the start of - // the loop. - // - // The SSH specification is somewhat confusing about this, - // RFC 4252 Section 5.1 requires each authentication failure - // be responded to with a respective SSH_MSG_USERAUTH_FAILURE - // message, but Section 4 says the server should disconnect - // after some number of attempts, but it isn't explicit which - // message should take precedence (i.e. should there be a failure - // message than a disconnect message, or if we are going to - // disconnect, should we only send that message.) - // - // Either way, OpenSSH disconnects immediately after the last - // failed authnetication attempt, and given they are typically - // considered the golden implementation it seems reasonable - // to match that behavior. - continue + var failureMsg userAuthFailureMsg + + if partialSuccess, ok := authErr.(*PartialSuccessError); ok { + // After a partial success error we don't allow changing the user + // name and execute the NoClientAuthCallback. + partialSuccessReturned = true + + // In case a partial success is returned, the server may send + // a new set of authentication methods. + authConfig = partialSuccess.Next + + // Reset pubkey cache, as the new PublicKeyCallback might + // accept a different set of public keys. + cache = pubKeyCache{} + + // Send back a partial success message to the user. + failureMsg.PartialSuccess = true + } else { + // Allow initial attempt of 'none' without penalty. + if authFailures > 0 || userAuthReq.Method != "none" { + authFailures++ + } + if config.MaxAuthTries > 0 && authFailures >= config.MaxAuthTries { + // If we have hit the max attempts, don't bother sending the + // final SSH_MSG_USERAUTH_FAILURE message, since there are + // no more authentication methods which can be attempted, + // and this message may cause the client to re-attempt + // authentication while we send the disconnect message. + // Continue, and trigger the disconnect at the start of + // the loop. + // + // The SSH specification is somewhat confusing about this, + // RFC 4252 Section 5.1 requires each authentication failure + // be responded to with a respective SSH_MSG_USERAUTH_FAILURE + // message, but Section 4 says the server should disconnect + // after some number of attempts, but it isn't explicit which + // message should take precedence (i.e. should there be a failure + // message than a disconnect message, or if we are going to + // disconnect, should we only send that message.) + // + // Either way, OpenSSH disconnects immediately after the last + // failed authnetication attempt, and given they are typically + // considered the golden implementation it seems reasonable + // to match that behavior. + continue + } } - var failureMsg userAuthFailureMsg - if config.PasswordCallback != nil { + if authConfig.PasswordCallback != nil { failureMsg.Methods = append(failureMsg.Methods, "password") } - if config.PublicKeyCallback != nil { + if authConfig.PublicKeyCallback != nil { failureMsg.Methods = append(failureMsg.Methods, "publickey") } - if config.KeyboardInteractiveCallback != nil { + if authConfig.KeyboardInteractiveCallback != nil { failureMsg.Methods = append(failureMsg.Methods, "keyboard-interactive") } - if config.GSSAPIWithMICConfig != nil && config.GSSAPIWithMICConfig.Server != nil && - config.GSSAPIWithMICConfig.AllowLogin != nil { + if authConfig.GSSAPIWithMICConfig != nil && authConfig.GSSAPIWithMICConfig.Server != nil && + authConfig.GSSAPIWithMICConfig.AllowLogin != nil { failureMsg.Methods = append(failureMsg.Methods, "gssapi-with-mic") } if len(failureMsg.Methods) == 0 { - return nil, errors.New("ssh: no authentication methods configured but NoClientAuth is also false") + return nil, errors.New("ssh: no authentication methods available") } if err := s.transport.writePacket(Marshal(&failureMsg)); err != nil { diff --git a/ssh/server_multi_auth_test.go b/ssh/server_multi_auth_test.go new file mode 100644 index 0000000000..3b39802437 --- /dev/null +++ b/ssh/server_multi_auth_test.go @@ -0,0 +1,412 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssh + +import ( + "bytes" + "errors" + "fmt" + "strings" + "testing" +) + +func doClientServerAuth(t *testing.T, serverConfig *ServerConfig, clientConfig *ClientConfig) ([]error, error) { + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + + var serverAuthErrors []error + + serverConfig.AddHostKey(testSigners["rsa"]) + serverConfig.AuthLogCallback = func(conn ConnMetadata, method string, err error) { + serverAuthErrors = append(serverAuthErrors, err) + } + go newServer(c1, serverConfig) + c, _, _, err := NewClientConn(c2, "", clientConfig) + if err == nil { + c.Close() + } + return serverAuthErrors, err +} + +func TestMultiStepAuth(t *testing.T) { + // This user can login with password, public key or public key + password. + username := "testuser" + // This user can login with public key + password only. + usernameSecondFactor := "testuser_second_factor" + errPwdAuthFailed := errors.New("password auth failed") + errWrongSequence := errors.New("wrong sequence") + + serverConfig := &ServerConfig{ + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if conn.User() == usernameSecondFactor { + return nil, errWrongSequence + } + if conn.User() == username && string(password) == clientPassword { + return nil, nil + } + return nil, errPwdAuthFailed + }, + PublicKeyCallback: func(conn ConnMetadata, key PublicKey) (*Permissions, error) { + if bytes.Equal(key.Marshal(), testPublicKeys["rsa"].Marshal()) { + if conn.User() == usernameSecondFactor { + return nil, &PartialSuccessError{ + Next: ServerAuthCallbacks{ + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if string(password) == clientPassword { + return nil, nil + } + return nil, errPwdAuthFailed + }, + }, + } + } + return nil, nil + } + return nil, fmt.Errorf("pubkey for %q not acceptable", conn.User()) + }, + } + + clientConfig := &ClientConfig{ + User: usernameSecondFactor, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err := doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + + // The error sequence is: + // - no auth passed yet + // - partial success + // - nil + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatalf("expected partial success error, got: %v", serverAuthErrors[1]) + } + // Now test a wrong sequence. + clientConfig.Auth = []AuthMethod{ + Password(clientPassword), + PublicKeys(testSigners["rsa"]), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("client login with wrong sequence must fail") + } + // The error sequence is: + // - no auth passed yet + // - wrong sequence + // - partial success + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if serverAuthErrors[1] != errWrongSequence { + t.Fatal("server not returned wrong sequence") + } + if _, ok := serverAuthErrors[2].(*PartialSuccessError); !ok { + t.Fatalf("expected partial success error, got: %v", serverAuthErrors[2]) + } + // Now test using a correct sequence but a wrong password before the right + // one. + n := 0 + passwords := []string{"WRONG", "WRONG", clientPassword} + clientConfig.Auth = []AuthMethod{ + PublicKeys(testSigners["rsa"]), + RetryableAuthMethod(PasswordCallback(func() (string, error) { + p := passwords[n] + n++ + return p, nil + }), 3), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + // The error sequence is: + // - no auth passed yet + // - partial success + // - wrong password + // - wrong password + // - nil + if len(serverAuthErrors) != 5 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + if serverAuthErrors[2] != errPwdAuthFailed { + t.Fatal("server not returned password authentication failed") + } + if serverAuthErrors[3] != errPwdAuthFailed { + t.Fatal("server not returned password authentication failed") + } + // Only password authentication should fail. + clientConfig.Auth = []AuthMethod{ + Password(clientPassword), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("client login with password only must fail") + } + // The error sequence is: + // - no auth passed yet + // - wrong sequence + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if serverAuthErrors[1] != errWrongSequence { + t.Fatal("server not returned wrong sequence") + } + + // Only public key authentication should fail. + clientConfig.Auth = []AuthMethod{ + PublicKeys(testSigners["rsa"]), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("client login with public key only must fail") + } + // The error sequence is: + // - no auth passed yet + // - partial success + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + + // Public key and wrong password. + clientConfig.Auth = []AuthMethod{ + PublicKeys(testSigners["rsa"]), + Password("WRONG"), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("client login with wrong password after public key must fail") + } + // The error sequence is: + // - no auth passed yet + // - partial success + // - password auth failed + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + if serverAuthErrors[2] != errPwdAuthFailed { + t.Fatal("server not returned password authentication failed") + } + + // Public key, public key again and then correct password. Public key + // authentication is attempted only once because the partial success error + // returns only "password" as the allowed authentication method. + clientConfig.Auth = []AuthMethod{ + PublicKeys(testSigners["rsa"]), + PublicKeys(testSigners["rsa"]), + Password(clientPassword), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + // The error sequence is: + // - no auth passed yet + // - partial success + // - nil + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + + // The unrestricted username can do anything + clientConfig = &ClientConfig{ + User: username, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + _, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("unrestricted client login error: %s", err) + } + + clientConfig = &ClientConfig{ + User: username, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + _, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("unrestricted client login error: %s", err) + } + + clientConfig = &ClientConfig{ + User: username, + Auth: []AuthMethod{ + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + _, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("unrestricted client login error: %s", err) + } +} + +func TestDynamicAuthCallbacks(t *testing.T) { + user1 := "user1" + user2 := "user2" + errInvalidCredentials := errors.New("invalid credentials") + + serverConfig := &ServerConfig{ + NoClientAuth: true, + NoClientAuthCallback: func(conn ConnMetadata) (*Permissions, error) { + switch conn.User() { + case user1: + return nil, &PartialSuccessError{ + Next: ServerAuthCallbacks{ + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if conn.User() == user1 && string(password) == clientPassword { + return nil, nil + } + return nil, errInvalidCredentials + }, + }, + } + case user2: + return nil, &PartialSuccessError{ + Next: ServerAuthCallbacks{ + PublicKeyCallback: func(conn ConnMetadata, key PublicKey) (*Permissions, error) { + if bytes.Equal(key.Marshal(), testPublicKeys["rsa"].Marshal()) { + if conn.User() == user2 { + return nil, nil + } + } + return nil, errInvalidCredentials + }, + }, + } + default: + return nil, errInvalidCredentials + } + }, + } + + clientConfig := &ClientConfig{ + User: user1, + Auth: []AuthMethod{ + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err := doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + // The error sequence is: + // - partial success + // - nil + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[0].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + + clientConfig = &ClientConfig{ + User: user2, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + // The error sequence is: + // - partial success + // - nil + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[0].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + + // user1 cannot login with public key + clientConfig = &ClientConfig{ + User: user1, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("user1 login with public key must fail") + } + if !strings.Contains(err.Error(), "no supported methods remain") { + t.Errorf("got %v, expected 'no supported methods remain'", err) + } + if len(serverAuthErrors) != 1 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[0].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + // user2 cannot login with password + clientConfig = &ClientConfig{ + User: user2, + Auth: []AuthMethod{ + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("user2 login with password must fail") + } + if !strings.Contains(err.Error(), "no supported methods remain") { + t.Errorf("got %v, expected 'no supported methods remain'", err) + } + if len(serverAuthErrors) != 1 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[0].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } +} From b92bf9480d25521fc2b8965f86fe152bdfb43f28 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Wed, 3 Apr 2024 11:21:09 +0200 Subject: [PATCH 42/95] ssh: respect MaxAuthTries also for "none" auth attempts Only the first "none" auth attempt is allowed without penality Change-Id: Ibe776e968ba406445eeb94e8f1959383b88c98f7 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/575995 Reviewed-by: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Auto-Submit: Nicola Murino Commit-Queue: Nicola Murino Reviewed-by: Than McIntosh --- ssh/server.go | 6 ++- ssh/server_test.go | 129 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 2 deletions(-) diff --git a/ssh/server.go b/ssh/server.go index 92d73233ba..e2ae4f891b 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -468,6 +468,7 @@ func (s *connection) serverAuthenticate(config *ServerConfig) (*Permissions, err var perms *Permissions authFailures := 0 + noneAuthCount := 0 var authErrs []error var displayedBanner bool partialSuccessReturned := false @@ -534,6 +535,7 @@ userAuthLoop: switch userAuthReq.Method { case "none": + noneAuthCount++ // We don't allow none authentication after a partial success // response. if config.NoClientAuth && !partialSuccessReturned { @@ -755,7 +757,7 @@ userAuthLoop: failureMsg.PartialSuccess = true } else { // Allow initial attempt of 'none' without penalty. - if authFailures > 0 || userAuthReq.Method != "none" { + if authFailures > 0 || userAuthReq.Method != "none" || noneAuthCount != 1 { authFailures++ } if config.MaxAuthTries > 0 && authFailures >= config.MaxAuthTries { @@ -777,7 +779,7 @@ userAuthLoop: // disconnect, should we only send that message.) // // Either way, OpenSSH disconnects immediately after the last - // failed authnetication attempt, and given they are typically + // failed authentication attempt, and given they are typically // considered the golden implementation it seems reasonable // to match that behavior. continue diff --git a/ssh/server_test.go b/ssh/server_test.go index a9b2bce0c1..5b47b9e0af 100644 --- a/ssh/server_test.go +++ b/ssh/server_test.go @@ -5,8 +5,10 @@ package ssh import ( + "errors" "io" "net" + "strings" "sync/atomic" "testing" "time" @@ -62,6 +64,133 @@ func TestClientAuthRestrictedPublicKeyAlgos(t *testing.T) { } } +func TestMaxAuthTriesNoneMethod(t *testing.T) { + username := "testuser" + serverConfig := &ServerConfig{ + MaxAuthTries: 2, + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if conn.User() == username && string(password) == clientPassword { + return nil, nil + } + return nil, errors.New("invalid credentials") + }, + } + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + + var serverAuthErrors []error + + serverConfig.AddHostKey(testSigners["rsa"]) + serverConfig.AuthLogCallback = func(conn ConnMetadata, method string, err error) { + serverAuthErrors = append(serverAuthErrors, err) + } + go newServer(c1, serverConfig) + + clientConfig := ClientConfig{ + User: username, + HostKeyCallback: InsecureIgnoreHostKey(), + } + clientConfig.SetDefaults() + // Our client will send 'none' auth only once, so we need to send the + // requests manually. + c := &connection{ + sshConn: sshConn{ + conn: c2, + user: username, + clientVersion: []byte(packageVersion), + }, + } + c.serverVersion, err = exchangeVersions(c.sshConn.conn, c.clientVersion) + if err != nil { + t.Fatalf("unable to exchange version: %v", err) + } + c.transport = newClientTransport( + newTransport(c.sshConn.conn, clientConfig.Rand, true /* is client */), + c.clientVersion, c.serverVersion, &clientConfig, "", c.sshConn.RemoteAddr()) + if err := c.transport.waitSession(); err != nil { + t.Fatalf("unable to wait session: %v", err) + } + c.sessionID = c.transport.getSessionID() + if err := c.transport.writePacket(Marshal(&serviceRequestMsg{serviceUserAuth})); err != nil { + t.Fatalf("unable to send ssh-userauth message: %v", err) + } + packet, err := c.transport.readPacket() + if err != nil { + t.Fatal(err) + } + if len(packet) > 0 && packet[0] == msgExtInfo { + packet, err = c.transport.readPacket() + if err != nil { + t.Fatal(err) + } + } + var serviceAccept serviceAcceptMsg + if err := Unmarshal(packet, &serviceAccept); err != nil { + t.Fatal(err) + } + for i := 0; i <= serverConfig.MaxAuthTries; i++ { + auth := new(noneAuth) + _, _, err := auth.auth(c.sessionID, clientConfig.User, c.transport, clientConfig.Rand, nil) + if i < serverConfig.MaxAuthTries { + if err != nil { + t.Fatal(err) + } + continue + } + if err == nil { + t.Fatal("client: got no error") + } else if !strings.Contains(err.Error(), "too many authentication failures") { + t.Fatalf("client: got unexpected error: %v", err) + } + } + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + for _, err := range serverAuthErrors { + if !errors.Is(err, ErrNoAuth) { + t.Errorf("go error: %v; want: %v", err, ErrNoAuth) + } + } +} + +func TestMaxAuthTriesFirstNoneAuthErrorIgnored(t *testing.T) { + username := "testuser" + serverConfig := &ServerConfig{ + MaxAuthTries: 1, + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if conn.User() == username && string(password) == clientPassword { + return nil, nil + } + return nil, errors.New("invalid credentials") + }, + } + clientConfig := &ClientConfig{ + User: username, + Auth: []AuthMethod{ + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err := doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if !errors.Is(serverAuthErrors[0], ErrNoAuth) { + t.Errorf("go error: %v; want: %v", serverAuthErrors[0], ErrNoAuth) + } + if serverAuthErrors[1] != nil { + t.Errorf("unexpected error: %v", serverAuthErrors[1]) + } +} + func TestNewServerConnValidationErrors(t *testing.T) { serverConf := &ServerConfig{ PublicKeyAuthAlgorithms: []string{CertAlgoRSAv01}, From d042a396a6de487c29b6907508ba7e86925f6e09 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Thu, 4 Apr 2024 16:13:36 +0000 Subject: [PATCH 43/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: Ib20227810c7e72942dd6fc33731fc613784aedec Reviewed-on: https://go-review.googlesource.com/c/crypto/+/576515 Reviewed-by: Than McIntosh Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Auto-Submit: Gopher Robot --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 9dee0fe087..4a12d7a426 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.18.0 - golang.org/x/term v0.18.0 + golang.org/x/sys v0.19.0 + golang.org/x/term v0.19.0 ) require golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index e8784c3e2e..fab3a4a74d 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= -golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= -golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= +golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.19.0 h1:+ThwsDv+tYfnJFhF4L8jITxu1tdTWRTZpdsWgEgjL6Q= +golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From 5defcc193aabc79299b09bc1e2e30445a3f78d4e Mon Sep 17 00:00:00 2001 From: Michael Munday Date: Sun, 14 Apr 2024 00:53:16 +0100 Subject: [PATCH 44/95] sha3: fix Sum results for SHAKE functions on s390x Sum was taking the digest from the state which is correct for SHA-3 functions but not for SHAKE functions. Updates golang/go#66804 Change-Id: If782464d773262075950e3168128c0d46e4a6530 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/578715 TryBot-Result: Gopher Robot Reviewed-by: Cherry Mui Reviewed-by: Than McIntosh LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda Run-TryBot: Michael Munday --- sha3/sha3_s390x.go | 19 +++++++++++++++++-- sha3/sha3_test.go | 28 ++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/sha3/sha3_s390x.go b/sha3/sha3_s390x.go index d861bca528..b4fbbf8695 100644 --- a/sha3/sha3_s390x.go +++ b/sha3/sha3_s390x.go @@ -143,6 +143,12 @@ func (s *asmState) Write(b []byte) (int, error) { // Read squeezes an arbitrary number of bytes from the sponge. func (s *asmState) Read(out []byte) (n int, err error) { + // The 'compute last message digest' instruction only stores the digest + // at the first operand (dst) for SHAKE functions. + if s.function != shake_128 && s.function != shake_256 { + panic("sha3: can only call Read for SHAKE functions") + } + n = len(out) // need to pad if we were absorbing @@ -202,8 +208,17 @@ func (s *asmState) Sum(b []byte) []byte { // Hash the buffer. Note that we don't clear it because we // aren't updating the state. - klmd(s.function, &a, nil, s.buf) - return append(b, a[:s.outputLen]...) + switch s.function { + case sha3_224, sha3_256, sha3_384, sha3_512: + klmd(s.function, &a, nil, s.buf) + return append(b, a[:s.outputLen]...) + case shake_128, shake_256: + d := make([]byte, s.outputLen, 64) + klmd(s.function, &a, d, s.buf) + return append(b, d[:s.outputLen]...) + default: + panic("sha3: unknown function") + } } // Reset resets the Hash to its initial state. diff --git a/sha3/sha3_test.go b/sha3/sha3_test.go index 83bd6195d6..afcb722e7a 100644 --- a/sha3/sha3_test.go +++ b/sha3/sha3_test.go @@ -188,6 +188,34 @@ func TestKeccak(t *testing.T) { } } +// TestShakeSum tests that the output of Sum matches the output of Read. +func TestShakeSum(t *testing.T) { + tests := [...]struct { + name string + hash ShakeHash + expectedLen int + }{ + {"SHAKE128", NewShake128(), 32}, + {"SHAKE256", NewShake256(), 64}, + {"cSHAKE128", NewCShake128([]byte{'X'}, nil), 32}, + {"cSHAKE256", NewCShake256([]byte{'X'}, nil), 64}, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := test.hash.Sum(nil) + if len(s) != test.expectedLen { + t.Errorf("Unexpected digest length: got %d, want %d", len(s), test.expectedLen) + } + r := make([]byte, test.expectedLen) + test.hash.Read(r) + if !bytes.Equal(s, r) { + t.Errorf("Mismatch between Sum and Read:\nSum: %s\nRead: %s", hex.EncodeToString(s), hex.EncodeToString(r)) + } + }) + } +} + // TestUnalignedWrite tests that writing data in an arbitrary pattern with // small input buffers. func TestUnalignedWrite(t *testing.T) { From 0da2a6a1bbc8e689a335bea68b5cc0e3e8728854 Mon Sep 17 00:00:00 2001 From: cuishuang Date: Tue, 16 Apr 2024 17:44:44 +0800 Subject: [PATCH 45/95] openpgp: fix function name in comment Change-Id: Ic788ebe311fafa0f5d9750d5f7f25fb70dc0606d Reviewed-on: https://go-review.googlesource.com/c/crypto/+/579175 Run-TryBot: shuang cui Auto-Submit: Ian Lance Taylor TryBot-Result: Gopher Robot Reviewed-by: Ian Lance Taylor LUCI-TryBot-Result: Go LUCI Reviewed-by: Cherry Mui --- openpgp/keys_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openpgp/keys_test.go b/openpgp/keys_test.go index 0eb1a9ef29..9631eb6408 100644 --- a/openpgp/keys_test.go +++ b/openpgp/keys_test.go @@ -132,7 +132,7 @@ func TestRevokedUserID(t *testing.T) { } } -// TestExternallyRevokableKey attempts to load and parse a key with a third party revocation permission. +// TestExternallyRevocableKey attempts to load and parse a key with a third party revocation permission. func TestExternallyRevocableKey(t *testing.T) { kring, err := ReadKeyRing(readerFromHex(subkeyUsageHex)) if err != nil { From ebb717d630028d3e29c90c55d73cb6de90d53c3e Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sat, 23 Mar 2024 12:10:24 +0100 Subject: [PATCH 46/95] ssh: validate key type in SSH_MSG_USERAUTH_PK_OK response According to RFC 4252 Section 7 the algorithm in SSH_MSG_USERAUTH_PK_OK should match that of the request but some servers send the key type instead. OpenSSH checks for the key type, so we do the same. Fixes golang/go#66438 Fixes golang/go#64785 Fixes golang/go#56342 Fixes golang/go#54027 Change-Id: I2f733f0faece097e44ba7a97c868d30a53e21d79 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/573360 Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI Run-TryBot: Nicola Murino Reviewed-by: Roland Shoemaker Reviewed-by: Filippo Valsorda TryBot-Result: Gopher Robot Reviewed-by: Joedian Reid --- ssh/client_auth.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ssh/client_auth.go b/ssh/client_auth.go index 34bf089d0b..9486c59862 100644 --- a/ssh/client_auth.go +++ b/ssh/client_auth.go @@ -404,10 +404,10 @@ func validateKey(key PublicKey, algo string, user string, c packetConn) (bool, e return false, err } - return confirmKeyAck(key, algo, c) + return confirmKeyAck(key, c) } -func confirmKeyAck(key PublicKey, algo string, c packetConn) (bool, error) { +func confirmKeyAck(key PublicKey, c packetConn) (bool, error) { pubKey := key.Marshal() for { @@ -425,7 +425,15 @@ func confirmKeyAck(key PublicKey, algo string, c packetConn) (bool, error) { if err := Unmarshal(packet, &msg); err != nil { return false, err } - if msg.Algo != algo || !bytes.Equal(msg.PubKey, pubKey) { + // According to RFC 4252 Section 7 the algorithm in + // SSH_MSG_USERAUTH_PK_OK should match that of the request but some + // servers send the key type instead. OpenSSH allows any algorithm + // that matches the public key, so we do the same. + // https://github.com/openssh/openssh-portable/blob/86bdd385/sshconnect2.c#L709 + if !contains(algorithmsForKeyFormat(key.Type()), msg.Algo) { + return false, nil + } + if !bytes.Equal(msg.PubKey, pubKey) { return false, nil } return true, nil From 905d78a692675acab06328af80cdfe0b681c8fc7 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Sun, 5 May 2024 13:11:57 +0000 Subject: [PATCH 47/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I19d5fc3e26b53fba06b4fbcf3817c44477265210 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/583355 Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Than McIntosh Reviewed-by: Dmitri Shuralyov --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index 4a12d7a426..e5e51cb287 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.19.0 - golang.org/x/term v0.19.0 + golang.org/x/sys v0.20.0 + golang.org/x/term v0.20.0 ) -require golang.org/x/text v0.14.0 // indirect +require golang.org/x/text v0.15.0 // indirect diff --git a/go.sum b/go.sum index fab3a4a74d..ea7e57f729 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.19.0 h1:+ThwsDv+tYfnJFhF4L8jITxu1tdTWRTZpdsWgEgjL6Q= -golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.20.0 h1:VnkxpohqXaOBYJtBmEppKUG6mXpi+4O6purfc2+sMhw= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From 10f366e7a2b3254f25277b2c11f89b3f26fb8df1 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 24 Nov 2023 01:16:03 +0100 Subject: [PATCH 48/95] sha3: simplify XOR functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit name old time/op new time/op delta PermutationFunction-4 398ns ± 0% 399ns ± 1% ~ (p=0.508 n=9+10) Sha3_512_MTU-4 8.34µs ± 1% 8.36µs ± 1% ~ (p=0.101 n=10+10) Sha3_384_MTU-4 6.00µs ± 0% 6.02µs ± 1% +0.47% (p=0.000 n=8+10) Sha3_256_MTU-4 4.78µs ± 0% 4.79µs ± 1% ~ (p=0.324 n=10+10) Sha3_224_MTU-4 4.57µs ± 1% 4.57µs ± 1% ~ (p=0.288 n=10+10) Shake128_MTU-4 3.87µs ± 0% 3.86µs ± 1% -0.22% (p=0.008 n=9+9) Shake256_MTU-4 4.17µs ± 0% 4.17µs ± 0% ~ (p=0.474 n=10+8) Shake256_16x-4 59.4µs ± 0% 59.7µs ± 0% +0.48% (p=0.000 n=9+8) Shake256_1MiB-4 3.19ms ± 1% 3.20ms ± 0% ~ (p=0.105 n=10+10) Sha3_512_1MiB-4 5.97ms ± 0% 6.01ms ± 0% +0.75% (p=0.000 n=10+10) name old speed new speed delta PermutationFunction-4 502MB/s ± 0% 502MB/s ± 0% ~ (p=0.497 n=9+10) Sha3_512_MTU-4 162MB/s ± 1% 161MB/s ± 1% ~ (p=0.101 n=10+10) Sha3_384_MTU-4 225MB/s ± 0% 224MB/s ± 1% -0.47% (p=0.000 n=8+10) Sha3_256_MTU-4 282MB/s ± 0% 282MB/s ± 1% ~ (p=0.325 n=10+10) Sha3_224_MTU-4 296MB/s ± 1% 295MB/s ± 1% ~ (p=0.280 n=10+10) Shake128_MTU-4 349MB/s ± 0% 350MB/s ± 1% +0.22% (p=0.008 n=9+9) Shake256_MTU-4 324MB/s ± 0% 324MB/s ± 0% ~ (p=0.459 n=10+8) Shake256_16x-4 276MB/s ± 0% 274MB/s ± 0% -0.48% (p=0.000 n=9+8) Shake256_1MiB-4 328MB/s ± 1% 327MB/s ± 0% ~ (p=0.105 n=10+10) Sha3_512_1MiB-4 176MB/s ± 0% 174MB/s ± 0% -0.74% (p=0.000 n=10+10) Change-Id: Ib8e571f3c9a0f84096df2f38ca96da197ad5be30 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/544815 Auto-Submit: Filippo Valsorda Reviewed-by: Roland Shoemaker Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI Reviewed-by: Mauri de Souza Meneguzzo --- sha3/sha3.go | 20 +-- sha3/sha3_test.go | 295 ++++++++++++++++++++---------------------- sha3/xor.go | 45 +++++-- sha3/xor_generic.go | 28 ---- sha3/xor_unaligned.go | 66 ---------- 5 files changed, 178 insertions(+), 276 deletions(-) delete mode 100644 sha3/xor_generic.go delete mode 100644 sha3/xor_unaligned.go diff --git a/sha3/sha3.go b/sha3/sha3.go index 4884d172a4..33bd73b0f6 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -40,7 +40,7 @@ type state struct { // Extendable-Output Functions (May 2014)" dsbyte byte - storage storageBuf + storage [maxRate]byte // Specific to SHA-3 and SHAKE. outputLen int // the default output size in bytes @@ -61,15 +61,15 @@ func (d *state) Reset() { d.a[i] = 0 } d.state = spongeAbsorbing - d.buf = d.storage.asBytes()[:0] + d.buf = d.storage[:0] } func (d *state) clone() *state { ret := *d if ret.state == spongeAbsorbing { - ret.buf = ret.storage.asBytes()[:len(ret.buf)] + ret.buf = ret.storage[:len(ret.buf)] } else { - ret.buf = ret.storage.asBytes()[d.rate-cap(d.buf) : d.rate] + ret.buf = ret.storage[d.rate-cap(d.buf) : d.rate] } return &ret @@ -83,13 +83,13 @@ func (d *state) permute() { // If we're absorbing, we need to xor the input into the state // before applying the permutation. xorIn(d, d.buf) - d.buf = d.storage.asBytes()[:0] + d.buf = d.storage[:0] keccakF1600(&d.a) case spongeSqueezing: // If we're squeezing, we need to apply the permutation before // copying more output. keccakF1600(&d.a) - d.buf = d.storage.asBytes()[:d.rate] + d.buf = d.storage[:d.rate] copyOut(d, d.buf) } } @@ -98,7 +98,7 @@ func (d *state) permute() { // the multi-bitrate 10..1 padding rule, and permutes the state. func (d *state) padAndPermute(dsbyte byte) { if d.buf == nil { - d.buf = d.storage.asBytes()[:0] + d.buf = d.storage[:0] } // Pad with this instance's domain-separator bits. We know that there's // at least one byte of space in d.buf because, if it were full, @@ -106,7 +106,7 @@ func (d *state) padAndPermute(dsbyte byte) { // first one bit for the padding. See the comment in the state struct. d.buf = append(d.buf, dsbyte) zerosStart := len(d.buf) - d.buf = d.storage.asBytes()[:d.rate] + d.buf = d.storage[:d.rate] for i := zerosStart; i < d.rate; i++ { d.buf[i] = 0 } @@ -117,7 +117,7 @@ func (d *state) padAndPermute(dsbyte byte) { // Apply the permutation d.permute() d.state = spongeSqueezing - d.buf = d.storage.asBytes()[:d.rate] + d.buf = d.storage[:d.rate] copyOut(d, d.buf) } @@ -128,7 +128,7 @@ func (d *state) Write(p []byte) (written int, err error) { panic("sha3: Write after Read") } if d.buf == nil { - d.buf = d.storage.asBytes()[:0] + d.buf = d.storage[:0] } written = len(p) diff --git a/sha3/sha3_test.go b/sha3/sha3_test.go index afcb722e7a..21e8cbad7b 100644 --- a/sha3/sha3_test.go +++ b/sha3/sha3_test.go @@ -76,86 +76,73 @@ type KeccakKats struct { } } -func testUnalignedAndGeneric(t *testing.T, testf func(impl string)) { - xorInOrig, copyOutOrig := xorIn, copyOut - xorIn, copyOut = xorInGeneric, copyOutGeneric - testf("generic") - if xorImplementationUnaligned != "generic" { - xorIn, copyOut = xorInUnaligned, copyOutUnaligned - testf("unaligned") - } - xorIn, copyOut = xorInOrig, copyOutOrig -} - // TestKeccakKats tests the SHA-3 and Shake implementations against all the // ShortMsgKATs from https://github.com/gvanas/KeccakCodePackage // (The testvectors are stored in keccakKats.json.deflate due to their length.) func TestKeccakKats(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - // Read the KATs. - deflated, err := os.Open(katFilename) - if err != nil { - t.Errorf("error opening %s: %s", katFilename, err) - } - file := flate.NewReader(deflated) - dec := json.NewDecoder(file) - var katSet KeccakKats - err = dec.Decode(&katSet) - if err != nil { - t.Errorf("error decoding KATs: %s", err) - } + // Read the KATs. + deflated, err := os.Open(katFilename) + if err != nil { + t.Errorf("error opening %s: %s", katFilename, err) + } + file := flate.NewReader(deflated) + dec := json.NewDecoder(file) + var katSet KeccakKats + err = dec.Decode(&katSet) + if err != nil { + t.Errorf("error decoding KATs: %s", err) + } - for algo, function := range testDigests { - d := function() - for _, kat := range katSet.Kats[algo] { - d.Reset() - in, err := hex.DecodeString(kat.Message) - if err != nil { - t.Errorf("error decoding KAT: %s", err) - } - d.Write(in[:kat.Length/8]) - got := strings.ToUpper(hex.EncodeToString(d.Sum(nil))) - if got != kat.Digest { - t.Errorf("function=%s, implementation=%s, length=%d\nmessage:\n %s\ngot:\n %s\nwanted:\n %s", - algo, impl, kat.Length, kat.Message, got, kat.Digest) - t.Logf("wanted %+v", kat) - t.FailNow() - } - continue + for algo, function := range testDigests { + d := function() + for _, kat := range katSet.Kats[algo] { + d.Reset() + in, err := hex.DecodeString(kat.Message) + if err != nil { + t.Errorf("error decoding KAT: %s", err) + } + d.Write(in[:kat.Length/8]) + got := strings.ToUpper(hex.EncodeToString(d.Sum(nil))) + if got != kat.Digest { + t.Errorf("function=%s, length=%d\nmessage:\n %s\ngot:\n %s\nwanted:\n %s", + algo, kat.Length, kat.Message, got, kat.Digest) + t.Logf("wanted %+v", kat) + t.FailNow() } + continue } + } - for algo, v := range testShakes { - for _, kat := range katSet.Kats[algo] { - N, err := hex.DecodeString(kat.N) - if err != nil { - t.Errorf("error decoding KAT: %s", err) - } + for algo, v := range testShakes { + for _, kat := range katSet.Kats[algo] { + N, err := hex.DecodeString(kat.N) + if err != nil { + t.Errorf("error decoding KAT: %s", err) + } - S, err := hex.DecodeString(kat.S) - if err != nil { - t.Errorf("error decoding KAT: %s", err) - } - d := v.constructor(N, S) - in, err := hex.DecodeString(kat.Message) - if err != nil { - t.Errorf("error decoding KAT: %s", err) - } + S, err := hex.DecodeString(kat.S) + if err != nil { + t.Errorf("error decoding KAT: %s", err) + } + d := v.constructor(N, S) + in, err := hex.DecodeString(kat.Message) + if err != nil { + t.Errorf("error decoding KAT: %s", err) + } - d.Write(in[:kat.Length/8]) - out := make([]byte, len(kat.Digest)/2) - d.Read(out) - got := strings.ToUpper(hex.EncodeToString(out)) - if got != kat.Digest { - t.Errorf("function=%s, implementation=%s, length=%d N:%s\n S:%s\nmessage:\n %s \ngot:\n %s\nwanted:\n %s", - algo, impl, kat.Length, kat.N, kat.S, kat.Message, got, kat.Digest) - t.Logf("wanted %+v", kat) - t.FailNow() - } - continue + d.Write(in[:kat.Length/8]) + out := make([]byte, len(kat.Digest)/2) + d.Read(out) + got := strings.ToUpper(hex.EncodeToString(out)) + if got != kat.Digest { + t.Errorf("function=%s, length=%d N:%s\n S:%s\nmessage:\n %s \ngot:\n %s\nwanted:\n %s", + algo, kat.Length, kat.N, kat.S, kat.Message, got, kat.Digest) + t.Logf("wanted %+v", kat) + t.FailNow() } + continue } - }) + } } // TestKeccak does a basic test of the non-standardized Keccak hash functions. @@ -219,119 +206,111 @@ func TestShakeSum(t *testing.T) { // TestUnalignedWrite tests that writing data in an arbitrary pattern with // small input buffers. func TestUnalignedWrite(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - buf := sequentialBytes(0x10000) - for alg, df := range testDigests { - d := df() - d.Reset() - d.Write(buf) - want := d.Sum(nil) - d.Reset() - for i := 0; i < len(buf); { - // Cycle through offsets which make a 137 byte sequence. - // Because 137 is prime this sequence should exercise all corner cases. - offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1} - for _, j := range offsets { - if v := len(buf) - i; v < j { - j = v - } - d.Write(buf[i : i+j]) - i += j + buf := sequentialBytes(0x10000) + for alg, df := range testDigests { + d := df() + d.Reset() + d.Write(buf) + want := d.Sum(nil) + d.Reset() + for i := 0; i < len(buf); { + // Cycle through offsets which make a 137 byte sequence. + // Because 137 is prime this sequence should exercise all corner cases. + offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1} + for _, j := range offsets { + if v := len(buf) - i; v < j { + j = v } - } - got := d.Sum(nil) - if !bytes.Equal(got, want) { - t.Errorf("Unaligned writes, implementation=%s, alg=%s\ngot %q, want %q", impl, alg, got, want) + d.Write(buf[i : i+j]) + i += j } } + got := d.Sum(nil) + if !bytes.Equal(got, want) { + t.Errorf("Unaligned writes, alg=%s\ngot %q, want %q", alg, got, want) + } + } - // Same for SHAKE - for alg, df := range testShakes { - want := make([]byte, 16) - got := make([]byte, 16) - d := df.constructor([]byte(df.defAlgoName), []byte(df.defCustomStr)) - - d.Reset() - d.Write(buf) - d.Read(want) - d.Reset() - for i := 0; i < len(buf); { - // Cycle through offsets which make a 137 byte sequence. - // Because 137 is prime this sequence should exercise all corner cases. - offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1} - for _, j := range offsets { - if v := len(buf) - i; v < j { - j = v - } - d.Write(buf[i : i+j]) - i += j + // Same for SHAKE + for alg, df := range testShakes { + want := make([]byte, 16) + got := make([]byte, 16) + d := df.constructor([]byte(df.defAlgoName), []byte(df.defCustomStr)) + + d.Reset() + d.Write(buf) + d.Read(want) + d.Reset() + for i := 0; i < len(buf); { + // Cycle through offsets which make a 137 byte sequence. + // Because 137 is prime this sequence should exercise all corner cases. + offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1} + for _, j := range offsets { + if v := len(buf) - i; v < j { + j = v } - } - d.Read(got) - if !bytes.Equal(got, want) { - t.Errorf("Unaligned writes, implementation=%s, alg=%s\ngot %q, want %q", impl, alg, got, want) + d.Write(buf[i : i+j]) + i += j } } - }) + d.Read(got) + if !bytes.Equal(got, want) { + t.Errorf("Unaligned writes, alg=%s\ngot %q, want %q", alg, got, want) + } + } } // TestAppend checks that appending works when reallocation is necessary. func TestAppend(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - d := New224() + d := New224() - for capacity := 2; capacity <= 66; capacity += 64 { - // The first time around the loop, Sum will have to reallocate. - // The second time, it will not. - buf := make([]byte, 2, capacity) - d.Reset() - d.Write([]byte{0xcc}) - buf = d.Sum(buf) - expected := "0000DF70ADC49B2E76EEE3A6931B93FA41841C3AF2CDF5B32A18B5478C39" - if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected { - t.Errorf("got %s, want %s", got, expected) - } + for capacity := 2; capacity <= 66; capacity += 64 { + // The first time around the loop, Sum will have to reallocate. + // The second time, it will not. + buf := make([]byte, 2, capacity) + d.Reset() + d.Write([]byte{0xcc}) + buf = d.Sum(buf) + expected := "0000DF70ADC49B2E76EEE3A6931B93FA41841C3AF2CDF5B32A18B5478C39" + if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected { + t.Errorf("got %s, want %s", got, expected) } - }) + } } // TestAppendNoRealloc tests that appending works when no reallocation is necessary. func TestAppendNoRealloc(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - buf := make([]byte, 1, 200) - d := New224() - d.Write([]byte{0xcc}) - buf = d.Sum(buf) - expected := "00DF70ADC49B2E76EEE3A6931B93FA41841C3AF2CDF5B32A18B5478C39" - if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected { - t.Errorf("%s: got %s, want %s", impl, got, expected) - } - }) + buf := make([]byte, 1, 200) + d := New224() + d.Write([]byte{0xcc}) + buf = d.Sum(buf) + expected := "00DF70ADC49B2E76EEE3A6931B93FA41841C3AF2CDF5B32A18B5478C39" + if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected { + t.Errorf("got %s, want %s", got, expected) + } } // TestSqueezing checks that squeezing the full output a single time produces // the same output as repeatedly squeezing the instance. func TestSqueezing(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - for algo, v := range testShakes { - d0 := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr)) - d0.Write([]byte(testString)) - ref := make([]byte, 32) - d0.Read(ref) - - d1 := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr)) - d1.Write([]byte(testString)) - var multiple []byte - for range ref { - one := make([]byte, 1) - d1.Read(one) - multiple = append(multiple, one...) - } - if !bytes.Equal(ref, multiple) { - t.Errorf("%s (%s): squeezing %d bytes one at a time failed", algo, impl, len(ref)) - } + for algo, v := range testShakes { + d0 := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr)) + d0.Write([]byte(testString)) + ref := make([]byte, 32) + d0.Read(ref) + + d1 := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr)) + d1.Write([]byte(testString)) + var multiple []byte + for range ref { + one := make([]byte, 1) + d1.Read(one) + multiple = append(multiple, one...) + } + if !bytes.Equal(ref, multiple) { + t.Errorf("%s: squeezing %d bytes one at a time failed", algo, len(ref)) } - }) + } } // sequentialBytes produces a buffer of size consecutive bytes 0x00, 0x01, ..., used for testing. diff --git a/sha3/xor.go b/sha3/xor.go index 7337cca88e..6ada5c9574 100644 --- a/sha3/xor.go +++ b/sha3/xor.go @@ -2,22 +2,39 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!amd64 && !386 && !ppc64le) || purego - package sha3 -// A storageBuf is an aligned array of maxRate bytes. -type storageBuf [maxRate]byte - -func (b *storageBuf) asBytes() *[maxRate]byte { - return (*[maxRate]byte)(b) -} +import ( + "crypto/subtle" + "encoding/binary" + "unsafe" -var ( - xorIn = xorInGeneric - copyOut = copyOutGeneric - xorInUnaligned = xorInGeneric - copyOutUnaligned = copyOutGeneric + "golang.org/x/sys/cpu" ) -const xorImplementationUnaligned = "generic" +// xorIn xors the bytes in buf into the state. +func xorIn(d *state, buf []byte) { + if cpu.IsBigEndian { + for i := 0; len(buf) >= 8; i++ { + a := binary.LittleEndian.Uint64(buf) + d.a[i] ^= a + buf = buf[8:] + } + } else { + ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) + subtle.XORBytes(ab[:], ab[:], buf) + } +} + +// copyOut copies uint64s to a byte buffer. +func copyOut(d *state, b []byte) { + if cpu.IsBigEndian { + for i := 0; len(b) >= 8; i++ { + binary.LittleEndian.PutUint64(b, d.a[i]) + b = b[8:] + } + } else { + ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) + copy(b, ab[:]) + } +} diff --git a/sha3/xor_generic.go b/sha3/xor_generic.go deleted file mode 100644 index 8d94771127..0000000000 --- a/sha3/xor_generic.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package sha3 - -import "encoding/binary" - -// xorInGeneric xors the bytes in buf into the state; it -// makes no non-portable assumptions about memory layout -// or alignment. -func xorInGeneric(d *state, buf []byte) { - n := len(buf) / 8 - - for i := 0; i < n; i++ { - a := binary.LittleEndian.Uint64(buf) - d.a[i] ^= a - buf = buf[8:] - } -} - -// copyOutGeneric copies uint64s to a byte buffer. -func copyOutGeneric(d *state, b []byte) { - for i := 0; len(b) >= 8; i++ { - binary.LittleEndian.PutUint64(b, d.a[i]) - b = b[8:] - } -} diff --git a/sha3/xor_unaligned.go b/sha3/xor_unaligned.go deleted file mode 100644 index 870e2d16e0..0000000000 --- a/sha3/xor_unaligned.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (amd64 || 386 || ppc64le) && !purego - -package sha3 - -import "unsafe" - -// A storageBuf is an aligned array of maxRate bytes. -type storageBuf [maxRate / 8]uint64 - -func (b *storageBuf) asBytes() *[maxRate]byte { - return (*[maxRate]byte)(unsafe.Pointer(b)) -} - -// xorInUnaligned uses unaligned reads and writes to update d.a to contain d.a -// XOR buf. -func xorInUnaligned(d *state, buf []byte) { - n := len(buf) - bw := (*[maxRate / 8]uint64)(unsafe.Pointer(&buf[0]))[: n/8 : n/8] - if n >= 72 { - d.a[0] ^= bw[0] - d.a[1] ^= bw[1] - d.a[2] ^= bw[2] - d.a[3] ^= bw[3] - d.a[4] ^= bw[4] - d.a[5] ^= bw[5] - d.a[6] ^= bw[6] - d.a[7] ^= bw[7] - d.a[8] ^= bw[8] - } - if n >= 104 { - d.a[9] ^= bw[9] - d.a[10] ^= bw[10] - d.a[11] ^= bw[11] - d.a[12] ^= bw[12] - } - if n >= 136 { - d.a[13] ^= bw[13] - d.a[14] ^= bw[14] - d.a[15] ^= bw[15] - d.a[16] ^= bw[16] - } - if n >= 144 { - d.a[17] ^= bw[17] - } - if n >= 168 { - d.a[18] ^= bw[18] - d.a[19] ^= bw[19] - d.a[20] ^= bw[20] - } -} - -func copyOutUnaligned(d *state, buf []byte) { - ab := (*[maxRate]uint8)(unsafe.Pointer(&d.a[0])) - copy(buf, ab[:]) -} - -var ( - xorIn = xorInUnaligned - copyOut = copyOutUnaligned -) - -const xorImplementationUnaligned = "unaligned" From 59b5a86796b9d310b31d416f56d93b5ce30da22b Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 24 Nov 2023 01:20:48 +0100 Subject: [PATCH 49/95] sha3: disable s390x assembly It was integrated in such a way that it made devirtualization impossible, leading to allocations on every platform. It can be reintroduced according to AssemblyPolicy and TargetSpecific. Updates #64897 Change-Id: I3a4edc91185c2928b2c9b80655a2bc8daa6b44e3 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/544816 Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Mauri de Souza Meneguzzo Auto-Submit: Filippo Valsorda Reviewed-by: Cherry Mui --- sha3/hashes.go | 12 ------------ sha3/hashes_generic.go | 27 --------------------------- sha3/sha3_s390x.go | 2 +- sha3/sha3_s390x.s | 2 +- sha3/shake.go | 6 ------ sha3/shake_generic.go | 19 ------------------- 6 files changed, 2 insertions(+), 66 deletions(-) delete mode 100644 sha3/hashes_generic.go delete mode 100644 sha3/shake_generic.go diff --git a/sha3/hashes.go b/sha3/hashes.go index 0d8043fd2a..1e815c9c7a 100644 --- a/sha3/hashes.go +++ b/sha3/hashes.go @@ -16,9 +16,6 @@ import ( // Its generic security strength is 224 bits against preimage attacks, // and 112 bits against collision attacks. func New224() hash.Hash { - if h := new224Asm(); h != nil { - return h - } return &state{rate: 144, outputLen: 28, dsbyte: 0x06} } @@ -26,9 +23,6 @@ func New224() hash.Hash { // Its generic security strength is 256 bits against preimage attacks, // and 128 bits against collision attacks. func New256() hash.Hash { - if h := new256Asm(); h != nil { - return h - } return &state{rate: 136, outputLen: 32, dsbyte: 0x06} } @@ -36,9 +30,6 @@ func New256() hash.Hash { // Its generic security strength is 384 bits against preimage attacks, // and 192 bits against collision attacks. func New384() hash.Hash { - if h := new384Asm(); h != nil { - return h - } return &state{rate: 104, outputLen: 48, dsbyte: 0x06} } @@ -46,9 +37,6 @@ func New384() hash.Hash { // Its generic security strength is 512 bits against preimage attacks, // and 256 bits against collision attacks. func New512() hash.Hash { - if h := new512Asm(); h != nil { - return h - } return &state{rate: 72, outputLen: 64, dsbyte: 0x06} } diff --git a/sha3/hashes_generic.go b/sha3/hashes_generic.go deleted file mode 100644 index fe8c84793c..0000000000 --- a/sha3/hashes_generic.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !gc || purego || !s390x - -package sha3 - -import ( - "hash" -) - -// new224Asm returns an assembly implementation of SHA3-224 if available, -// otherwise it returns nil. -func new224Asm() hash.Hash { return nil } - -// new256Asm returns an assembly implementation of SHA3-256 if available, -// otherwise it returns nil. -func new256Asm() hash.Hash { return nil } - -// new384Asm returns an assembly implementation of SHA3-384 if available, -// otherwise it returns nil. -func new384Asm() hash.Hash { return nil } - -// new512Asm returns an assembly implementation of SHA3-512 if available, -// otherwise it returns nil. -func new512Asm() hash.Hash { return nil } diff --git a/sha3/sha3_s390x.go b/sha3/sha3_s390x.go index b4fbbf8695..26b728b836 100644 --- a/sha3/sha3_s390x.go +++ b/sha3/sha3_s390x.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && ignore package sha3 diff --git a/sha3/sha3_s390x.s b/sha3/sha3_s390x.s index 826b862c77..df51683097 100644 --- a/sha3/sha3_s390x.s +++ b/sha3/sha3_s390x.s @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && ignore #include "textflag.h" diff --git a/sha3/shake.go b/sha3/shake.go index bb69984027..a31bcf898c 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -115,9 +115,6 @@ func (c *state) Clone() ShakeHash { // Its generic security strength is 128 bits against all attacks if at // least 32 bytes of its output are used. func NewShake128() ShakeHash { - if h := newShake128Asm(); h != nil { - return h - } return &state{rate: rate128, outputLen: 32, dsbyte: dsbyteShake} } @@ -125,9 +122,6 @@ func NewShake128() ShakeHash { // Its generic security strength is 256 bits against all attacks if // at least 64 bytes of its output are used. func NewShake256() ShakeHash { - if h := newShake256Asm(); h != nil { - return h - } return &state{rate: rate256, outputLen: 64, dsbyte: dsbyteShake} } diff --git a/sha3/shake_generic.go b/sha3/shake_generic.go deleted file mode 100644 index 8d31cf5be2..0000000000 --- a/sha3/shake_generic.go +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !gc || purego || !s390x - -package sha3 - -// newShake128Asm returns an assembly implementation of SHAKE-128 if available, -// otherwise it returns nil. -func newShake128Asm() ShakeHash { - return nil -} - -// newShake256Asm returns an assembly implementation of SHAKE-256 if available, -// otherwise it returns nil. -func newShake256Asm() ShakeHash { - return nil -} From 477a5b4c327a4fea3cab2fe127f89940289b65e5 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 24 Nov 2023 01:25:35 +0100 Subject: [PATCH 50/95] sha3: make APIs usable with zero allocations The "buf points into storage" pattern is nice, but causes the whole state struct to escape, since escape analysis can't track the pointer once it's assigned to buf. Change-Id: I31c0e83f946d66bedb5a180e96ab5d5e936eb322 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/544817 Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker Reviewed-by: Mauri de Souza Meneguzzo Auto-Submit: Filippo Valsorda --- sha3/allocations_test.go | 53 +++++++++++++++++++++++++++++++++++ sha3/sha3.go | 60 ++++++++++++++++------------------------ 2 files changed, 77 insertions(+), 36 deletions(-) create mode 100644 sha3/allocations_test.go diff --git a/sha3/allocations_test.go b/sha3/allocations_test.go new file mode 100644 index 0000000000..c925099304 --- /dev/null +++ b/sha3/allocations_test.go @@ -0,0 +1,53 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noopt + +package sha3_test + +import ( + "testing" + + "golang.org/x/crypto/sha3" +) + +var sink byte + +func TestAllocations(t *testing.T) { + t.Run("New", func(t *testing.T) { + if allocs := testing.AllocsPerRun(10, func() { + h := sha3.New256() + b := []byte("ABC") + h.Write(b) + out := make([]byte, 0, 32) + out = h.Sum(out) + sink ^= out[0] + }); allocs > 0 { + t.Errorf("expected zero allocations, got %0.1f", allocs) + } + }) + t.Run("NewShake", func(t *testing.T) { + if allocs := testing.AllocsPerRun(10, func() { + h := sha3.NewShake128() + b := []byte("ABC") + h.Write(b) + out := make([]byte, 0, 32) + out = h.Sum(out) + sink ^= out[0] + h.Read(out) + sink ^= out[0] + }); allocs > 0 { + t.Errorf("expected zero allocations, got %0.1f", allocs) + } + }) + t.Run("Sum", func(t *testing.T) { + if allocs := testing.AllocsPerRun(10, func() { + b := []byte("ABC") + out := sha3.Sum256(b) + sink ^= out[0] + }); allocs > 0 { + t.Errorf("expected zero allocations, got %0.1f", allocs) + } + }) +} diff --git a/sha3/sha3.go b/sha3/sha3.go index 33bd73b0f6..afedde5abf 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -23,7 +23,6 @@ const ( type state struct { // Generic sponge components. a [25]uint64 // main state of the hash - buf []byte // points into storage rate int // the number of bytes of state to use // dsbyte contains the "domain separation" bits and the first bit of @@ -40,6 +39,7 @@ type state struct { // Extendable-Output Functions (May 2014)" dsbyte byte + i, n int // storage[i:n] is the buffer, i is only used while squeezing storage [maxRate]byte // Specific to SHA-3 and SHAKE. @@ -54,24 +54,18 @@ func (d *state) BlockSize() int { return d.rate } func (d *state) Size() int { return d.outputLen } // Reset clears the internal state by zeroing the sponge state and -// the byte buffer, and setting Sponge.state to absorbing. +// the buffer indexes, and setting Sponge.state to absorbing. func (d *state) Reset() { // Zero the permutation's state. for i := range d.a { d.a[i] = 0 } d.state = spongeAbsorbing - d.buf = d.storage[:0] + d.i, d.n = 0, 0 } func (d *state) clone() *state { ret := *d - if ret.state == spongeAbsorbing { - ret.buf = ret.storage[:len(ret.buf)] - } else { - ret.buf = ret.storage[d.rate-cap(d.buf) : d.rate] - } - return &ret } @@ -82,43 +76,40 @@ func (d *state) permute() { case spongeAbsorbing: // If we're absorbing, we need to xor the input into the state // before applying the permutation. - xorIn(d, d.buf) - d.buf = d.storage[:0] + xorIn(d, d.storage[:d.rate]) + d.n = 0 keccakF1600(&d.a) case spongeSqueezing: // If we're squeezing, we need to apply the permutation before // copying more output. keccakF1600(&d.a) - d.buf = d.storage[:d.rate] - copyOut(d, d.buf) + d.i = 0 + copyOut(d, d.storage[:d.rate]) } } // pads appends the domain separation bits in dsbyte, applies // the multi-bitrate 10..1 padding rule, and permutes the state. -func (d *state) padAndPermute(dsbyte byte) { - if d.buf == nil { - d.buf = d.storage[:0] - } +func (d *state) padAndPermute() { // Pad with this instance's domain-separator bits. We know that there's // at least one byte of space in d.buf because, if it were full, // permute would have been called to empty it. dsbyte also contains the // first one bit for the padding. See the comment in the state struct. - d.buf = append(d.buf, dsbyte) - zerosStart := len(d.buf) - d.buf = d.storage[:d.rate] - for i := zerosStart; i < d.rate; i++ { - d.buf[i] = 0 + d.storage[d.n] = d.dsbyte + d.n++ + for d.n < d.rate { + d.storage[d.n] = 0 + d.n++ } // This adds the final one bit for the padding. Because of the way that // bits are numbered from the LSB upwards, the final bit is the MSB of // the last byte. - d.buf[d.rate-1] ^= 0x80 + d.storage[d.rate-1] ^= 0x80 // Apply the permutation d.permute() d.state = spongeSqueezing - d.buf = d.storage[:d.rate] - copyOut(d, d.buf) + d.n = d.rate + copyOut(d, d.storage[:d.rate]) } // Write absorbs more data into the hash's state. It panics if any @@ -127,28 +118,25 @@ func (d *state) Write(p []byte) (written int, err error) { if d.state != spongeAbsorbing { panic("sha3: Write after Read") } - if d.buf == nil { - d.buf = d.storage[:0] - } written = len(p) for len(p) > 0 { - if len(d.buf) == 0 && len(p) >= d.rate { + if d.n == 0 && len(p) >= d.rate { // The fast path; absorb a full "rate" bytes of input and apply the permutation. xorIn(d, p[:d.rate]) p = p[d.rate:] keccakF1600(&d.a) } else { // The slow path; buffer the input until we can fill the sponge, and then xor it in. - todo := d.rate - len(d.buf) + todo := d.rate - d.n if todo > len(p) { todo = len(p) } - d.buf = append(d.buf, p[:todo]...) + d.n += copy(d.storage[d.n:], p[:todo]) p = p[todo:] // If the sponge is full, apply the permutation. - if len(d.buf) == d.rate { + if d.n == d.rate { d.permute() } } @@ -161,19 +149,19 @@ func (d *state) Write(p []byte) (written int, err error) { func (d *state) Read(out []byte) (n int, err error) { // If we're still absorbing, pad and apply the permutation. if d.state == spongeAbsorbing { - d.padAndPermute(d.dsbyte) + d.padAndPermute() } n = len(out) // Now, do the squeezing. for len(out) > 0 { - n := copy(out, d.buf) - d.buf = d.buf[n:] + n := copy(out, d.storage[d.i:d.n]) + d.i += n out = out[n:] // Apply the permutation if we've squeezed the sponge dry. - if len(d.buf) == 0 { + if d.i == d.rate { d.permute() } } From 67b13616a59528f2f948f405d79d6e7df0b97d12 Mon Sep 17 00:00:00 2001 From: Mauri de Souza Meneguzzo Date: Sat, 6 Jan 2024 18:52:35 +0000 Subject: [PATCH 51/95] sha3: reenable s390x assembly Fixes golang/go#64897 Change-Id: I0c8c52d73a7d2df0f44fee36d407a87213f59bff Reviewed-on: https://go-review.googlesource.com/c/crypto/+/554435 TryBot-Result: Gopher Robot Reviewed-by: Cherry Mui Reviewed-by: Filippo Valsorda Reviewed-by: Roland Shoemaker Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI --- sha3/allocations_test.go | 14 ++++++++--- sha3/hashes.go | 22 +++++++++++++++--- sha3/hashes_noasm.go | 23 ++++++++++++++++++ sha3/sha3_s390x.go | 50 ++++++++++++++++++++-------------------- sha3/sha3_s390x.s | 2 +- sha3/shake.go | 10 +++++++- sha3/shake_noasm.go | 15 ++++++++++++ 7 files changed, 103 insertions(+), 33 deletions(-) create mode 100644 sha3/hashes_noasm.go create mode 100644 sha3/shake_noasm.go diff --git a/sha3/allocations_test.go b/sha3/allocations_test.go index c925099304..36de5d547e 100644 --- a/sha3/allocations_test.go +++ b/sha3/allocations_test.go @@ -7,6 +7,7 @@ package sha3_test import ( + "runtime" "testing" "golang.org/x/crypto/sha3" @@ -15,6 +16,13 @@ import ( var sink byte func TestAllocations(t *testing.T) { + want := 0.0 + + if runtime.GOARCH == "s390x" { + // On s390x the returned hash.Hash is conditional so it escapes. + want = 3.0 + } + t.Run("New", func(t *testing.T) { if allocs := testing.AllocsPerRun(10, func() { h := sha3.New256() @@ -23,7 +31,7 @@ func TestAllocations(t *testing.T) { out := make([]byte, 0, 32) out = h.Sum(out) sink ^= out[0] - }); allocs > 0 { + }); allocs > want { t.Errorf("expected zero allocations, got %0.1f", allocs) } }) @@ -37,7 +45,7 @@ func TestAllocations(t *testing.T) { sink ^= out[0] h.Read(out) sink ^= out[0] - }); allocs > 0 { + }); allocs > want { t.Errorf("expected zero allocations, got %0.1f", allocs) } }) @@ -46,7 +54,7 @@ func TestAllocations(t *testing.T) { b := []byte("ABC") out := sha3.Sum256(b) sink ^= out[0] - }); allocs > 0 { + }); allocs > want { t.Errorf("expected zero allocations, got %0.1f", allocs) } }) diff --git a/sha3/hashes.go b/sha3/hashes.go index 1e815c9c7a..5eae6cb922 100644 --- a/sha3/hashes.go +++ b/sha3/hashes.go @@ -16,27 +16,43 @@ import ( // Its generic security strength is 224 bits against preimage attacks, // and 112 bits against collision attacks. func New224() hash.Hash { - return &state{rate: 144, outputLen: 28, dsbyte: 0x06} + return new224() } // New256 creates a new SHA3-256 hash. // Its generic security strength is 256 bits against preimage attacks, // and 128 bits against collision attacks. func New256() hash.Hash { - return &state{rate: 136, outputLen: 32, dsbyte: 0x06} + return new256() } // New384 creates a new SHA3-384 hash. // Its generic security strength is 384 bits against preimage attacks, // and 192 bits against collision attacks. func New384() hash.Hash { - return &state{rate: 104, outputLen: 48, dsbyte: 0x06} + return new384() } // New512 creates a new SHA3-512 hash. // Its generic security strength is 512 bits against preimage attacks, // and 256 bits against collision attacks. func New512() hash.Hash { + return new512() +} + +func new224Generic() *state { + return &state{rate: 144, outputLen: 28, dsbyte: 0x06} +} + +func new256Generic() *state { + return &state{rate: 136, outputLen: 32, dsbyte: 0x06} +} + +func new384Generic() *state { + return &state{rate: 104, outputLen: 48, dsbyte: 0x06} +} + +func new512Generic() *state { return &state{rate: 72, outputLen: 64, dsbyte: 0x06} } diff --git a/sha3/hashes_noasm.go b/sha3/hashes_noasm.go new file mode 100644 index 0000000000..9d85fb6214 --- /dev/null +++ b/sha3/hashes_noasm.go @@ -0,0 +1,23 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !gc || purego || !s390x + +package sha3 + +func new224() *state { + return new224Generic() +} + +func new256() *state { + return new256Generic() +} + +func new384() *state { + return new384Generic() +} + +func new512() *state { + return new512Generic() +} diff --git a/sha3/sha3_s390x.go b/sha3/sha3_s390x.go index 26b728b836..00d8034ae6 100644 --- a/sha3/sha3_s390x.go +++ b/sha3/sha3_s390x.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego && ignore +//go:build gc && !purego package sha3 @@ -248,56 +248,56 @@ func (s *asmState) Clone() ShakeHash { return s.clone() } -// new224Asm returns an assembly implementation of SHA3-224 if available, -// otherwise it returns nil. -func new224Asm() hash.Hash { +// new224 returns an assembly implementation of SHA3-224 if available, +// otherwise it returns a generic implementation. +func new224() hash.Hash { if cpu.S390X.HasSHA3 { return newAsmState(sha3_224) } - return nil + return new224Generic() } -// new256Asm returns an assembly implementation of SHA3-256 if available, -// otherwise it returns nil. -func new256Asm() hash.Hash { +// new256 returns an assembly implementation of SHA3-256 if available, +// otherwise it returns a generic implementation. +func new256() hash.Hash { if cpu.S390X.HasSHA3 { return newAsmState(sha3_256) } - return nil + return new256Generic() } -// new384Asm returns an assembly implementation of SHA3-384 if available, -// otherwise it returns nil. -func new384Asm() hash.Hash { +// new384 returns an assembly implementation of SHA3-384 if available, +// otherwise it returns a generic implementation. +func new384() hash.Hash { if cpu.S390X.HasSHA3 { return newAsmState(sha3_384) } - return nil + return new384Generic() } -// new512Asm returns an assembly implementation of SHA3-512 if available, -// otherwise it returns nil. -func new512Asm() hash.Hash { +// new512 returns an assembly implementation of SHA3-512 if available, +// otherwise it returns a generic implementation. +func new512() hash.Hash { if cpu.S390X.HasSHA3 { return newAsmState(sha3_512) } - return nil + return new512Generic() } -// newShake128Asm returns an assembly implementation of SHAKE-128 if available, -// otherwise it returns nil. -func newShake128Asm() ShakeHash { +// newShake128 returns an assembly implementation of SHAKE-128 if available, +// otherwise it returns a generic implementation. +func newShake128() ShakeHash { if cpu.S390X.HasSHA3 { return newAsmState(shake_128) } - return nil + return newShake128Generic() } -// newShake256Asm returns an assembly implementation of SHAKE-256 if available, -// otherwise it returns nil. -func newShake256Asm() ShakeHash { +// newShake256 returns an assembly implementation of SHAKE-256 if available, +// otherwise it returns a generic implementation. +func newShake256() ShakeHash { if cpu.S390X.HasSHA3 { return newAsmState(shake_256) } - return nil + return newShake256Generic() } diff --git a/sha3/sha3_s390x.s b/sha3/sha3_s390x.s index df51683097..826b862c77 100644 --- a/sha3/sha3_s390x.s +++ b/sha3/sha3_s390x.s @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego && ignore +//go:build gc && !purego #include "textflag.h" diff --git a/sha3/shake.go b/sha3/shake.go index a31bcf898c..1ea9275b8b 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -115,13 +115,21 @@ func (c *state) Clone() ShakeHash { // Its generic security strength is 128 bits against all attacks if at // least 32 bytes of its output are used. func NewShake128() ShakeHash { - return &state{rate: rate128, outputLen: 32, dsbyte: dsbyteShake} + return newShake128() } // NewShake256 creates a new SHAKE256 variable-output-length ShakeHash. // Its generic security strength is 256 bits against all attacks if // at least 64 bytes of its output are used. func NewShake256() ShakeHash { + return newShake256() +} + +func newShake128Generic() *state { + return &state{rate: rate128, outputLen: 32, dsbyte: dsbyteShake} +} + +func newShake256Generic() *state { return &state{rate: rate256, outputLen: 64, dsbyte: dsbyteShake} } diff --git a/sha3/shake_noasm.go b/sha3/shake_noasm.go new file mode 100644 index 0000000000..4276ba4ab2 --- /dev/null +++ b/sha3/shake_noasm.go @@ -0,0 +1,15 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !gc || purego || !s390x + +package sha3 + +func newShake128() *state { + return newShake128Generic() +} + +func newShake256() *state { + return newShake256Generic() +} From 44c9b0ff9e71f015c49f686c68a7950fac76623c Mon Sep 17 00:00:00 2001 From: Andrew Lytvynov Date: Thu, 25 Jan 2024 18:32:22 -0700 Subject: [PATCH 52/95] ssh: allow server auth callbacks to send additional banners Add a new BannerError error type that auth callbacks can return to send banner to the client. While the BannerCallback can send the initial banner message, auth callbacks might want to communicate more information to the client to help them diagnose failures. Updates golang/go#64962 Change-Id: I97a26480ff4064b95a0a26042b0a5e19737cfb62 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/558695 LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker Reviewed-by: Nicola Murino Auto-Submit: Nicola Murino Reviewed-by: Dmitri Shuralyov --- ssh/server.go | 30 +++++++++++++++++++ ssh/server_test.go | 74 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/ssh/server.go b/ssh/server.go index e2ae4f891b..3ca9e89e22 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -462,6 +462,24 @@ func (p *PartialSuccessError) Error() string { // It is returned in ServerAuthError.Errors from NewServerConn. var ErrNoAuth = errors.New("ssh: no auth passed yet") +// BannerError is an error that can be returned by authentication handlers in +// ServerConfig to send a banner message to the client. +type BannerError struct { + Err error + Message string +} + +func (b *BannerError) Unwrap() error { + return b.Err +} + +func (b *BannerError) Error() string { + if b.Err == nil { + return b.Message + } + return b.Err.Error() +} + func (s *connection) serverAuthenticate(config *ServerConfig) (*Permissions, error) { sessionID := s.transport.getSessionID() var cache pubKeyCache @@ -734,6 +752,18 @@ userAuthLoop: config.AuthLogCallback(s, userAuthReq.Method, authErr) } + var bannerErr *BannerError + if errors.As(authErr, &bannerErr) { + if bannerErr.Message != "" { + bannerMsg := &userAuthBannerMsg{ + Message: bannerErr.Message, + } + if err := s.transport.writePacket(Marshal(bannerMsg)); err != nil { + return nil, err + } + } + } + if authErr == nil { break userAuthLoop } diff --git a/ssh/server_test.go b/ssh/server_test.go index 5b47b9e0af..9057a9b5f0 100644 --- a/ssh/server_test.go +++ b/ssh/server_test.go @@ -6,8 +6,10 @@ package ssh import ( "errors" + "fmt" "io" "net" + "slices" "strings" "sync/atomic" "testing" @@ -225,6 +227,78 @@ func TestNewServerConnValidationErrors(t *testing.T) { } } +func TestBannerError(t *testing.T) { + serverConfig := &ServerConfig{ + BannerCallback: func(ConnMetadata) string { + return "banner from BannerCallback" + }, + NoClientAuth: true, + NoClientAuthCallback: func(ConnMetadata) (*Permissions, error) { + err := &BannerError{ + Err: errors.New("error from NoClientAuthCallback"), + Message: "banner from NoClientAuthCallback", + } + return nil, fmt.Errorf("wrapped: %w", err) + }, + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + return &Permissions{}, nil + }, + PublicKeyCallback: func(conn ConnMetadata, key PublicKey) (*Permissions, error) { + return nil, &BannerError{ + Err: errors.New("error from PublicKeyCallback"), + Message: "banner from PublicKeyCallback", + } + }, + KeyboardInteractiveCallback: func(conn ConnMetadata, client KeyboardInteractiveChallenge) (*Permissions, error) { + return nil, &BannerError{ + Err: nil, // make sure that a nil inner error is allowed + Message: "banner from KeyboardInteractiveCallback", + } + }, + } + serverConfig.AddHostKey(testSigners["rsa"]) + + var banners []string + clientConfig := &ClientConfig{ + User: "test", + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + KeyboardInteractive(func(name, instruction string, questions []string, echos []bool) ([]string, error) { + return []string{"letmein"}, nil + }), + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + BannerCallback: func(msg string) error { + banners = append(banners, msg) + return nil + }, + } + + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + go newServer(c1, serverConfig) + c, _, _, err := NewClientConn(c2, "", clientConfig) + if err != nil { + t.Fatalf("client connection failed: %v", err) + } + defer c.Close() + + wantBanners := []string{ + "banner from BannerCallback", + "banner from NoClientAuthCallback", + "banner from PublicKeyCallback", + "banner from KeyboardInteractiveCallback", + } + if !slices.Equal(banners, wantBanners) { + t.Errorf("got banners:\n%q\nwant banners:\n%q", banners, wantBanners) + } +} + type markerConn struct { closed uint32 used uint32 From 349231f7e4e437ea89847c5dfce63eed67949f86 Mon Sep 17 00:00:00 2001 From: Mariano Cano Date: Thu, 7 Sep 2023 18:54:31 -0700 Subject: [PATCH 53/95] ssh: implement CryptoPublicKey on sk keys This commit implements the CryptoPublicKey interface for the skECDSAPublicKey and skEd25519PublicKey types. Fixes golang/go#62518 Change-Id: I2b8ac89196fbb3614bf5c675127bed23f1cf6b26 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/526875 LUCI-TryBot-Result: Go LUCI Reviewed-by: Matthew Dempsky Reviewed-by: Than McIntosh Auto-Submit: Nicola Murino Reviewed-by: Nicola Murino --- ssh/keys.go | 8 ++++++++ ssh/keys_test.go | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/ssh/keys.go b/ssh/keys.go index df4ebdada5..7967665f17 100644 --- a/ssh/keys.go +++ b/ssh/keys.go @@ -904,6 +904,10 @@ func (k *skECDSAPublicKey) Verify(data []byte, sig *Signature) error { return errors.New("ssh: signature did not verify") } +func (k *skECDSAPublicKey) CryptoPublicKey() crypto.PublicKey { + return &k.PublicKey +} + type skEd25519PublicKey struct { // application is a URL-like string, typically "ssh:" for SSH. // see openssh/PROTOCOL.u2f for details. @@ -1000,6 +1004,10 @@ func (k *skEd25519PublicKey) Verify(data []byte, sig *Signature) error { return nil } +func (k *skEd25519PublicKey) CryptoPublicKey() crypto.PublicKey { + return k.PublicKey +} + // NewSignerFromKey takes an *rsa.PrivateKey, *dsa.PrivateKey, // *ecdsa.PrivateKey or any other crypto.Signer and returns a // corresponding Signer instance. ECDSA keys must use P-256, P-384 or diff --git a/ssh/keys_test.go b/ssh/keys_test.go index 36e1857039..7b14429e17 100644 --- a/ssh/keys_test.go +++ b/ssh/keys_test.go @@ -726,3 +726,49 @@ func TestNewSignerWithAlgos(t *testing.T) { t.Error("signer with algos created with restricted algorithms") } } + +func TestCryptoPublicKey(t *testing.T) { + for _, priv := range testSigners { + p1 := priv.PublicKey() + key, ok := p1.(CryptoPublicKey) + if !ok { + continue + } + p2, err := NewPublicKey(key.CryptoPublicKey()) + if err != nil { + t.Fatalf("NewPublicKey(CryptoPublicKey) failed for %s, got: %v", p1.Type(), err) + } + if !reflect.DeepEqual(p1, p2) { + t.Errorf("got %#v in NewPublicKey, want %#v", p2, p1) + } + } + for _, d := range testdata.SKData { + p1, _, _, _, err := ParseAuthorizedKey(d.PubKey) + if err != nil { + t.Fatalf("parseAuthorizedKey returned error: %v", err) + } + k1, ok := p1.(CryptoPublicKey) + if !ok { + t.Fatalf("%T does not implement CryptoPublicKey", p1) + } + + var p2 PublicKey + switch pub := k1.CryptoPublicKey().(type) { + case *ecdsa.PublicKey: + p2 = &skECDSAPublicKey{ + application: "ssh:", + PublicKey: *pub, + } + case ed25519.PublicKey: + p2 = &skEd25519PublicKey{ + application: "ssh:", + PublicKey: pub, + } + default: + t.Fatalf("unexpected type %T from CryptoPublicKey()", pub) + } + if !reflect.DeepEqual(p1, p2) { + t.Errorf("got %#v, want %#v", p2, p1) + } + } +} From 0b431c7de36a66b1b5c54f6219ad1413824cd1fd Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 3 Jun 2024 16:01:22 +0000 Subject: [PATCH 54/95] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: I8a1b9637e83214674e6fe82ebf584e9b90446ca3 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/589875 Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Auto-Submit: Gopher Robot Reviewed-by: Damien Neil --- x509roots/fallback/bundle.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index 460c57b4d8..e56011afa0 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -1526,6 +1526,24 @@ u/8j72gZyxKTJ1wDLW8w0B62GqzeWvfRqqgnpv55gcR5mTNXuhKwqeBCbJPKVt7+ bYQLCIt+jerXmCHG8+c8eS9enNFMFY3h7CI3zJpDC5fcgJCNs2ebb0gIFVbPv/Er fF6adulZkMV8gzURZVE= -----END CERTIFICATE----- +# CN=FIRMAPROFESIONAL CA ROOT-A WEB,O=Firmaprofesional SA,C=ES,2.5.4.97=#130f56415445532d413632363334303638 +# bef256daf26e9c69bdec1602359798f3caf71821a03e018257c53c65617f3d4a +-----BEGIN CERTIFICATE----- +MIICejCCAgCgAwIBAgIQMZch7a+JQn81QYehZ1ZMbTAKBggqhkjOPQQDAzBuMQsw +CQYDVQQGEwJFUzEcMBoGA1UECgwTRmlybWFwcm9mZXNpb25hbCBTQTEYMBYGA1UE +YQwPVkFURVMtQTYyNjM0MDY4MScwJQYDVQQDDB5GSVJNQVBST0ZFU0lPTkFMIENB +IFJPT1QtQSBXRUIwHhcNMjIwNDA2MDkwMTM2WhcNNDcwMzMxMDkwMTM2WjBuMQsw +CQYDVQQGEwJFUzEcMBoGA1UECgwTRmlybWFwcm9mZXNpb25hbCBTQTEYMBYGA1UE +YQwPVkFURVMtQTYyNjM0MDY4MScwJQYDVQQDDB5GSVJNQVBST0ZFU0lPTkFMIENB +IFJPT1QtQSBXRUIwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAARHU+osEaR3xyrq89Zf +e9MEkVz6iMYiuYMQYneEMy3pA4jU4DP37XcsSmDq5G+tbbT4TIqk5B/K6k84Si6C +cyvHZpsKjECcfIr28jlgst7L7Ljkb+qbXbdTkBgyVcUgt5SjYzBhMA8GA1UdEwEB +/wQFMAMBAf8wHwYDVR0jBBgwFoAUk+FDY1w8ndYn81LsF7Kpryz3dvgwHQYDVR0O +BBYEFJPhQ2NcPJ3WJ/NS7Beyqa8s93b4MA4GA1UdDwEB/wQEAwIBBjAKBggqhkjO +PQQDAwNoADBlAjAdfKR7w4l1M+E7qUW/Runpod3JIha3RxEL2Jq68cgLcFBTApFw +hVmpHqTm6iMxoAACMQD94vizrxa5HnPEluPBMBnYfubDl94cT7iJLzPrSA8Z94dG +XSaQpYXFuXqUPoeovQA= +-----END CERTIFICATE----- # CN=GDCA TrustAUTH R5 ROOT,O=GUANG DONG CERTIFICATE AUTHORITY CO.\,LTD.,C=CN # bfff8fd04433487d6a8aa60c1a29767a9fc2bbb05e420f713a13b992891d3893 -----BEGIN CERTIFICATE----- From 332fd656f4f013f66e643818fe8c759538456535 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Tue, 4 Jun 2024 16:16:04 +0000 Subject: [PATCH 55/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I105ee0f343768881d4fe3a2bfd1fcbaa7e1fd705 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/590218 Auto-Submit: Gopher Robot Reviewed-by: Than McIntosh LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index e5e51cb287..cc1cf7ebed 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.20.0 - golang.org/x/term v0.20.0 + golang.org/x/sys v0.21.0 + golang.org/x/term v0.21.0 ) -require golang.org/x/text v0.15.0 // indirect +require golang.org/x/text v0.16.0 // indirect diff --git a/go.sum b/go.sum index ea7e57f729..b694c49e61 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= -golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.20.0 h1:VnkxpohqXaOBYJtBmEppKUG6mXpi+4O6purfc2+sMhw= -golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= -golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= -golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= From d4e7c9cb6cb8bb64ad4a1988cd26328ef6cb9023 Mon Sep 17 00:00:00 2001 From: samiponkanen Date: Fri, 17 May 2024 12:09:53 +0000 Subject: [PATCH 56/95] ssh: fail client auth immediately on receiving disconnect message Fixes golang/go#66991 Change-Id: I60dd8a807578f162fda0e49bcd6fbf289d444396 GitHub-Last-Rev: f88329d35712873d0d7e3b39b9b11e7bfbc28e71 GitHub-Pull-Request: golang/crypto#293 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/581075 Reviewed-by: Cherry Mui Reviewed-by: Roland Shoemaker Auto-Submit: Nicola Murino Reviewed-by: Nicola Murino LUCI-TryBot-Result: Go LUCI --- ssh/client_auth.go | 4 +++ ssh/test/session_test.go | 69 ++++++++++++++++++++++++++++++++++++++ ssh/test/test_unix_test.go | 9 +++-- 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/ssh/client_auth.go b/ssh/client_auth.go index 9486c59862..b93961010d 100644 --- a/ssh/client_auth.go +++ b/ssh/client_auth.go @@ -71,6 +71,10 @@ func (c *connection) clientAuthenticate(config *ClientConfig) error { for auth := AuthMethod(new(noneAuth)); auth != nil; { ok, methods, err := auth.auth(sessionID, config.User, c.transport, config.Rand, extensions) if err != nil { + // On disconnect, return error immediately + if _, ok := err.(*disconnectMsg); ok { + return err + } // We return the error later if there is no other method left to // try. ok = authFailure diff --git a/ssh/test/session_test.go b/ssh/test/session_test.go index e9bfa9ec27..53e6645633 100644 --- a/ssh/test/session_test.go +++ b/ssh/test/session_test.go @@ -468,3 +468,72 @@ func TestClientAuthAlgorithms(t *testing.T) { }) } } + +func TestClientAuthDisconnect(t *testing.T) { + // Use a static key that is not accepted by server. + // This key has been generated with following ssh-keygen command and + // used exclusively in this unit test: + // $ ssh-keygen -t RSA -b 2048 -f /tmp/static_key \ + // -C "Static RSA key for golang.org/x/crypto/ssh unit test" + + const privKeyData = `-----BEGIN OPENSSH PRIVATE KEY----- +b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAABFwAAAAdzc2gtcn +NhAAAAAwEAAQAAAQEAwV1Zg3MqX27nIQQNWd8V09P4q4F1fx7H2xNJdL3Yg3y91GFLJ92+ +0IiGV8n1VMGL/71PPhzyqBpUYSTpWjiU2JZSfA+iTg1GJBcOaEOA6vrXsTtXTHZ//mOT4d +mlvuP4+9NqfCBLGXN7ZJpT+amkD8AVW9YW9QN3ipY61ZWxPaAocVpDd8rVgJTk54KvaPa7 +t4ddOSQDQq61aubIDR1Z3P+XjkB4piWOsbck3HJL+veTALy12C09tAhwUnZUAXS+DjhxOL +xpDVclF/yXYhAvBvsjwyk/OC3+nK9F799hpQZsjxmbP7oN+tGwz06BUcAKi7u7QstENvvk +85SDZy1q1QAAA/A7ylbJO8pWyQAAAAdzc2gtcnNhAAABAQDBXVmDcypfbuchBA1Z3xXT0/ +irgXV/HsfbE0l0vdiDfL3UYUsn3b7QiIZXyfVUwYv/vU8+HPKoGlRhJOlaOJTYllJ8D6JO +DUYkFw5oQ4Dq+texO1dMdn/+Y5Ph2aW+4/j702p8IEsZc3tkmlP5qaQPwBVb1hb1A3eKlj +rVlbE9oChxWkN3ytWAlOTngq9o9ru3h105JANCrrVq5sgNHVnc/5eOQHimJY6xtyTcckv6 +95MAvLXYLT20CHBSdlQBdL4OOHE4vGkNVyUX/JdiEC8G+yPDKT84Lf6cr0Xv32GlBmyPGZ +s/ug360bDPToFRwAqLu7tCy0Q2++TzlINnLWrVAAAAAwEAAQAAAQAIvPDHMiyIxgCksGPF +uyv9F9U4XjVip8/abE9zkAMJWW5++wuT/bRlBOUPRrWIXZEM9ETbtsqswo3Wxah+7CjRIH +qR7SdFlYTP1jPk4yIKXF4OvggBUPySkMpAGJ9hwOMW8Ymcb4gn77JJ4aMoWIcXssje+XiC +8iO+4UWU3SV2i6K7flK1UDCI5JVCyBr3DVf3QhMOgvwJl9TgD7FzWy1hkjuZq/Pzdv+fA2 +OfrUFiSukLNolidNoI9+KWa1yxixE+B2oN4Xan3ZbqGbL6Wc1dB+K9h/bNcu+SKf7fXWRi +/vVG44A61xGDZzen1+eQlqFp7narkKXoaU71+45VXDThAAAAgBPWUdQykEEm0yOS6hPIW+ +hS8z1LXWGTEcag9fMwJXKE7cQFO3LEk+dXMbClHdhD/ydswOZYGSNepxwvmo/a5LiO2ulp +W+5tnsNhcK3skdaf71t+boUEXBNZ6u3WNTkU7tDN8h9tebI+xlNceDGSGjOlNoHQVMKZdA +W9TA4ZqXUPAAAAgQDWU0UZVOSCAOODPz4PYsbFKdCfXNP8O4+t9txyc9E3hsLAsVs+CpVX +Gr219MGLrublzAxojipyzuQb6Tp1l9nsu7VkcBrPL8I1tokz0AyTnmNF3A9KszBal7gGNS +a2qYuf6JO4cub1KzonxUJQHZPZq9YhCxOtDwTd+uyHZiPy9QAAAIEA5vayd+nfVJgCKTdf +z5MFsxBSUj/cAYg7JYPS/0bZ5bEkLosL22wl5Tm/ZftJa8apkyBPhguAWt6jEWLoDiK+kn +Fv0SaEq1HUdXgWmISVnWzv2pxdAtq/apmbxTg3iIJyrAwEDo13iImR3k6rNPx1m3i/jX56 +HLcvWM4Y6bFzbGEAAAA0U3RhdGljIFJTQSBrZXkgZm9yIGdvbGFuZy5vcmcveC9jcnlwdG +8vc3NoIHVuaXQgdGVzdAECAwQFBgc= +-----END OPENSSH PRIVATE KEY-----` + + signer, err := ssh.ParsePrivateKey([]byte(privKeyData)) + if err != nil { + t.Fatalf("failed to create signer from key: %v", err) + } + + // Start server with MaxAuthTries 1 and publickey and password auth + // enabled + server := newServerForConfig(t, "MaxAuthTries", map[string]string{}) + + // Connect to server, expect failure, that PublicKeysCallback is called + // and that PasswordCallback is not called. + publicKeysCallbackCalled := false + config := clientConfig() + config.Auth = []ssh.AuthMethod{ + ssh.PublicKeysCallback(func() ([]ssh.Signer, error) { + publicKeysCallbackCalled = true + return []ssh.Signer{signer}, nil + }), + ssh.PasswordCallback(func() (string, error) { + t.Errorf("unexpected call to PasswordCallback()") + return "notaverygoodpassword", nil + }), + } + client, err := server.TryDial(config) + if err == nil { + t.Errorf("expected TryDial() to fail") + _ = client.Close() + } + if !publicKeysCallbackCalled { + t.Errorf("expected PublicKeysCallback() to be called") + } +} diff --git a/ssh/test/test_unix_test.go b/ssh/test/test_unix_test.go index 9695de7319..12698e49ab 100644 --- a/ssh/test/test_unix_test.go +++ b/ssh/test/test_unix_test.go @@ -58,12 +58,17 @@ UsePAM yes PasswordAuthentication yes ChallengeResponseAuthentication yes AuthenticationMethods {{.AuthMethods}} +` + maxAuthTriesSshdConfigTail = ` +PasswordAuthentication yes +MaxAuthTries 1 ` ) var configTmpl = map[string]*template.Template{ - "default": template.Must(template.New("").Parse(defaultSshdConfig)), - "MultiAuth": template.Must(template.New("").Parse(defaultSshdConfig + multiAuthSshdConfigTail))} + "default": template.Must(template.New("").Parse(defaultSshdConfig)), + "MultiAuth": template.Must(template.New("").Parse(defaultSshdConfig + multiAuthSshdConfigTail)), + "MaxAuthTries": template.Must(template.New("").Parse(defaultSshdConfig + maxAuthTriesSshdConfigTail))} type server struct { t *testing.T From 1c7450041f58a5e714eb084ae0e670d3576ea9fb Mon Sep 17 00:00:00 2001 From: cuishuang Date: Tue, 25 Jun 2024 00:01:42 +0800 Subject: [PATCH 57/95] ssh/test: make struct comment match struct name Change-Id: I9bfd61fe96d2bdaa890379a1a31b7e0f3f2b67ed Reviewed-on: https://go-review.googlesource.com/c/crypto/+/594435 Reviewed-by: Ian Lance Taylor Auto-Submit: Ian Lance Taylor Reviewed-by: Nicola Murino LUCI-TryBot-Result: Go LUCI Run-TryBot: shuang cui Reviewed-by: Joedian Reid TryBot-Result: Gopher Robot --- ssh/test/server_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ssh/test/server_test.go b/ssh/test/server_test.go index e70241bc6e..5c04fba98c 100644 --- a/ssh/test/server_test.go +++ b/ssh/test/server_test.go @@ -14,7 +14,7 @@ type exitStatusMsg struct { Status uint32 } -// goServer is a test Go SSH server that accepts public key and certificate +// goTestServer is a test Go SSH server that accepts public key and certificate // authentication and replies with a 0 exit status to any exec request without // running any commands. type goTestServer struct { From a6a393ffd658b286f64f141b06cbd94e516d3a64 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 24 Jun 2024 13:31:21 +0200 Subject: [PATCH 58/95] all: bump go.mod version and drop compatibility shims Also, remove the legacy import annotations. Fixes golang/go#68147 Change-Id: Ibfcc9322f27224c0ba92ea42cd56912a7d8783fd Reviewed-on: https://go-review.googlesource.com/c/crypto/+/594256 Reviewed-by: Dmitri Shuralyov Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker --- acme/http.go | 21 +- acme/version_go112.go | 27 - bcrypt/bcrypt.go | 2 +- blake2s/blake2s.go | 10 +- blake2s/register.go | 21 - blowfish/cipher.go | 2 +- bn256/bn256.go | 2 +- cast5/cast5.go | 2 +- chacha20poly1305/chacha20poly1305.go | 2 +- cryptobyte/asn1/asn1.go | 2 +- cryptobyte/string.go | 2 +- curve25519/curve25519.go | 39 +- curve25519/curve25519_compat.go | 105 ---- curve25519/curve25519_go120.go | 46 -- curve25519/internal/field/README | 7 - .../internal/field/_asm/fe_amd64_asm.go | 298 ---------- curve25519/internal/field/_asm/go.mod | 16 - curve25519/internal/field/_asm/go.sum | 51 -- curve25519/internal/field/fe.go | 416 ------------- curve25519/internal/field/fe_alias_test.go | 126 ---- curve25519/internal/field/fe_amd64.go | 15 - curve25519/internal/field/fe_amd64.s | 378 ------------ curve25519/internal/field/fe_amd64_noasm.go | 11 - curve25519/internal/field/fe_arm64.go | 15 - curve25519/internal/field/fe_arm64.s | 42 -- curve25519/internal/field/fe_arm64_noasm.go | 11 - curve25519/internal/field/fe_bench_test.go | 36 -- curve25519/internal/field/fe_generic.go | 264 --------- curve25519/internal/field/fe_test.go | 558 ------------------ curve25519/internal/field/sync.checkpoint | 1 - curve25519/internal/field/sync.sh | 19 - ed25519/ed25519.go | 4 +- go.mod | 2 +- hkdf/hkdf.go | 2 +- internal/wycheproof/ecdh_stdlib_test.go | 2 - internal/wycheproof/eddsa_test.go | 2 - md4/md4.go | 2 +- nacl/box/box.go | 2 +- nacl/secretbox/secretbox.go | 2 +- ocsp/ocsp.go | 2 +- ocsp/ocsp_test.go | 2 - openpgp/armor/armor.go | 5 +- openpgp/clearsign/clearsign.go | 2 +- openpgp/elgamal/elgamal.go | 2 +- openpgp/errors/errors.go | 2 +- openpgp/packet/packet.go | 2 +- openpgp/read.go | 2 +- openpgp/s2k/s2k.go | 2 +- otr/otr.go | 2 +- pbkdf2/pbkdf2.go | 2 +- poly1305/poly1305_compat.go | 2 +- ripemd160/ripemd160.go | 2 +- salsa20/salsa/hsalsa20.go | 2 +- salsa20/salsa20.go | 2 +- scrypt/scrypt.go | 2 +- sha3/doc.go | 2 +- sha3/hashes.go | 8 + sha3/register.go | 18 - ssh/agent/client.go | 2 +- ssh/doc.go | 2 +- ssh/test/doc.go | 2 +- ssh/testdata/doc.go | 2 +- twofish/twofish.go | 2 +- xtea/cipher.go | 2 +- xts/xts.go | 2 +- 65 files changed, 110 insertions(+), 2532 deletions(-) delete mode 100644 acme/version_go112.go delete mode 100644 blake2s/register.go delete mode 100644 curve25519/curve25519_compat.go delete mode 100644 curve25519/curve25519_go120.go delete mode 100644 curve25519/internal/field/README delete mode 100644 curve25519/internal/field/_asm/fe_amd64_asm.go delete mode 100644 curve25519/internal/field/_asm/go.mod delete mode 100644 curve25519/internal/field/_asm/go.sum delete mode 100644 curve25519/internal/field/fe.go delete mode 100644 curve25519/internal/field/fe_alias_test.go delete mode 100644 curve25519/internal/field/fe_amd64.go delete mode 100644 curve25519/internal/field/fe_amd64.s delete mode 100644 curve25519/internal/field/fe_amd64_noasm.go delete mode 100644 curve25519/internal/field/fe_arm64.go delete mode 100644 curve25519/internal/field/fe_arm64.s delete mode 100644 curve25519/internal/field/fe_arm64_noasm.go delete mode 100644 curve25519/internal/field/fe_bench_test.go delete mode 100644 curve25519/internal/field/fe_generic.go delete mode 100644 curve25519/internal/field/fe_test.go delete mode 100644 curve25519/internal/field/sync.checkpoint delete mode 100755 curve25519/internal/field/sync.sh delete mode 100644 sha3/register.go diff --git a/acme/http.go b/acme/http.go index 58836e5d30..d92ff232fe 100644 --- a/acme/http.go +++ b/acme/http.go @@ -15,6 +15,7 @@ import ( "io" "math/big" "net/http" + "runtime/debug" "strconv" "strings" "time" @@ -271,9 +272,27 @@ func (c *Client) httpClient() *http.Client { } // packageVersion is the version of the module that contains this package, for -// sending as part of the User-Agent header. It's set in version_go112.go. +// sending as part of the User-Agent header. var packageVersion string +func init() { + // Set packageVersion if the binary was built in modules mode and x/crypto + // was not replaced with a different module. + info, ok := debug.ReadBuildInfo() + if !ok { + return + } + for _, m := range info.Deps { + if m.Path != "golang.org/x/crypto" { + continue + } + if m.Replace == nil { + packageVersion = m.Version + } + break + } +} + // userAgent returns the User-Agent header value. It includes the package name, // the module version (if available), and the c.UserAgent value (if set). func (c *Client) userAgent() string { diff --git a/acme/version_go112.go b/acme/version_go112.go deleted file mode 100644 index cc5fab604b..0000000000 --- a/acme/version_go112.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.12 - -package acme - -import "runtime/debug" - -func init() { - // Set packageVersion if the binary was built in modules mode and x/crypto - // was not replaced with a different module. - info, ok := debug.ReadBuildInfo() - if !ok { - return - } - for _, m := range info.Deps { - if m.Path != "golang.org/x/crypto" { - continue - } - if m.Replace == nil { - packageVersion = m.Version - } - break - } -} diff --git a/bcrypt/bcrypt.go b/bcrypt/bcrypt.go index 5577c0f939..dc9311870a 100644 --- a/bcrypt/bcrypt.go +++ b/bcrypt/bcrypt.go @@ -4,7 +4,7 @@ // Package bcrypt implements Provos and Mazières's bcrypt adaptive hashing // algorithm. See http://www.usenix.org/event/usenix99/provos/provos.pdf -package bcrypt // import "golang.org/x/crypto/bcrypt" +package bcrypt // The code is a port of Provos and Mazières's C implementation. import ( diff --git a/blake2s/blake2s.go b/blake2s/blake2s.go index e3f46aab3a..c25d07d4f4 100644 --- a/blake2s/blake2s.go +++ b/blake2s/blake2s.go @@ -16,9 +16,10 @@ // // BLAKE2X is a construction to compute hash values larger than 32 bytes. It // can produce hash values between 0 and 65535 bytes. -package blake2s // import "golang.org/x/crypto/blake2s" +package blake2s import ( + "crypto" "encoding/binary" "errors" "hash" @@ -55,6 +56,13 @@ func Sum256(data []byte) [Size]byte { // and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash. func New256(key []byte) (hash.Hash, error) { return newDigest(Size, key) } +func init() { + crypto.RegisterHash(crypto.BLAKE2s_256, func() hash.Hash { + h, _ := New256(nil) + return h + }) +} + // New128 returns a new hash.Hash computing the BLAKE2s-128 checksum given a // non-empty key. Note that a 128-bit digest is too small to be secure as a // cryptographic hash and should only be used as a MAC, thus the key argument diff --git a/blake2s/register.go b/blake2s/register.go deleted file mode 100644 index 3156148a42..0000000000 --- a/blake2s/register.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.9 - -package blake2s - -import ( - "crypto" - "hash" -) - -func init() { - newHash256 := func() hash.Hash { - h, _ := New256(nil) - return h - } - - crypto.RegisterHash(crypto.BLAKE2s_256, newHash256) -} diff --git a/blowfish/cipher.go b/blowfish/cipher.go index 213bf204af..0898956807 100644 --- a/blowfish/cipher.go +++ b/blowfish/cipher.go @@ -11,7 +11,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package blowfish // import "golang.org/x/crypto/blowfish" +package blowfish // The code is a port of Bruce Schneier's C implementation. // See https://www.schneier.com/blowfish.html. diff --git a/bn256/bn256.go b/bn256/bn256.go index 5d6d198bcb..6661687551 100644 --- a/bn256/bn256.go +++ b/bn256/bn256.go @@ -23,7 +23,7 @@ // elliptic curve. This package is frozen, and not implemented in constant time. // There is a more complete implementation at github.com/cloudflare/bn256, but // note that it suffers from the same security issues of the underlying curve. -package bn256 // import "golang.org/x/crypto/bn256" +package bn256 import ( "crypto/rand" diff --git a/cast5/cast5.go b/cast5/cast5.go index 425e8eecb0..016e90215c 100644 --- a/cast5/cast5.go +++ b/cast5/cast5.go @@ -11,7 +11,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package cast5 // import "golang.org/x/crypto/cast5" +package cast5 import ( "errors" diff --git a/chacha20poly1305/chacha20poly1305.go b/chacha20poly1305/chacha20poly1305.go index 93da7322bc..8cf5d8112e 100644 --- a/chacha20poly1305/chacha20poly1305.go +++ b/chacha20poly1305/chacha20poly1305.go @@ -5,7 +5,7 @@ // Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD and its // extended nonce variant XChaCha20-Poly1305, as specified in RFC 8439 and // draft-irtf-cfrg-xchacha-01. -package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305" +package chacha20poly1305 import ( "crypto/cipher" diff --git a/cryptobyte/asn1/asn1.go b/cryptobyte/asn1/asn1.go index cda8e3edfd..90ef6a241d 100644 --- a/cryptobyte/asn1/asn1.go +++ b/cryptobyte/asn1/asn1.go @@ -4,7 +4,7 @@ // Package asn1 contains supporting types for parsing and building ASN.1 // messages with the cryptobyte package. -package asn1 // import "golang.org/x/crypto/cryptobyte/asn1" +package asn1 // Tag represents an ASN.1 identifier octet, consisting of a tag number // (indicating a type) and class (such as context-specific or constructed). diff --git a/cryptobyte/string.go b/cryptobyte/string.go index 10692a8a31..4b0f8097f9 100644 --- a/cryptobyte/string.go +++ b/cryptobyte/string.go @@ -15,7 +15,7 @@ // // See the documentation and examples for the Builder and String types to get // started. -package cryptobyte // import "golang.org/x/crypto/cryptobyte" +package cryptobyte // String represents a string of bytes. It provides methods for parsing // fixed-length and length-prefixed values from it. diff --git a/curve25519/curve25519.go b/curve25519/curve25519.go index 00f963ea20..21ca3b2ee4 100644 --- a/curve25519/curve25519.go +++ b/curve25519/curve25519.go @@ -6,9 +6,11 @@ // performs scalar multiplication on the elliptic curve known as Curve25519. // See RFC 7748. // -// Starting in Go 1.20, this package is a wrapper for the X25519 implementation +// This package is a wrapper for the X25519 implementation // in the crypto/ecdh package. -package curve25519 // import "golang.org/x/crypto/curve25519" +package curve25519 + +import "crypto/ecdh" // ScalarMult sets dst to the product scalar * point. // @@ -16,7 +18,13 @@ package curve25519 // import "golang.org/x/crypto/curve25519" // zeroes, irrespective of the scalar. Instead, use the X25519 function, which // will return an error. func ScalarMult(dst, scalar, point *[32]byte) { - scalarMult(dst, scalar, point) + if _, err := x25519(dst, scalar[:], point[:]); err != nil { + // The only error condition for x25519 when the inputs are 32 bytes long + // is if the output would have been the all-zero value. + for i := range dst { + dst[i] = 0 + } + } } // ScalarBaseMult sets dst to the product scalar * base where base is the @@ -25,7 +33,12 @@ func ScalarMult(dst, scalar, point *[32]byte) { // It is recommended to use the X25519 function with Basepoint instead, as // copying into fixed size arrays can lead to unexpected bugs. func ScalarBaseMult(dst, scalar *[32]byte) { - scalarBaseMult(dst, scalar) + curve := ecdh.X25519() + priv, err := curve.NewPrivateKey(scalar[:]) + if err != nil { + panic("curve25519: internal error: scalarBaseMult was not 32 bytes") + } + copy(dst[:], priv.PublicKey().Bytes()) } const ( @@ -57,3 +70,21 @@ func X25519(scalar, point []byte) ([]byte, error) { var dst [32]byte return x25519(&dst, scalar, point) } + +func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) { + curve := ecdh.X25519() + pub, err := curve.NewPublicKey(point) + if err != nil { + return nil, err + } + priv, err := curve.NewPrivateKey(scalar) + if err != nil { + return nil, err + } + out, err := priv.ECDH(pub) + if err != nil { + return nil, err + } + copy(dst[:], out) + return dst[:], nil +} diff --git a/curve25519/curve25519_compat.go b/curve25519/curve25519_compat.go deleted file mode 100644 index ba647e8d77..0000000000 --- a/curve25519/curve25519_compat.go +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.20 - -package curve25519 - -import ( - "crypto/subtle" - "errors" - "strconv" - - "golang.org/x/crypto/curve25519/internal/field" -) - -func scalarMult(dst, scalar, point *[32]byte) { - var e [32]byte - - copy(e[:], scalar[:]) - e[0] &= 248 - e[31] &= 127 - e[31] |= 64 - - var x1, x2, z2, x3, z3, tmp0, tmp1 field.Element - x1.SetBytes(point[:]) - x2.One() - x3.Set(&x1) - z3.One() - - swap := 0 - for pos := 254; pos >= 0; pos-- { - b := e[pos/8] >> uint(pos&7) - b &= 1 - swap ^= int(b) - x2.Swap(&x3, swap) - z2.Swap(&z3, swap) - swap = int(b) - - tmp0.Subtract(&x3, &z3) - tmp1.Subtract(&x2, &z2) - x2.Add(&x2, &z2) - z2.Add(&x3, &z3) - z3.Multiply(&tmp0, &x2) - z2.Multiply(&z2, &tmp1) - tmp0.Square(&tmp1) - tmp1.Square(&x2) - x3.Add(&z3, &z2) - z2.Subtract(&z3, &z2) - x2.Multiply(&tmp1, &tmp0) - tmp1.Subtract(&tmp1, &tmp0) - z2.Square(&z2) - - z3.Mult32(&tmp1, 121666) - x3.Square(&x3) - tmp0.Add(&tmp0, &z3) - z3.Multiply(&x1, &z2) - z2.Multiply(&tmp1, &tmp0) - } - - x2.Swap(&x3, swap) - z2.Swap(&z3, swap) - - z2.Invert(&z2) - x2.Multiply(&x2, &z2) - copy(dst[:], x2.Bytes()) -} - -func scalarBaseMult(dst, scalar *[32]byte) { - checkBasepoint() - scalarMult(dst, scalar, &basePoint) -} - -func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) { - var in [32]byte - if l := len(scalar); l != 32 { - return nil, errors.New("bad scalar length: " + strconv.Itoa(l) + ", expected 32") - } - if l := len(point); l != 32 { - return nil, errors.New("bad point length: " + strconv.Itoa(l) + ", expected 32") - } - copy(in[:], scalar) - if &point[0] == &Basepoint[0] { - scalarBaseMult(dst, &in) - } else { - var base, zero [32]byte - copy(base[:], point) - scalarMult(dst, &in, &base) - if subtle.ConstantTimeCompare(dst[:], zero[:]) == 1 { - return nil, errors.New("bad input point: low order point") - } - } - return dst[:], nil -} - -func checkBasepoint() { - if subtle.ConstantTimeCompare(Basepoint, []byte{ - 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - }) != 1 { - panic("curve25519: global Basepoint value was modified") - } -} diff --git a/curve25519/curve25519_go120.go b/curve25519/curve25519_go120.go deleted file mode 100644 index 627df49727..0000000000 --- a/curve25519/curve25519_go120.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.20 - -package curve25519 - -import "crypto/ecdh" - -func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) { - curve := ecdh.X25519() - pub, err := curve.NewPublicKey(point) - if err != nil { - return nil, err - } - priv, err := curve.NewPrivateKey(scalar) - if err != nil { - return nil, err - } - out, err := priv.ECDH(pub) - if err != nil { - return nil, err - } - copy(dst[:], out) - return dst[:], nil -} - -func scalarMult(dst, scalar, point *[32]byte) { - if _, err := x25519(dst, scalar[:], point[:]); err != nil { - // The only error condition for x25519 when the inputs are 32 bytes long - // is if the output would have been the all-zero value. - for i := range dst { - dst[i] = 0 - } - } -} - -func scalarBaseMult(dst, scalar *[32]byte) { - curve := ecdh.X25519() - priv, err := curve.NewPrivateKey(scalar[:]) - if err != nil { - panic("curve25519: internal error: scalarBaseMult was not 32 bytes") - } - copy(dst[:], priv.PublicKey().Bytes()) -} diff --git a/curve25519/internal/field/README b/curve25519/internal/field/README deleted file mode 100644 index e25bca7dc8..0000000000 --- a/curve25519/internal/field/README +++ /dev/null @@ -1,7 +0,0 @@ -This package is kept in sync with crypto/ed25519/internal/edwards25519/field in -the standard library. - -If there are any changes in the standard library that need to be synced to this -package, run sync.sh. It will not overwrite any local changes made since the -previous sync, so it's ok to land changes in this package first, and then sync -to the standard library later. diff --git a/curve25519/internal/field/_asm/fe_amd64_asm.go b/curve25519/internal/field/_asm/fe_amd64_asm.go deleted file mode 100644 index 1f3652987e..0000000000 --- a/curve25519/internal/field/_asm/fe_amd64_asm.go +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright (c) 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package main - -import ( - "fmt" - - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/gotypes" - . "github.com/mmcloughlin/avo/operand" - . "github.com/mmcloughlin/avo/reg" - - // Ensure "go mod tidy" doesn't remove the golang.org/x/crypto module - // dependency, which is necessary to access the field.Element type. - _ "golang.org/x/crypto/curve25519" -) - -//go:generate go run . -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field - -func main() { - Package("golang.org/x/crypto/curve25519/internal/field") - ConstraintExpr("amd64,gc,!purego") - feMul() - feSquare() - Generate() -} - -type namedComponent struct { - Component - name string -} - -func (c namedComponent) String() string { return c.name } - -type uint128 struct { - name string - hi, lo GPVirtual -} - -func (c uint128) String() string { return c.name } - -func feSquare() { - TEXT("feSquare", NOSPLIT, "func(out, a *Element)") - Doc("feSquare sets out = a * a. It works like feSquareGeneric.") - Pragma("noescape") - - a := Dereference(Param("a")) - l0 := namedComponent{a.Field("l0"), "l0"} - l1 := namedComponent{a.Field("l1"), "l1"} - l2 := namedComponent{a.Field("l2"), "l2"} - l3 := namedComponent{a.Field("l3"), "l3"} - l4 := namedComponent{a.Field("l4"), "l4"} - - // r0 = l0×l0 + 19×2×(l1×l4 + l2×l3) - r0 := uint128{"r0", GP64(), GP64()} - mul64(r0, 1, l0, l0) - addMul64(r0, 38, l1, l4) - addMul64(r0, 38, l2, l3) - - // r1 = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3 - r1 := uint128{"r1", GP64(), GP64()} - mul64(r1, 2, l0, l1) - addMul64(r1, 38, l2, l4) - addMul64(r1, 19, l3, l3) - - // r2 = = 2×l0×l2 + l1×l1 + 19×2×l3×l4 - r2 := uint128{"r2", GP64(), GP64()} - mul64(r2, 2, l0, l2) - addMul64(r2, 1, l1, l1) - addMul64(r2, 38, l3, l4) - - // r3 = = 2×l0×l3 + 2×l1×l2 + 19×l4×l4 - r3 := uint128{"r3", GP64(), GP64()} - mul64(r3, 2, l0, l3) - addMul64(r3, 2, l1, l2) - addMul64(r3, 19, l4, l4) - - // r4 = = 2×l0×l4 + 2×l1×l3 + l2×l2 - r4 := uint128{"r4", GP64(), GP64()} - mul64(r4, 2, l0, l4) - addMul64(r4, 2, l1, l3) - addMul64(r4, 1, l2, l2) - - Comment("First reduction chain") - maskLow51Bits := GP64() - MOVQ(Imm((1<<51)-1), maskLow51Bits) - c0, r0lo := shiftRightBy51(&r0) - c1, r1lo := shiftRightBy51(&r1) - c2, r2lo := shiftRightBy51(&r2) - c3, r3lo := shiftRightBy51(&r3) - c4, r4lo := shiftRightBy51(&r4) - maskAndAdd(r0lo, maskLow51Bits, c4, 19) - maskAndAdd(r1lo, maskLow51Bits, c0, 1) - maskAndAdd(r2lo, maskLow51Bits, c1, 1) - maskAndAdd(r3lo, maskLow51Bits, c2, 1) - maskAndAdd(r4lo, maskLow51Bits, c3, 1) - - Comment("Second reduction chain (carryPropagate)") - // c0 = r0 >> 51 - MOVQ(r0lo, c0) - SHRQ(Imm(51), c0) - // c1 = r1 >> 51 - MOVQ(r1lo, c1) - SHRQ(Imm(51), c1) - // c2 = r2 >> 51 - MOVQ(r2lo, c2) - SHRQ(Imm(51), c2) - // c3 = r3 >> 51 - MOVQ(r3lo, c3) - SHRQ(Imm(51), c3) - // c4 = r4 >> 51 - MOVQ(r4lo, c4) - SHRQ(Imm(51), c4) - maskAndAdd(r0lo, maskLow51Bits, c4, 19) - maskAndAdd(r1lo, maskLow51Bits, c0, 1) - maskAndAdd(r2lo, maskLow51Bits, c1, 1) - maskAndAdd(r3lo, maskLow51Bits, c2, 1) - maskAndAdd(r4lo, maskLow51Bits, c3, 1) - - Comment("Store output") - out := Dereference(Param("out")) - Store(r0lo, out.Field("l0")) - Store(r1lo, out.Field("l1")) - Store(r2lo, out.Field("l2")) - Store(r3lo, out.Field("l3")) - Store(r4lo, out.Field("l4")) - - RET() -} - -func feMul() { - TEXT("feMul", NOSPLIT, "func(out, a, b *Element)") - Doc("feMul sets out = a * b. It works like feMulGeneric.") - Pragma("noescape") - - a := Dereference(Param("a")) - a0 := namedComponent{a.Field("l0"), "a0"} - a1 := namedComponent{a.Field("l1"), "a1"} - a2 := namedComponent{a.Field("l2"), "a2"} - a3 := namedComponent{a.Field("l3"), "a3"} - a4 := namedComponent{a.Field("l4"), "a4"} - - b := Dereference(Param("b")) - b0 := namedComponent{b.Field("l0"), "b0"} - b1 := namedComponent{b.Field("l1"), "b1"} - b2 := namedComponent{b.Field("l2"), "b2"} - b3 := namedComponent{b.Field("l3"), "b3"} - b4 := namedComponent{b.Field("l4"), "b4"} - - // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) - r0 := uint128{"r0", GP64(), GP64()} - mul64(r0, 1, a0, b0) - addMul64(r0, 19, a1, b4) - addMul64(r0, 19, a2, b3) - addMul64(r0, 19, a3, b2) - addMul64(r0, 19, a4, b1) - - // r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2) - r1 := uint128{"r1", GP64(), GP64()} - mul64(r1, 1, a0, b1) - addMul64(r1, 1, a1, b0) - addMul64(r1, 19, a2, b4) - addMul64(r1, 19, a3, b3) - addMul64(r1, 19, a4, b2) - - // r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3) - r2 := uint128{"r2", GP64(), GP64()} - mul64(r2, 1, a0, b2) - addMul64(r2, 1, a1, b1) - addMul64(r2, 1, a2, b0) - addMul64(r2, 19, a3, b4) - addMul64(r2, 19, a4, b3) - - // r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4 - r3 := uint128{"r3", GP64(), GP64()} - mul64(r3, 1, a0, b3) - addMul64(r3, 1, a1, b2) - addMul64(r3, 1, a2, b1) - addMul64(r3, 1, a3, b0) - addMul64(r3, 19, a4, b4) - - // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 - r4 := uint128{"r4", GP64(), GP64()} - mul64(r4, 1, a0, b4) - addMul64(r4, 1, a1, b3) - addMul64(r4, 1, a2, b2) - addMul64(r4, 1, a3, b1) - addMul64(r4, 1, a4, b0) - - Comment("First reduction chain") - maskLow51Bits := GP64() - MOVQ(Imm((1<<51)-1), maskLow51Bits) - c0, r0lo := shiftRightBy51(&r0) - c1, r1lo := shiftRightBy51(&r1) - c2, r2lo := shiftRightBy51(&r2) - c3, r3lo := shiftRightBy51(&r3) - c4, r4lo := shiftRightBy51(&r4) - maskAndAdd(r0lo, maskLow51Bits, c4, 19) - maskAndAdd(r1lo, maskLow51Bits, c0, 1) - maskAndAdd(r2lo, maskLow51Bits, c1, 1) - maskAndAdd(r3lo, maskLow51Bits, c2, 1) - maskAndAdd(r4lo, maskLow51Bits, c3, 1) - - Comment("Second reduction chain (carryPropagate)") - // c0 = r0 >> 51 - MOVQ(r0lo, c0) - SHRQ(Imm(51), c0) - // c1 = r1 >> 51 - MOVQ(r1lo, c1) - SHRQ(Imm(51), c1) - // c2 = r2 >> 51 - MOVQ(r2lo, c2) - SHRQ(Imm(51), c2) - // c3 = r3 >> 51 - MOVQ(r3lo, c3) - SHRQ(Imm(51), c3) - // c4 = r4 >> 51 - MOVQ(r4lo, c4) - SHRQ(Imm(51), c4) - maskAndAdd(r0lo, maskLow51Bits, c4, 19) - maskAndAdd(r1lo, maskLow51Bits, c0, 1) - maskAndAdd(r2lo, maskLow51Bits, c1, 1) - maskAndAdd(r3lo, maskLow51Bits, c2, 1) - maskAndAdd(r4lo, maskLow51Bits, c3, 1) - - Comment("Store output") - out := Dereference(Param("out")) - Store(r0lo, out.Field("l0")) - Store(r1lo, out.Field("l1")) - Store(r2lo, out.Field("l2")) - Store(r3lo, out.Field("l3")) - Store(r4lo, out.Field("l4")) - - RET() -} - -// mul64 sets r to i * aX * bX. -func mul64(r uint128, i int, aX, bX namedComponent) { - switch i { - case 1: - Comment(fmt.Sprintf("%s = %s×%s", r, aX, bX)) - Load(aX, RAX) - case 2: - Comment(fmt.Sprintf("%s = 2×%s×%s", r, aX, bX)) - Load(aX, RAX) - SHLQ(Imm(1), RAX) - default: - panic("unsupported i value") - } - MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX - MOVQ(RAX, r.lo) - MOVQ(RDX, r.hi) -} - -// addMul64 sets r to r + i * aX * bX. -func addMul64(r uint128, i uint64, aX, bX namedComponent) { - switch i { - case 1: - Comment(fmt.Sprintf("%s += %s×%s", r, aX, bX)) - Load(aX, RAX) - default: - Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX)) - IMUL3Q(Imm(i), Load(aX, GP64()), RAX) - } - MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX - ADDQ(RAX, r.lo) - ADCQ(RDX, r.hi) -} - -// shiftRightBy51 returns r >> 51 and r.lo. -// -// After this function is called, the uint128 may not be used anymore. -func shiftRightBy51(r *uint128) (out, lo GPVirtual) { - out = r.hi - lo = r.lo - SHLQ(Imm(64-51), r.lo, r.hi) - r.lo, r.hi = nil, nil // make sure the uint128 is unusable - return -} - -// maskAndAdd sets r = r&mask + c*i. -func maskAndAdd(r, mask, c GPVirtual, i uint64) { - ANDQ(mask, r) - if i != 1 { - IMUL3Q(Imm(i), c, c) - } - ADDQ(c, r) -} - -func mustAddr(c Component) Op { - b, err := c.Resolve() - if err != nil { - panic(err) - } - return b.Addr -} diff --git a/curve25519/internal/field/_asm/go.mod b/curve25519/internal/field/_asm/go.mod deleted file mode 100644 index f6902f4a64..0000000000 --- a/curve25519/internal/field/_asm/go.mod +++ /dev/null @@ -1,16 +0,0 @@ -module asm - -go 1.18 - -require ( - github.com/mmcloughlin/avo v0.5.0 - golang.org/x/crypto v0.1.0 -) - -require ( - golang.org/x/mod v0.8.0 // indirect - golang.org/x/sys v0.14.0 // indirect - golang.org/x/tools v0.6.0 // indirect -) - -replace golang.org/x/crypto v0.1.0 => ../../../.. diff --git a/curve25519/internal/field/_asm/go.sum b/curve25519/internal/field/_asm/go.sum deleted file mode 100644 index 96b7915d4f..0000000000 --- a/curve25519/internal/field/_asm/go.sum +++ /dev/null @@ -1,51 +0,0 @@ -github.com/mmcloughlin/avo v0.5.0 h1:nAco9/aI9Lg2kiuROBY6BhCI/z0t5jEvJfjWbL8qXLU= -github.com/mmcloughlin/avo v0.5.0/go.mod h1:ChHFdoV7ql95Wi7vuq2YT1bwCJqiWdZrQ1im3VujLYM= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -golang.org/x/arch v0.1.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.6.0/go.mod h1:4mET923SAdbXp2ki8ey+zGs1SLqsuM2Y0uvdZR/fUNI= -golang.org/x/mod v0.8.0 h1:LUYupSeNrTNCGzR/hVBk2NHZO4hXcVaW1k4Qx7rjPx8= -golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= -golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q= -golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= -golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.2.0/go.mod h1:y4OqIKeOV/fWJetJ8bXPU1sEVniLMIyDAZWeHdV+NTA= -golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= -golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/curve25519/internal/field/fe.go b/curve25519/internal/field/fe.go deleted file mode 100644 index ca841ad99e..0000000000 --- a/curve25519/internal/field/fe.go +++ /dev/null @@ -1,416 +0,0 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package field implements fast arithmetic modulo 2^255-19. -package field - -import ( - "crypto/subtle" - "encoding/binary" - "math/bits" -) - -// Element represents an element of the field GF(2^255-19). Note that this -// is not a cryptographically secure group, and should only be used to interact -// with edwards25519.Point coordinates. -// -// This type works similarly to math/big.Int, and all arguments and receivers -// are allowed to alias. -// -// The zero value is a valid zero element. -type Element struct { - // An element t represents the integer - // t.l0 + t.l1*2^51 + t.l2*2^102 + t.l3*2^153 + t.l4*2^204 - // - // Between operations, all limbs are expected to be lower than 2^52. - l0 uint64 - l1 uint64 - l2 uint64 - l3 uint64 - l4 uint64 -} - -const maskLow51Bits uint64 = (1 << 51) - 1 - -var feZero = &Element{0, 0, 0, 0, 0} - -// Zero sets v = 0, and returns v. -func (v *Element) Zero() *Element { - *v = *feZero - return v -} - -var feOne = &Element{1, 0, 0, 0, 0} - -// One sets v = 1, and returns v. -func (v *Element) One() *Element { - *v = *feOne - return v -} - -// reduce reduces v modulo 2^255 - 19 and returns it. -func (v *Element) reduce() *Element { - v.carryPropagate() - - // After the light reduction we now have a field element representation - // v < 2^255 + 2^13 * 19, but need v < 2^255 - 19. - - // If v >= 2^255 - 19, then v + 19 >= 2^255, which would overflow 2^255 - 1, - // generating a carry. That is, c will be 0 if v < 2^255 - 19, and 1 otherwise. - c := (v.l0 + 19) >> 51 - c = (v.l1 + c) >> 51 - c = (v.l2 + c) >> 51 - c = (v.l3 + c) >> 51 - c = (v.l4 + c) >> 51 - - // If v < 2^255 - 19 and c = 0, this will be a no-op. Otherwise, it's - // effectively applying the reduction identity to the carry. - v.l0 += 19 * c - - v.l1 += v.l0 >> 51 - v.l0 = v.l0 & maskLow51Bits - v.l2 += v.l1 >> 51 - v.l1 = v.l1 & maskLow51Bits - v.l3 += v.l2 >> 51 - v.l2 = v.l2 & maskLow51Bits - v.l4 += v.l3 >> 51 - v.l3 = v.l3 & maskLow51Bits - // no additional carry - v.l4 = v.l4 & maskLow51Bits - - return v -} - -// Add sets v = a + b, and returns v. -func (v *Element) Add(a, b *Element) *Element { - v.l0 = a.l0 + b.l0 - v.l1 = a.l1 + b.l1 - v.l2 = a.l2 + b.l2 - v.l3 = a.l3 + b.l3 - v.l4 = a.l4 + b.l4 - // Using the generic implementation here is actually faster than the - // assembly. Probably because the body of this function is so simple that - // the compiler can figure out better optimizations by inlining the carry - // propagation. TODO - return v.carryPropagateGeneric() -} - -// Subtract sets v = a - b, and returns v. -func (v *Element) Subtract(a, b *Element) *Element { - // We first add 2 * p, to guarantee the subtraction won't underflow, and - // then subtract b (which can be up to 2^255 + 2^13 * 19). - v.l0 = (a.l0 + 0xFFFFFFFFFFFDA) - b.l0 - v.l1 = (a.l1 + 0xFFFFFFFFFFFFE) - b.l1 - v.l2 = (a.l2 + 0xFFFFFFFFFFFFE) - b.l2 - v.l3 = (a.l3 + 0xFFFFFFFFFFFFE) - b.l3 - v.l4 = (a.l4 + 0xFFFFFFFFFFFFE) - b.l4 - return v.carryPropagate() -} - -// Negate sets v = -a, and returns v. -func (v *Element) Negate(a *Element) *Element { - return v.Subtract(feZero, a) -} - -// Invert sets v = 1/z mod p, and returns v. -// -// If z == 0, Invert returns v = 0. -func (v *Element) Invert(z *Element) *Element { - // Inversion is implemented as exponentiation with exponent p − 2. It uses the - // same sequence of 255 squarings and 11 multiplications as [Curve25519]. - var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t Element - - z2.Square(z) // 2 - t.Square(&z2) // 4 - t.Square(&t) // 8 - z9.Multiply(&t, z) // 9 - z11.Multiply(&z9, &z2) // 11 - t.Square(&z11) // 22 - z2_5_0.Multiply(&t, &z9) // 31 = 2^5 - 2^0 - - t.Square(&z2_5_0) // 2^6 - 2^1 - for i := 0; i < 4; i++ { - t.Square(&t) // 2^10 - 2^5 - } - z2_10_0.Multiply(&t, &z2_5_0) // 2^10 - 2^0 - - t.Square(&z2_10_0) // 2^11 - 2^1 - for i := 0; i < 9; i++ { - t.Square(&t) // 2^20 - 2^10 - } - z2_20_0.Multiply(&t, &z2_10_0) // 2^20 - 2^0 - - t.Square(&z2_20_0) // 2^21 - 2^1 - for i := 0; i < 19; i++ { - t.Square(&t) // 2^40 - 2^20 - } - t.Multiply(&t, &z2_20_0) // 2^40 - 2^0 - - t.Square(&t) // 2^41 - 2^1 - for i := 0; i < 9; i++ { - t.Square(&t) // 2^50 - 2^10 - } - z2_50_0.Multiply(&t, &z2_10_0) // 2^50 - 2^0 - - t.Square(&z2_50_0) // 2^51 - 2^1 - for i := 0; i < 49; i++ { - t.Square(&t) // 2^100 - 2^50 - } - z2_100_0.Multiply(&t, &z2_50_0) // 2^100 - 2^0 - - t.Square(&z2_100_0) // 2^101 - 2^1 - for i := 0; i < 99; i++ { - t.Square(&t) // 2^200 - 2^100 - } - t.Multiply(&t, &z2_100_0) // 2^200 - 2^0 - - t.Square(&t) // 2^201 - 2^1 - for i := 0; i < 49; i++ { - t.Square(&t) // 2^250 - 2^50 - } - t.Multiply(&t, &z2_50_0) // 2^250 - 2^0 - - t.Square(&t) // 2^251 - 2^1 - t.Square(&t) // 2^252 - 2^2 - t.Square(&t) // 2^253 - 2^3 - t.Square(&t) // 2^254 - 2^4 - t.Square(&t) // 2^255 - 2^5 - - return v.Multiply(&t, &z11) // 2^255 - 21 -} - -// Set sets v = a, and returns v. -func (v *Element) Set(a *Element) *Element { - *v = *a - return v -} - -// SetBytes sets v to x, which must be a 32-byte little-endian encoding. -// -// Consistent with RFC 7748, the most significant bit (the high bit of the -// last byte) is ignored, and non-canonical values (2^255-19 through 2^255-1) -// are accepted. Note that this is laxer than specified by RFC 8032. -func (v *Element) SetBytes(x []byte) *Element { - if len(x) != 32 { - panic("edwards25519: invalid field element input size") - } - - // Bits 0:51 (bytes 0:8, bits 0:64, shift 0, mask 51). - v.l0 = binary.LittleEndian.Uint64(x[0:8]) - v.l0 &= maskLow51Bits - // Bits 51:102 (bytes 6:14, bits 48:112, shift 3, mask 51). - v.l1 = binary.LittleEndian.Uint64(x[6:14]) >> 3 - v.l1 &= maskLow51Bits - // Bits 102:153 (bytes 12:20, bits 96:160, shift 6, mask 51). - v.l2 = binary.LittleEndian.Uint64(x[12:20]) >> 6 - v.l2 &= maskLow51Bits - // Bits 153:204 (bytes 19:27, bits 152:216, shift 1, mask 51). - v.l3 = binary.LittleEndian.Uint64(x[19:27]) >> 1 - v.l3 &= maskLow51Bits - // Bits 204:251 (bytes 24:32, bits 192:256, shift 12, mask 51). - // Note: not bytes 25:33, shift 4, to avoid overread. - v.l4 = binary.LittleEndian.Uint64(x[24:32]) >> 12 - v.l4 &= maskLow51Bits - - return v -} - -// Bytes returns the canonical 32-byte little-endian encoding of v. -func (v *Element) Bytes() []byte { - // This function is outlined to make the allocations inline in the caller - // rather than happen on the heap. - var out [32]byte - return v.bytes(&out) -} - -func (v *Element) bytes(out *[32]byte) []byte { - t := *v - t.reduce() - - var buf [8]byte - for i, l := range [5]uint64{t.l0, t.l1, t.l2, t.l3, t.l4} { - bitsOffset := i * 51 - binary.LittleEndian.PutUint64(buf[:], l<= len(out) { - break - } - out[off] |= bb - } - } - - return out[:] -} - -// Equal returns 1 if v and u are equal, and 0 otherwise. -func (v *Element) Equal(u *Element) int { - sa, sv := u.Bytes(), v.Bytes() - return subtle.ConstantTimeCompare(sa, sv) -} - -// mask64Bits returns 0xffffffff if cond is 1, and 0 otherwise. -func mask64Bits(cond int) uint64 { return ^(uint64(cond) - 1) } - -// Select sets v to a if cond == 1, and to b if cond == 0. -func (v *Element) Select(a, b *Element, cond int) *Element { - m := mask64Bits(cond) - v.l0 = (m & a.l0) | (^m & b.l0) - v.l1 = (m & a.l1) | (^m & b.l1) - v.l2 = (m & a.l2) | (^m & b.l2) - v.l3 = (m & a.l3) | (^m & b.l3) - v.l4 = (m & a.l4) | (^m & b.l4) - return v -} - -// Swap swaps v and u if cond == 1 or leaves them unchanged if cond == 0, and returns v. -func (v *Element) Swap(u *Element, cond int) { - m := mask64Bits(cond) - t := m & (v.l0 ^ u.l0) - v.l0 ^= t - u.l0 ^= t - t = m & (v.l1 ^ u.l1) - v.l1 ^= t - u.l1 ^= t - t = m & (v.l2 ^ u.l2) - v.l2 ^= t - u.l2 ^= t - t = m & (v.l3 ^ u.l3) - v.l3 ^= t - u.l3 ^= t - t = m & (v.l4 ^ u.l4) - v.l4 ^= t - u.l4 ^= t -} - -// IsNegative returns 1 if v is negative, and 0 otherwise. -func (v *Element) IsNegative() int { - return int(v.Bytes()[0] & 1) -} - -// Absolute sets v to |u|, and returns v. -func (v *Element) Absolute(u *Element) *Element { - return v.Select(new(Element).Negate(u), u, u.IsNegative()) -} - -// Multiply sets v = x * y, and returns v. -func (v *Element) Multiply(x, y *Element) *Element { - feMul(v, x, y) - return v -} - -// Square sets v = x * x, and returns v. -func (v *Element) Square(x *Element) *Element { - feSquare(v, x) - return v -} - -// Mult32 sets v = x * y, and returns v. -func (v *Element) Mult32(x *Element, y uint32) *Element { - x0lo, x0hi := mul51(x.l0, y) - x1lo, x1hi := mul51(x.l1, y) - x2lo, x2hi := mul51(x.l2, y) - x3lo, x3hi := mul51(x.l3, y) - x4lo, x4hi := mul51(x.l4, y) - v.l0 = x0lo + 19*x4hi // carried over per the reduction identity - v.l1 = x1lo + x0hi - v.l2 = x2lo + x1hi - v.l3 = x3lo + x2hi - v.l4 = x4lo + x3hi - // The hi portions are going to be only 32 bits, plus any previous excess, - // so we can skip the carry propagation. - return v -} - -// mul51 returns lo + hi * 2⁵¹ = a * b. -func mul51(a uint64, b uint32) (lo uint64, hi uint64) { - mh, ml := bits.Mul64(a, uint64(b)) - lo = ml & maskLow51Bits - hi = (mh << 13) | (ml >> 51) - return -} - -// Pow22523 set v = x^((p-5)/8), and returns v. (p-5)/8 is 2^252-3. -func (v *Element) Pow22523(x *Element) *Element { - var t0, t1, t2 Element - - t0.Square(x) // x^2 - t1.Square(&t0) // x^4 - t1.Square(&t1) // x^8 - t1.Multiply(x, &t1) // x^9 - t0.Multiply(&t0, &t1) // x^11 - t0.Square(&t0) // x^22 - t0.Multiply(&t1, &t0) // x^31 - t1.Square(&t0) // x^62 - for i := 1; i < 5; i++ { // x^992 - t1.Square(&t1) - } - t0.Multiply(&t1, &t0) // x^1023 -> 1023 = 2^10 - 1 - t1.Square(&t0) // 2^11 - 2 - for i := 1; i < 10; i++ { // 2^20 - 2^10 - t1.Square(&t1) - } - t1.Multiply(&t1, &t0) // 2^20 - 1 - t2.Square(&t1) // 2^21 - 2 - for i := 1; i < 20; i++ { // 2^40 - 2^20 - t2.Square(&t2) - } - t1.Multiply(&t2, &t1) // 2^40 - 1 - t1.Square(&t1) // 2^41 - 2 - for i := 1; i < 10; i++ { // 2^50 - 2^10 - t1.Square(&t1) - } - t0.Multiply(&t1, &t0) // 2^50 - 1 - t1.Square(&t0) // 2^51 - 2 - for i := 1; i < 50; i++ { // 2^100 - 2^50 - t1.Square(&t1) - } - t1.Multiply(&t1, &t0) // 2^100 - 1 - t2.Square(&t1) // 2^101 - 2 - for i := 1; i < 100; i++ { // 2^200 - 2^100 - t2.Square(&t2) - } - t1.Multiply(&t2, &t1) // 2^200 - 1 - t1.Square(&t1) // 2^201 - 2 - for i := 1; i < 50; i++ { // 2^250 - 2^50 - t1.Square(&t1) - } - t0.Multiply(&t1, &t0) // 2^250 - 1 - t0.Square(&t0) // 2^251 - 2 - t0.Square(&t0) // 2^252 - 4 - return v.Multiply(&t0, x) // 2^252 - 3 -> x^(2^252-3) -} - -// sqrtM1 is 2^((p-1)/4), which squared is equal to -1 by Euler's Criterion. -var sqrtM1 = &Element{1718705420411056, 234908883556509, - 2233514472574048, 2117202627021982, 765476049583133} - -// SqrtRatio sets r to the non-negative square root of the ratio of u and v. -// -// If u/v is square, SqrtRatio returns r and 1. If u/v is not square, SqrtRatio -// sets r according to Section 4.3 of draft-irtf-cfrg-ristretto255-decaf448-00, -// and returns r and 0. -func (r *Element) SqrtRatio(u, v *Element) (rr *Element, wasSquare int) { - var a, b Element - - // r = (u * v3) * (u * v7)^((p-5)/8) - v2 := a.Square(v) - uv3 := b.Multiply(u, b.Multiply(v2, v)) - uv7 := a.Multiply(uv3, a.Square(v2)) - r.Multiply(uv3, r.Pow22523(uv7)) - - check := a.Multiply(v, a.Square(r)) // check = v * r^2 - - uNeg := b.Negate(u) - correctSignSqrt := check.Equal(u) - flippedSignSqrt := check.Equal(uNeg) - flippedSignSqrtI := check.Equal(uNeg.Multiply(uNeg, sqrtM1)) - - rPrime := b.Multiply(r, sqrtM1) // r_prime = SQRT_M1 * r - // r = CT_SELECT(r_prime IF flipped_sign_sqrt | flipped_sign_sqrt_i ELSE r) - r.Select(rPrime, r, flippedSignSqrt|flippedSignSqrtI) - - r.Absolute(r) // Choose the nonnegative square root. - return r, correctSignSqrt | flippedSignSqrt -} diff --git a/curve25519/internal/field/fe_alias_test.go b/curve25519/internal/field/fe_alias_test.go deleted file mode 100644 index 64e57c4f35..0000000000 --- a/curve25519/internal/field/fe_alias_test.go +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package field - -import ( - "testing" - "testing/quick" -) - -func checkAliasingOneArg(f func(v, x *Element) *Element) func(v, x Element) bool { - return func(v, x Element) bool { - x1, v1 := x, x - - // Calculate a reference f(x) without aliasing. - if out := f(&v, &x); out != &v && isInBounds(out) { - return false - } - - // Test aliasing the argument and the receiver. - if out := f(&v1, &v1); out != &v1 || v1 != v { - return false - } - - // Ensure the arguments was not modified. - return x == x1 - } -} - -func checkAliasingTwoArgs(f func(v, x, y *Element) *Element) func(v, x, y Element) bool { - return func(v, x, y Element) bool { - x1, y1, v1 := x, y, Element{} - - // Calculate a reference f(x, y) without aliasing. - if out := f(&v, &x, &y); out != &v && isInBounds(out) { - return false - } - - // Test aliasing the first argument and the receiver. - v1 = x - if out := f(&v1, &v1, &y); out != &v1 || v1 != v { - return false - } - // Test aliasing the second argument and the receiver. - v1 = y - if out := f(&v1, &x, &v1); out != &v1 || v1 != v { - return false - } - - // Calculate a reference f(x, x) without aliasing. - if out := f(&v, &x, &x); out != &v { - return false - } - - // Test aliasing the first argument and the receiver. - v1 = x - if out := f(&v1, &v1, &x); out != &v1 || v1 != v { - return false - } - // Test aliasing the second argument and the receiver. - v1 = x - if out := f(&v1, &x, &v1); out != &v1 || v1 != v { - return false - } - // Test aliasing both arguments and the receiver. - v1 = x - if out := f(&v1, &v1, &v1); out != &v1 || v1 != v { - return false - } - - // Ensure the arguments were not modified. - return x == x1 && y == y1 - } -} - -// TestAliasing checks that receivers and arguments can alias each other without -// leading to incorrect results. That is, it ensures that it's safe to write -// -// v.Invert(v) -// -// or -// -// v.Add(v, v) -// -// without any of the inputs getting clobbered by the output being written. -func TestAliasing(t *testing.T) { - type target struct { - name string - oneArgF func(v, x *Element) *Element - twoArgsF func(v, x, y *Element) *Element - } - for _, tt := range []target{ - {name: "Absolute", oneArgF: (*Element).Absolute}, - {name: "Invert", oneArgF: (*Element).Invert}, - {name: "Negate", oneArgF: (*Element).Negate}, - {name: "Set", oneArgF: (*Element).Set}, - {name: "Square", oneArgF: (*Element).Square}, - {name: "Multiply", twoArgsF: (*Element).Multiply}, - {name: "Add", twoArgsF: (*Element).Add}, - {name: "Subtract", twoArgsF: (*Element).Subtract}, - { - name: "Select0", - twoArgsF: func(v, x, y *Element) *Element { - return (*Element).Select(v, x, y, 0) - }, - }, - { - name: "Select1", - twoArgsF: func(v, x, y *Element) *Element { - return (*Element).Select(v, x, y, 1) - }, - }, - } { - var err error - switch { - case tt.oneArgF != nil: - err = quick.Check(checkAliasingOneArg(tt.oneArgF), &quick.Config{MaxCountScale: 1 << 8}) - case tt.twoArgsF != nil: - err = quick.Check(checkAliasingTwoArgs(tt.twoArgsF), &quick.Config{MaxCountScale: 1 << 8}) - } - if err != nil { - t.Errorf("%v: %v", tt.name, err) - } - } -} diff --git a/curve25519/internal/field/fe_amd64.go b/curve25519/internal/field/fe_amd64.go deleted file mode 100644 index 70c541692c..0000000000 --- a/curve25519/internal/field/fe_amd64.go +++ /dev/null @@ -1,15 +0,0 @@ -// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. - -//go:build amd64 && gc && !purego - -package field - -// feMul sets out = a * b. It works like feMulGeneric. -// -//go:noescape -func feMul(out *Element, a *Element, b *Element) - -// feSquare sets out = a * a. It works like feSquareGeneric. -// -//go:noescape -func feSquare(out *Element, a *Element) diff --git a/curve25519/internal/field/fe_amd64.s b/curve25519/internal/field/fe_amd64.s deleted file mode 100644 index 60817acc41..0000000000 --- a/curve25519/internal/field/fe_amd64.s +++ /dev/null @@ -1,378 +0,0 @@ -// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. - -//go:build amd64 && gc && !purego - -#include "textflag.h" - -// func feMul(out *Element, a *Element, b *Element) -TEXT ·feMul(SB), NOSPLIT, $0-24 - MOVQ a+8(FP), CX - MOVQ b+16(FP), BX - - // r0 = a0×b0 - MOVQ (CX), AX - MULQ (BX) - MOVQ AX, DI - MOVQ DX, SI - - // r0 += 19×a1×b4 - MOVQ 8(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r0 += 19×a2×b3 - MOVQ 16(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r0 += 19×a3×b2 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 16(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r0 += 19×a4×b1 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 8(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r1 = a0×b1 - MOVQ (CX), AX - MULQ 8(BX) - MOVQ AX, R9 - MOVQ DX, R8 - - // r1 += a1×b0 - MOVQ 8(CX), AX - MULQ (BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r1 += 19×a2×b4 - MOVQ 16(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r1 += 19×a3×b3 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r1 += 19×a4×b2 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 16(BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r2 = a0×b2 - MOVQ (CX), AX - MULQ 16(BX) - MOVQ AX, R11 - MOVQ DX, R10 - - // r2 += a1×b1 - MOVQ 8(CX), AX - MULQ 8(BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r2 += a2×b0 - MOVQ 16(CX), AX - MULQ (BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r2 += 19×a3×b4 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r2 += 19×a4×b3 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r3 = a0×b3 - MOVQ (CX), AX - MULQ 24(BX) - MOVQ AX, R13 - MOVQ DX, R12 - - // r3 += a1×b2 - MOVQ 8(CX), AX - MULQ 16(BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r3 += a2×b1 - MOVQ 16(CX), AX - MULQ 8(BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r3 += a3×b0 - MOVQ 24(CX), AX - MULQ (BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r3 += 19×a4×b4 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r4 = a0×b4 - MOVQ (CX), AX - MULQ 32(BX) - MOVQ AX, R15 - MOVQ DX, R14 - - // r4 += a1×b3 - MOVQ 8(CX), AX - MULQ 24(BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // r4 += a2×b2 - MOVQ 16(CX), AX - MULQ 16(BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // r4 += a3×b1 - MOVQ 24(CX), AX - MULQ 8(BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // r4 += a4×b0 - MOVQ 32(CX), AX - MULQ (BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // First reduction chain - MOVQ $0x0007ffffffffffff, AX - SHLQ $0x0d, DI, SI - SHLQ $0x0d, R9, R8 - SHLQ $0x0d, R11, R10 - SHLQ $0x0d, R13, R12 - SHLQ $0x0d, R15, R14 - ANDQ AX, DI - IMUL3Q $0x13, R14, R14 - ADDQ R14, DI - ANDQ AX, R9 - ADDQ SI, R9 - ANDQ AX, R11 - ADDQ R8, R11 - ANDQ AX, R13 - ADDQ R10, R13 - ANDQ AX, R15 - ADDQ R12, R15 - - // Second reduction chain (carryPropagate) - MOVQ DI, SI - SHRQ $0x33, SI - MOVQ R9, R8 - SHRQ $0x33, R8 - MOVQ R11, R10 - SHRQ $0x33, R10 - MOVQ R13, R12 - SHRQ $0x33, R12 - MOVQ R15, R14 - SHRQ $0x33, R14 - ANDQ AX, DI - IMUL3Q $0x13, R14, R14 - ADDQ R14, DI - ANDQ AX, R9 - ADDQ SI, R9 - ANDQ AX, R11 - ADDQ R8, R11 - ANDQ AX, R13 - ADDQ R10, R13 - ANDQ AX, R15 - ADDQ R12, R15 - - // Store output - MOVQ out+0(FP), AX - MOVQ DI, (AX) - MOVQ R9, 8(AX) - MOVQ R11, 16(AX) - MOVQ R13, 24(AX) - MOVQ R15, 32(AX) - RET - -// func feSquare(out *Element, a *Element) -TEXT ·feSquare(SB), NOSPLIT, $0-16 - MOVQ a+8(FP), CX - - // r0 = l0×l0 - MOVQ (CX), AX - MULQ (CX) - MOVQ AX, SI - MOVQ DX, BX - - // r0 += 38×l1×l4 - MOVQ 8(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, SI - ADCQ DX, BX - - // r0 += 38×l2×l3 - MOVQ 16(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 24(CX) - ADDQ AX, SI - ADCQ DX, BX - - // r1 = 2×l0×l1 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 8(CX) - MOVQ AX, R8 - MOVQ DX, DI - - // r1 += 38×l2×l4 - MOVQ 16(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, R8 - ADCQ DX, DI - - // r1 += 19×l3×l3 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(CX) - ADDQ AX, R8 - ADCQ DX, DI - - // r2 = 2×l0×l2 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 16(CX) - MOVQ AX, R10 - MOVQ DX, R9 - - // r2 += l1×l1 - MOVQ 8(CX), AX - MULQ 8(CX) - ADDQ AX, R10 - ADCQ DX, R9 - - // r2 += 38×l3×l4 - MOVQ 24(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, R10 - ADCQ DX, R9 - - // r3 = 2×l0×l3 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 24(CX) - MOVQ AX, R12 - MOVQ DX, R11 - - // r3 += 2×l1×l2 - MOVQ 8(CX), AX - IMUL3Q $0x02, AX, AX - MULQ 16(CX) - ADDQ AX, R12 - ADCQ DX, R11 - - // r3 += 19×l4×l4 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(CX) - ADDQ AX, R12 - ADCQ DX, R11 - - // r4 = 2×l0×l4 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 32(CX) - MOVQ AX, R14 - MOVQ DX, R13 - - // r4 += 2×l1×l3 - MOVQ 8(CX), AX - IMUL3Q $0x02, AX, AX - MULQ 24(CX) - ADDQ AX, R14 - ADCQ DX, R13 - - // r4 += l2×l2 - MOVQ 16(CX), AX - MULQ 16(CX) - ADDQ AX, R14 - ADCQ DX, R13 - - // First reduction chain - MOVQ $0x0007ffffffffffff, AX - SHLQ $0x0d, SI, BX - SHLQ $0x0d, R8, DI - SHLQ $0x0d, R10, R9 - SHLQ $0x0d, R12, R11 - SHLQ $0x0d, R14, R13 - ANDQ AX, SI - IMUL3Q $0x13, R13, R13 - ADDQ R13, SI - ANDQ AX, R8 - ADDQ BX, R8 - ANDQ AX, R10 - ADDQ DI, R10 - ANDQ AX, R12 - ADDQ R9, R12 - ANDQ AX, R14 - ADDQ R11, R14 - - // Second reduction chain (carryPropagate) - MOVQ SI, BX - SHRQ $0x33, BX - MOVQ R8, DI - SHRQ $0x33, DI - MOVQ R10, R9 - SHRQ $0x33, R9 - MOVQ R12, R11 - SHRQ $0x33, R11 - MOVQ R14, R13 - SHRQ $0x33, R13 - ANDQ AX, SI - IMUL3Q $0x13, R13, R13 - ADDQ R13, SI - ANDQ AX, R8 - ADDQ BX, R8 - ANDQ AX, R10 - ADDQ DI, R10 - ANDQ AX, R12 - ADDQ R9, R12 - ANDQ AX, R14 - ADDQ R11, R14 - - // Store output - MOVQ out+0(FP), AX - MOVQ SI, (AX) - MOVQ R8, 8(AX) - MOVQ R10, 16(AX) - MOVQ R12, 24(AX) - MOVQ R14, 32(AX) - RET diff --git a/curve25519/internal/field/fe_amd64_noasm.go b/curve25519/internal/field/fe_amd64_noasm.go deleted file mode 100644 index 9da280d1d8..0000000000 --- a/curve25519/internal/field/fe_amd64_noasm.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !amd64 || !gc || purego - -package field - -func feMul(v, x, y *Element) { feMulGeneric(v, x, y) } - -func feSquare(v, x *Element) { feSquareGeneric(v, x) } diff --git a/curve25519/internal/field/fe_arm64.go b/curve25519/internal/field/fe_arm64.go deleted file mode 100644 index 075fe9b925..0000000000 --- a/curve25519/internal/field/fe_arm64.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build arm64 && gc && !purego - -package field - -//go:noescape -func carryPropagate(v *Element) - -func (v *Element) carryPropagate() *Element { - carryPropagate(v) - return v -} diff --git a/curve25519/internal/field/fe_arm64.s b/curve25519/internal/field/fe_arm64.s deleted file mode 100644 index 3126a43419..0000000000 --- a/curve25519/internal/field/fe_arm64.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build arm64 && gc && !purego - -#include "textflag.h" - -// carryPropagate works exactly like carryPropagateGeneric and uses the -// same AND, ADD, and LSR+MADD instructions emitted by the compiler, but -// avoids loading R0-R4 twice and uses LDP and STP. -// -// See https://golang.org/issues/43145 for the main compiler issue. -// -// func carryPropagate(v *Element) -TEXT ·carryPropagate(SB),NOFRAME|NOSPLIT,$0-8 - MOVD v+0(FP), R20 - - LDP 0(R20), (R0, R1) - LDP 16(R20), (R2, R3) - MOVD 32(R20), R4 - - AND $0x7ffffffffffff, R0, R10 - AND $0x7ffffffffffff, R1, R11 - AND $0x7ffffffffffff, R2, R12 - AND $0x7ffffffffffff, R3, R13 - AND $0x7ffffffffffff, R4, R14 - - ADD R0>>51, R11, R11 - ADD R1>>51, R12, R12 - ADD R2>>51, R13, R13 - ADD R3>>51, R14, R14 - // R4>>51 * 19 + R10 -> R10 - LSR $51, R4, R21 - MOVD $19, R22 - MADD R22, R10, R21, R10 - - STP (R10, R11), 0(R20) - STP (R12, R13), 16(R20) - MOVD R14, 32(R20) - - RET diff --git a/curve25519/internal/field/fe_arm64_noasm.go b/curve25519/internal/field/fe_arm64_noasm.go deleted file mode 100644 index fc029ac12d..0000000000 --- a/curve25519/internal/field/fe_arm64_noasm.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !arm64 || !gc || purego - -package field - -func (v *Element) carryPropagate() *Element { - return v.carryPropagateGeneric() -} diff --git a/curve25519/internal/field/fe_bench_test.go b/curve25519/internal/field/fe_bench_test.go deleted file mode 100644 index 77dc06cf98..0000000000 --- a/curve25519/internal/field/fe_bench_test.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package field - -import "testing" - -func BenchmarkAdd(b *testing.B) { - var x, y Element - x.One() - y.Add(feOne, feOne) - b.ResetTimer() - for i := 0; i < b.N; i++ { - x.Add(&x, &y) - } -} - -func BenchmarkMultiply(b *testing.B) { - var x, y Element - x.One() - y.Add(feOne, feOne) - b.ResetTimer() - for i := 0; i < b.N; i++ { - x.Multiply(&x, &y) - } -} - -func BenchmarkMult32(b *testing.B) { - var x Element - x.One() - b.ResetTimer() - for i := 0; i < b.N; i++ { - x.Mult32(&x, 0xaa42aa42) - } -} diff --git a/curve25519/internal/field/fe_generic.go b/curve25519/internal/field/fe_generic.go deleted file mode 100644 index 2671217da5..0000000000 --- a/curve25519/internal/field/fe_generic.go +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package field - -import "math/bits" - -// uint128 holds a 128-bit number as two 64-bit limbs, for use with the -// bits.Mul64 and bits.Add64 intrinsics. -type uint128 struct { - lo, hi uint64 -} - -// mul64 returns a * b. -func mul64(a, b uint64) uint128 { - hi, lo := bits.Mul64(a, b) - return uint128{lo, hi} -} - -// addMul64 returns v + a * b. -func addMul64(v uint128, a, b uint64) uint128 { - hi, lo := bits.Mul64(a, b) - lo, c := bits.Add64(lo, v.lo, 0) - hi, _ = bits.Add64(hi, v.hi, c) - return uint128{lo, hi} -} - -// shiftRightBy51 returns a >> 51. a is assumed to be at most 115 bits. -func shiftRightBy51(a uint128) uint64 { - return (a.hi << (64 - 51)) | (a.lo >> 51) -} - -func feMulGeneric(v, a, b *Element) { - a0 := a.l0 - a1 := a.l1 - a2 := a.l2 - a3 := a.l3 - a4 := a.l4 - - b0 := b.l0 - b1 := b.l1 - b2 := b.l2 - b3 := b.l3 - b4 := b.l4 - - // Limb multiplication works like pen-and-paper columnar multiplication, but - // with 51-bit limbs instead of digits. - // - // a4 a3 a2 a1 a0 x - // b4 b3 b2 b1 b0 = - // ------------------------ - // a4b0 a3b0 a2b0 a1b0 a0b0 + - // a4b1 a3b1 a2b1 a1b1 a0b1 + - // a4b2 a3b2 a2b2 a1b2 a0b2 + - // a4b3 a3b3 a2b3 a1b3 a0b3 + - // a4b4 a3b4 a2b4 a1b4 a0b4 = - // ---------------------------------------------- - // r8 r7 r6 r5 r4 r3 r2 r1 r0 - // - // We can then use the reduction identity (a * 2²⁵⁵ + b = a * 19 + b) to - // reduce the limbs that would overflow 255 bits. r5 * 2²⁵⁵ becomes 19 * r5, - // r6 * 2³⁰⁶ becomes 19 * r6 * 2⁵¹, etc. - // - // Reduction can be carried out simultaneously to multiplication. For - // example, we do not compute r5: whenever the result of a multiplication - // belongs to r5, like a1b4, we multiply it by 19 and add the result to r0. - // - // a4b0 a3b0 a2b0 a1b0 a0b0 + - // a3b1 a2b1 a1b1 a0b1 19×a4b1 + - // a2b2 a1b2 a0b2 19×a4b2 19×a3b2 + - // a1b3 a0b3 19×a4b3 19×a3b3 19×a2b3 + - // a0b4 19×a4b4 19×a3b4 19×a2b4 19×a1b4 = - // -------------------------------------- - // r4 r3 r2 r1 r0 - // - // Finally we add up the columns into wide, overlapping limbs. - - a1_19 := a1 * 19 - a2_19 := a2 * 19 - a3_19 := a3 * 19 - a4_19 := a4 * 19 - - // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) - r0 := mul64(a0, b0) - r0 = addMul64(r0, a1_19, b4) - r0 = addMul64(r0, a2_19, b3) - r0 = addMul64(r0, a3_19, b2) - r0 = addMul64(r0, a4_19, b1) - - // r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2) - r1 := mul64(a0, b1) - r1 = addMul64(r1, a1, b0) - r1 = addMul64(r1, a2_19, b4) - r1 = addMul64(r1, a3_19, b3) - r1 = addMul64(r1, a4_19, b2) - - // r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3) - r2 := mul64(a0, b2) - r2 = addMul64(r2, a1, b1) - r2 = addMul64(r2, a2, b0) - r2 = addMul64(r2, a3_19, b4) - r2 = addMul64(r2, a4_19, b3) - - // r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4 - r3 := mul64(a0, b3) - r3 = addMul64(r3, a1, b2) - r3 = addMul64(r3, a2, b1) - r3 = addMul64(r3, a3, b0) - r3 = addMul64(r3, a4_19, b4) - - // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 - r4 := mul64(a0, b4) - r4 = addMul64(r4, a1, b3) - r4 = addMul64(r4, a2, b2) - r4 = addMul64(r4, a3, b1) - r4 = addMul64(r4, a4, b0) - - // After the multiplication, we need to reduce (carry) the five coefficients - // to obtain a result with limbs that are at most slightly larger than 2⁵¹, - // to respect the Element invariant. - // - // Overall, the reduction works the same as carryPropagate, except with - // wider inputs: we take the carry for each coefficient by shifting it right - // by 51, and add it to the limb above it. The top carry is multiplied by 19 - // according to the reduction identity and added to the lowest limb. - // - // The largest coefficient (r0) will be at most 111 bits, which guarantees - // that all carries are at most 111 - 51 = 60 bits, which fits in a uint64. - // - // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) - // r0 < 2⁵²×2⁵² + 19×(2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵²) - // r0 < (1 + 19 × 4) × 2⁵² × 2⁵² - // r0 < 2⁷ × 2⁵² × 2⁵² - // r0 < 2¹¹¹ - // - // Moreover, the top coefficient (r4) is at most 107 bits, so c4 is at most - // 56 bits, and c4 * 19 is at most 61 bits, which again fits in a uint64 and - // allows us to easily apply the reduction identity. - // - // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 - // r4 < 5 × 2⁵² × 2⁵² - // r4 < 2¹⁰⁷ - // - - c0 := shiftRightBy51(r0) - c1 := shiftRightBy51(r1) - c2 := shiftRightBy51(r2) - c3 := shiftRightBy51(r3) - c4 := shiftRightBy51(r4) - - rr0 := r0.lo&maskLow51Bits + c4*19 - rr1 := r1.lo&maskLow51Bits + c0 - rr2 := r2.lo&maskLow51Bits + c1 - rr3 := r3.lo&maskLow51Bits + c2 - rr4 := r4.lo&maskLow51Bits + c3 - - // Now all coefficients fit into 64-bit registers but are still too large to - // be passed around as a Element. We therefore do one last carry chain, - // where the carries will be small enough to fit in the wiggle room above 2⁵¹. - *v = Element{rr0, rr1, rr2, rr3, rr4} - v.carryPropagate() -} - -func feSquareGeneric(v, a *Element) { - l0 := a.l0 - l1 := a.l1 - l2 := a.l2 - l3 := a.l3 - l4 := a.l4 - - // Squaring works precisely like multiplication above, but thanks to its - // symmetry we get to group a few terms together. - // - // l4 l3 l2 l1 l0 x - // l4 l3 l2 l1 l0 = - // ------------------------ - // l4l0 l3l0 l2l0 l1l0 l0l0 + - // l4l1 l3l1 l2l1 l1l1 l0l1 + - // l4l2 l3l2 l2l2 l1l2 l0l2 + - // l4l3 l3l3 l2l3 l1l3 l0l3 + - // l4l4 l3l4 l2l4 l1l4 l0l4 = - // ---------------------------------------------- - // r8 r7 r6 r5 r4 r3 r2 r1 r0 - // - // l4l0 l3l0 l2l0 l1l0 l0l0 + - // l3l1 l2l1 l1l1 l0l1 19×l4l1 + - // l2l2 l1l2 l0l2 19×l4l2 19×l3l2 + - // l1l3 l0l3 19×l4l3 19×l3l3 19×l2l3 + - // l0l4 19×l4l4 19×l3l4 19×l2l4 19×l1l4 = - // -------------------------------------- - // r4 r3 r2 r1 r0 - // - // With precomputed 2×, 19×, and 2×19× terms, we can compute each limb with - // only three Mul64 and four Add64, instead of five and eight. - - l0_2 := l0 * 2 - l1_2 := l1 * 2 - - l1_38 := l1 * 38 - l2_38 := l2 * 38 - l3_38 := l3 * 38 - - l3_19 := l3 * 19 - l4_19 := l4 * 19 - - // r0 = l0×l0 + 19×(l1×l4 + l2×l3 + l3×l2 + l4×l1) = l0×l0 + 19×2×(l1×l4 + l2×l3) - r0 := mul64(l0, l0) - r0 = addMul64(r0, l1_38, l4) - r0 = addMul64(r0, l2_38, l3) - - // r1 = l0×l1 + l1×l0 + 19×(l2×l4 + l3×l3 + l4×l2) = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3 - r1 := mul64(l0_2, l1) - r1 = addMul64(r1, l2_38, l4) - r1 = addMul64(r1, l3_19, l3) - - // r2 = l0×l2 + l1×l1 + l2×l0 + 19×(l3×l4 + l4×l3) = 2×l0×l2 + l1×l1 + 19×2×l3×l4 - r2 := mul64(l0_2, l2) - r2 = addMul64(r2, l1, l1) - r2 = addMul64(r2, l3_38, l4) - - // r3 = l0×l3 + l1×l2 + l2×l1 + l3×l0 + 19×l4×l4 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4 - r3 := mul64(l0_2, l3) - r3 = addMul64(r3, l1_2, l2) - r3 = addMul64(r3, l4_19, l4) - - // r4 = l0×l4 + l1×l3 + l2×l2 + l3×l1 + l4×l0 = 2×l0×l4 + 2×l1×l3 + l2×l2 - r4 := mul64(l0_2, l4) - r4 = addMul64(r4, l1_2, l3) - r4 = addMul64(r4, l2, l2) - - c0 := shiftRightBy51(r0) - c1 := shiftRightBy51(r1) - c2 := shiftRightBy51(r2) - c3 := shiftRightBy51(r3) - c4 := shiftRightBy51(r4) - - rr0 := r0.lo&maskLow51Bits + c4*19 - rr1 := r1.lo&maskLow51Bits + c0 - rr2 := r2.lo&maskLow51Bits + c1 - rr3 := r3.lo&maskLow51Bits + c2 - rr4 := r4.lo&maskLow51Bits + c3 - - *v = Element{rr0, rr1, rr2, rr3, rr4} - v.carryPropagate() -} - -// carryPropagateGeneric brings the limbs below 52 bits by applying the reduction -// identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry. TODO inline -func (v *Element) carryPropagateGeneric() *Element { - c0 := v.l0 >> 51 - c1 := v.l1 >> 51 - c2 := v.l2 >> 51 - c3 := v.l3 >> 51 - c4 := v.l4 >> 51 - - v.l0 = v.l0&maskLow51Bits + c4*19 - v.l1 = v.l1&maskLow51Bits + c0 - v.l2 = v.l2&maskLow51Bits + c1 - v.l3 = v.l3&maskLow51Bits + c2 - v.l4 = v.l4&maskLow51Bits + c3 - - return v -} diff --git a/curve25519/internal/field/fe_test.go b/curve25519/internal/field/fe_test.go deleted file mode 100644 index b484459ff2..0000000000 --- a/curve25519/internal/field/fe_test.go +++ /dev/null @@ -1,558 +0,0 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package field - -import ( - "bytes" - "crypto/rand" - "encoding/hex" - "io" - "math/big" - "math/bits" - mathrand "math/rand" - "reflect" - "testing" - "testing/quick" -) - -func (v Element) String() string { - return hex.EncodeToString(v.Bytes()) -} - -// quickCheckConfig1024 will make each quickcheck test run (1024 * -quickchecks) -// times. The default value of -quickchecks is 100. -var quickCheckConfig1024 = &quick.Config{MaxCountScale: 1 << 10} - -func generateFieldElement(rand *mathrand.Rand) Element { - const maskLow52Bits = (1 << 52) - 1 - return Element{ - rand.Uint64() & maskLow52Bits, - rand.Uint64() & maskLow52Bits, - rand.Uint64() & maskLow52Bits, - rand.Uint64() & maskLow52Bits, - rand.Uint64() & maskLow52Bits, - } -} - -// weirdLimbs can be combined to generate a range of edge-case field elements. -// 0 and -1 are intentionally more weighted, as they combine well. -var ( - weirdLimbs51 = []uint64{ - 0, 0, 0, 0, - 1, - 19 - 1, - 19, - 0x2aaaaaaaaaaaa, - 0x5555555555555, - (1 << 51) - 20, - (1 << 51) - 19, - (1 << 51) - 1, (1 << 51) - 1, - (1 << 51) - 1, (1 << 51) - 1, - } - weirdLimbs52 = []uint64{ - 0, 0, 0, 0, 0, 0, - 1, - 19 - 1, - 19, - 0x2aaaaaaaaaaaa, - 0x5555555555555, - (1 << 51) - 20, - (1 << 51) - 19, - (1 << 51) - 1, (1 << 51) - 1, - (1 << 51) - 1, (1 << 51) - 1, - (1 << 51) - 1, (1 << 51) - 1, - 1 << 51, - (1 << 51) + 1, - (1 << 52) - 19, - (1 << 52) - 1, - } -) - -func generateWeirdFieldElement(rand *mathrand.Rand) Element { - return Element{ - weirdLimbs52[rand.Intn(len(weirdLimbs52))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - } -} - -func (Element) Generate(rand *mathrand.Rand, size int) reflect.Value { - if rand.Intn(2) == 0 { - return reflect.ValueOf(generateWeirdFieldElement(rand)) - } - return reflect.ValueOf(generateFieldElement(rand)) -} - -// isInBounds returns whether the element is within the expected bit size bounds -// after a light reduction. -func isInBounds(x *Element) bool { - return bits.Len64(x.l0) <= 52 && - bits.Len64(x.l1) <= 52 && - bits.Len64(x.l2) <= 52 && - bits.Len64(x.l3) <= 52 && - bits.Len64(x.l4) <= 52 -} - -func TestMultiplyDistributesOverAdd(t *testing.T) { - multiplyDistributesOverAdd := func(x, y, z Element) bool { - // Compute t1 = (x+y)*z - t1 := new(Element) - t1.Add(&x, &y) - t1.Multiply(t1, &z) - - // Compute t2 = x*z + y*z - t2 := new(Element) - t3 := new(Element) - t2.Multiply(&x, &z) - t3.Multiply(&y, &z) - t2.Add(t2, t3) - - return t1.Equal(t2) == 1 && isInBounds(t1) && isInBounds(t2) - } - - if err := quick.Check(multiplyDistributesOverAdd, quickCheckConfig1024); err != nil { - t.Error(err) - } -} - -func TestMul64to128(t *testing.T) { - a := uint64(5) - b := uint64(5) - r := mul64(a, b) - if r.lo != 0x19 || r.hi != 0 { - t.Errorf("lo-range wide mult failed, got %d + %d*(2**64)", r.lo, r.hi) - } - - a = uint64(18014398509481983) // 2^54 - 1 - b = uint64(18014398509481983) // 2^54 - 1 - r = mul64(a, b) - if r.lo != 0xff80000000000001 || r.hi != 0xfffffffffff { - t.Errorf("hi-range wide mult failed, got %d + %d*(2**64)", r.lo, r.hi) - } - - a = uint64(1125899906842661) - b = uint64(2097155) - r = mul64(a, b) - r = addMul64(r, a, b) - r = addMul64(r, a, b) - r = addMul64(r, a, b) - r = addMul64(r, a, b) - if r.lo != 16888498990613035 || r.hi != 640 { - t.Errorf("wrong answer: %d + %d*(2**64)", r.lo, r.hi) - } -} - -func TestSetBytesRoundTrip(t *testing.T) { - f1 := func(in [32]byte, fe Element) bool { - fe.SetBytes(in[:]) - - // Mask the most significant bit as it's ignored by SetBytes. (Now - // instead of earlier so we check the masking in SetBytes is working.) - in[len(in)-1] &= (1 << 7) - 1 - - return bytes.Equal(in[:], fe.Bytes()) && isInBounds(&fe) - } - if err := quick.Check(f1, nil); err != nil { - t.Errorf("failed bytes->FE->bytes round-trip: %v", err) - } - - f2 := func(fe, r Element) bool { - r.SetBytes(fe.Bytes()) - - // Intentionally not using Equal not to go through Bytes again. - // Calling reduce because both Generate and SetBytes can produce - // non-canonical representations. - fe.reduce() - r.reduce() - return fe == r - } - if err := quick.Check(f2, nil); err != nil { - t.Errorf("failed FE->bytes->FE round-trip: %v", err) - } - - // Check some fixed vectors from dalek - type feRTTest struct { - fe Element - b []byte - } - var tests = []feRTTest{ - { - fe: Element{358744748052810, 1691584618240980, 977650209285361, 1429865912637724, 560044844278676}, - b: []byte{74, 209, 69, 197, 70, 70, 161, 222, 56, 226, 229, 19, 112, 60, 25, 92, 187, 74, 222, 56, 50, 153, 51, 233, 40, 74, 57, 6, 160, 185, 213, 31}, - }, - { - fe: Element{84926274344903, 473620666599931, 365590438845504, 1028470286882429, 2146499180330972}, - b: []byte{199, 23, 106, 112, 61, 77, 216, 79, 186, 60, 11, 118, 13, 16, 103, 15, 42, 32, 83, 250, 44, 57, 204, 198, 78, 199, 253, 119, 146, 172, 3, 122}, - }, - } - - for _, tt := range tests { - b := tt.fe.Bytes() - if !bytes.Equal(b, tt.b) || new(Element).SetBytes(tt.b).Equal(&tt.fe) != 1 { - t.Errorf("Failed fixed roundtrip: %v", tt) - } - } -} - -func swapEndianness(buf []byte) []byte { - for i := 0; i < len(buf)/2; i++ { - buf[i], buf[len(buf)-i-1] = buf[len(buf)-i-1], buf[i] - } - return buf -} - -func TestBytesBigEquivalence(t *testing.T) { - f1 := func(in [32]byte, fe, fe1 Element) bool { - fe.SetBytes(in[:]) - - in[len(in)-1] &= (1 << 7) - 1 // mask the most significant bit - b := new(big.Int).SetBytes(swapEndianness(in[:])) - fe1.fromBig(b) - - if fe != fe1 { - return false - } - - buf := make([]byte, 32) // pad with zeroes - copy(buf, swapEndianness(fe1.toBig().Bytes())) - - return bytes.Equal(fe.Bytes(), buf) && isInBounds(&fe) && isInBounds(&fe1) - } - if err := quick.Check(f1, nil); err != nil { - t.Error(err) - } -} - -// fromBig sets v = n, and returns v. The bit length of n must not exceed 256. -func (v *Element) fromBig(n *big.Int) *Element { - if n.BitLen() > 32*8 { - panic("edwards25519: invalid field element input size") - } - - buf := make([]byte, 0, 32) - for _, word := range n.Bits() { - for i := 0; i < bits.UintSize; i += 8 { - if len(buf) >= cap(buf) { - break - } - buf = append(buf, byte(word)) - word >>= 8 - } - } - - return v.SetBytes(buf[:32]) -} - -func (v *Element) fromDecimal(s string) *Element { - n, ok := new(big.Int).SetString(s, 10) - if !ok { - panic("not a valid decimal: " + s) - } - return v.fromBig(n) -} - -// toBig returns v as a big.Int. -func (v *Element) toBig() *big.Int { - buf := v.Bytes() - - words := make([]big.Word, 32*8/bits.UintSize) - for n := range words { - for i := 0; i < bits.UintSize; i += 8 { - if len(buf) == 0 { - break - } - words[n] |= big.Word(buf[0]) << big.Word(i) - buf = buf[1:] - } - } - - return new(big.Int).SetBits(words) -} - -func TestDecimalConstants(t *testing.T) { - sqrtM1String := "19681161376707505956807079304988542015446066515923890162744021073123829784752" - if exp := new(Element).fromDecimal(sqrtM1String); sqrtM1.Equal(exp) != 1 { - t.Errorf("sqrtM1 is %v, expected %v", sqrtM1, exp) - } - // d is in the parent package, and we don't want to expose d or fromDecimal. - // dString := "37095705934669439343138083508754565189542113879843219016388785533085940283555" - // if exp := new(Element).fromDecimal(dString); d.Equal(exp) != 1 { - // t.Errorf("d is %v, expected %v", d, exp) - // } -} - -func TestSetBytesRoundTripEdgeCases(t *testing.T) { - // TODO: values close to 0, close to 2^255-19, between 2^255-19 and 2^255-1, - // and between 2^255 and 2^256-1. Test both the documented SetBytes - // behavior, and that Bytes reduces them. -} - -// Tests self-consistency between Multiply and Square. -func TestConsistency(t *testing.T) { - var x Element - var x2, x2sq Element - - x = Element{1, 1, 1, 1, 1} - x2.Multiply(&x, &x) - x2sq.Square(&x) - - if x2 != x2sq { - t.Fatalf("all ones failed\nmul: %x\nsqr: %x\n", x2, x2sq) - } - - var bytes [32]byte - - _, err := io.ReadFull(rand.Reader, bytes[:]) - if err != nil { - t.Fatal(err) - } - x.SetBytes(bytes[:]) - - x2.Multiply(&x, &x) - x2sq.Square(&x) - - if x2 != x2sq { - t.Fatalf("all ones failed\nmul: %x\nsqr: %x\n", x2, x2sq) - } -} - -func TestEqual(t *testing.T) { - x := Element{1, 1, 1, 1, 1} - y := Element{5, 4, 3, 2, 1} - - eq := x.Equal(&x) - if eq != 1 { - t.Errorf("wrong about equality") - } - - eq = x.Equal(&y) - if eq != 0 { - t.Errorf("wrong about inequality") - } -} - -func TestInvert(t *testing.T) { - x := Element{1, 1, 1, 1, 1} - one := Element{1, 0, 0, 0, 0} - var xinv, r Element - - xinv.Invert(&x) - r.Multiply(&x, &xinv) - r.reduce() - - if one != r { - t.Errorf("inversion identity failed, got: %x", r) - } - - var bytes [32]byte - - _, err := io.ReadFull(rand.Reader, bytes[:]) - if err != nil { - t.Fatal(err) - } - x.SetBytes(bytes[:]) - - xinv.Invert(&x) - r.Multiply(&x, &xinv) - r.reduce() - - if one != r { - t.Errorf("random inversion identity failed, got: %x for field element %x", r, x) - } - - zero := Element{} - x.Set(&zero) - if xx := xinv.Invert(&x); xx != &xinv { - t.Errorf("inverting zero did not return the receiver") - } else if xinv.Equal(&zero) != 1 { - t.Errorf("inverting zero did not return zero") - } -} - -func TestSelectSwap(t *testing.T) { - a := Element{358744748052810, 1691584618240980, 977650209285361, 1429865912637724, 560044844278676} - b := Element{84926274344903, 473620666599931, 365590438845504, 1028470286882429, 2146499180330972} - - var c, d Element - - c.Select(&a, &b, 1) - d.Select(&a, &b, 0) - - if c.Equal(&a) != 1 || d.Equal(&b) != 1 { - t.Errorf("Select failed") - } - - c.Swap(&d, 0) - - if c.Equal(&a) != 1 || d.Equal(&b) != 1 { - t.Errorf("Swap failed") - } - - c.Swap(&d, 1) - - if c.Equal(&b) != 1 || d.Equal(&a) != 1 { - t.Errorf("Swap failed") - } -} - -func TestMult32(t *testing.T) { - mult32EquivalentToMul := func(x Element, y uint32) bool { - t1 := new(Element) - for i := 0; i < 100; i++ { - t1.Mult32(&x, y) - } - - ty := new(Element) - ty.l0 = uint64(y) - - t2 := new(Element) - for i := 0; i < 100; i++ { - t2.Multiply(&x, ty) - } - - return t1.Equal(t2) == 1 && isInBounds(t1) && isInBounds(t2) - } - - if err := quick.Check(mult32EquivalentToMul, quickCheckConfig1024); err != nil { - t.Error(err) - } -} - -func TestSqrtRatio(t *testing.T) { - // From draft-irtf-cfrg-ristretto255-decaf448-00, Appendix A.4. - type test struct { - u, v string - wasSquare int - r string - } - var tests = []test{ - // If u is 0, the function is defined to return (0, TRUE), even if v - // is zero. Note that where used in this package, the denominator v - // is never zero. - { - "0000000000000000000000000000000000000000000000000000000000000000", - "0000000000000000000000000000000000000000000000000000000000000000", - 1, "0000000000000000000000000000000000000000000000000000000000000000", - }, - // 0/1 == 0² - { - "0000000000000000000000000000000000000000000000000000000000000000", - "0100000000000000000000000000000000000000000000000000000000000000", - 1, "0000000000000000000000000000000000000000000000000000000000000000", - }, - // If u is non-zero and v is zero, defined to return (0, FALSE). - { - "0100000000000000000000000000000000000000000000000000000000000000", - "0000000000000000000000000000000000000000000000000000000000000000", - 0, "0000000000000000000000000000000000000000000000000000000000000000", - }, - // 2/1 is not square in this field. - { - "0200000000000000000000000000000000000000000000000000000000000000", - "0100000000000000000000000000000000000000000000000000000000000000", - 0, "3c5ff1b5d8e4113b871bd052f9e7bcd0582804c266ffb2d4f4203eb07fdb7c54", - }, - // 4/1 == 2² - { - "0400000000000000000000000000000000000000000000000000000000000000", - "0100000000000000000000000000000000000000000000000000000000000000", - 1, "0200000000000000000000000000000000000000000000000000000000000000", - }, - // 1/4 == (2⁻¹)² == (2^(p-2))² per Euler's theorem - { - "0100000000000000000000000000000000000000000000000000000000000000", - "0400000000000000000000000000000000000000000000000000000000000000", - 1, "f6ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff3f", - }, - } - - for i, tt := range tests { - u := new(Element).SetBytes(decodeHex(tt.u)) - v := new(Element).SetBytes(decodeHex(tt.v)) - want := new(Element).SetBytes(decodeHex(tt.r)) - got, wasSquare := new(Element).SqrtRatio(u, v) - if got.Equal(want) == 0 || wasSquare != tt.wasSquare { - t.Errorf("%d: got (%v, %v), want (%v, %v)", i, got, wasSquare, want, tt.wasSquare) - } - } -} - -func TestCarryPropagate(t *testing.T) { - asmLikeGeneric := func(a [5]uint64) bool { - t1 := &Element{a[0], a[1], a[2], a[3], a[4]} - t2 := &Element{a[0], a[1], a[2], a[3], a[4]} - - t1.carryPropagate() - t2.carryPropagateGeneric() - - if *t1 != *t2 { - t.Logf("got: %#v,\nexpected: %#v", t1, t2) - } - - return *t1 == *t2 && isInBounds(t2) - } - - if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil { - t.Error(err) - } - - if !asmLikeGeneric([5]uint64{0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}) { - t.Errorf("failed for {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}") - } -} - -func TestFeSquare(t *testing.T) { - asmLikeGeneric := func(a Element) bool { - t1 := a - t2 := a - - feSquareGeneric(&t1, &t1) - feSquare(&t2, &t2) - - if t1 != t2 { - t.Logf("got: %#v,\nexpected: %#v", t1, t2) - } - - return t1 == t2 && isInBounds(&t2) - } - - if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil { - t.Error(err) - } -} - -func TestFeMul(t *testing.T) { - asmLikeGeneric := func(a, b Element) bool { - a1 := a - a2 := a - b1 := b - b2 := b - - feMulGeneric(&a1, &a1, &b1) - feMul(&a2, &a2, &b2) - - if a1 != a2 || b1 != b2 { - t.Logf("got: %#v,\nexpected: %#v", a1, a2) - t.Logf("got: %#v,\nexpected: %#v", b1, b2) - } - - return a1 == a2 && isInBounds(&a2) && - b1 == b2 && isInBounds(&b2) - } - - if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil { - t.Error(err) - } -} - -func decodeHex(s string) []byte { - b, err := hex.DecodeString(s) - if err != nil { - panic(err) - } - return b -} diff --git a/curve25519/internal/field/sync.checkpoint b/curve25519/internal/field/sync.checkpoint deleted file mode 100644 index e3685f95ca..0000000000 --- a/curve25519/internal/field/sync.checkpoint +++ /dev/null @@ -1 +0,0 @@ -b0c49ae9f59d233526f8934262c5bbbe14d4358d diff --git a/curve25519/internal/field/sync.sh b/curve25519/internal/field/sync.sh deleted file mode 100755 index 1ba22a8b4c..0000000000 --- a/curve25519/internal/field/sync.sh +++ /dev/null @@ -1,19 +0,0 @@ -#! /bin/bash -set -euo pipefail - -cd "$(git rev-parse --show-toplevel)" - -STD_PATH=src/crypto/ed25519/internal/edwards25519/field -LOCAL_PATH=curve25519/internal/field -LAST_SYNC_REF=$(cat $LOCAL_PATH/sync.checkpoint) - -git fetch https://go.googlesource.com/go master - -if git diff --quiet $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH; then - echo "No changes." -else - NEW_REF=$(git rev-parse FETCH_HEAD | tee $LOCAL_PATH/sync.checkpoint) - echo "Applying changes from $LAST_SYNC_REF to $NEW_REF..." - git diff $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH | \ - git apply -3 --directory=$LOCAL_PATH -fi diff --git a/ed25519/ed25519.go b/ed25519/ed25519.go index a7828345fc..59b3a95a7d 100644 --- a/ed25519/ed25519.go +++ b/ed25519/ed25519.go @@ -11,9 +11,7 @@ // operations with the same key more efficient. This package refers to the RFC // 8032 private key as the “seed”. // -// Beginning with Go 1.13, the functionality of this package was moved to the -// standard library as crypto/ed25519. This package only acts as a compatibility -// wrapper. +// This package is a wrapper around the standard library crypto/ed25519 package. package ed25519 import ( diff --git a/go.mod b/go.mod index cc1cf7ebed..bef246f5ec 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module golang.org/x/crypto -go 1.18 +go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore diff --git a/hkdf/hkdf.go b/hkdf/hkdf.go index f4ded5fee2..3bee66294e 100644 --- a/hkdf/hkdf.go +++ b/hkdf/hkdf.go @@ -8,7 +8,7 @@ // HKDF is a cryptographic key derivation function (KDF) with the goal of // expanding limited input keying material into one or more cryptographically // strong secret keys. -package hkdf // import "golang.org/x/crypto/hkdf" +package hkdf import ( "crypto/hmac" diff --git a/internal/wycheproof/ecdh_stdlib_test.go b/internal/wycheproof/ecdh_stdlib_test.go index f7abd6b2e8..24a83e245b 100644 --- a/internal/wycheproof/ecdh_stdlib_test.go +++ b/internal/wycheproof/ecdh_stdlib_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build go1.20 - package wycheproof import ( diff --git a/internal/wycheproof/eddsa_test.go b/internal/wycheproof/eddsa_test.go index 9b97490955..a74f343f02 100644 --- a/internal/wycheproof/eddsa_test.go +++ b/internal/wycheproof/eddsa_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build go1.13 - package wycheproof import ( diff --git a/md4/md4.go b/md4/md4.go index d1911c2e86..7d9281e025 100644 --- a/md4/md4.go +++ b/md4/md4.go @@ -7,7 +7,7 @@ // Deprecated: MD4 is cryptographically broken and should only be used // where compatibility with legacy systems, not security, is the goal. Instead, // use a secure hash like SHA-256 (from crypto/sha256). -package md4 // import "golang.org/x/crypto/md4" +package md4 import ( "crypto" diff --git a/nacl/box/box.go b/nacl/box/box.go index 7f3b830ee2..357bdc773c 100644 --- a/nacl/box/box.go +++ b/nacl/box/box.go @@ -35,7 +35,7 @@ Anonymous sealing/opening is an extension of NaCl defined by and interoperable with libsodium: https://libsodium.gitbook.io/doc/public-key_cryptography/sealed_boxes. */ -package box // import "golang.org/x/crypto/nacl/box" +package box import ( cryptorand "crypto/rand" diff --git a/nacl/secretbox/secretbox.go b/nacl/secretbox/secretbox.go index f3c3242a04..1fe600ad03 100644 --- a/nacl/secretbox/secretbox.go +++ b/nacl/secretbox/secretbox.go @@ -32,7 +32,7 @@ chunk size. This package is interoperable with NaCl: https://nacl.cr.yp.to/secretbox.html. */ -package secretbox // import "golang.org/x/crypto/nacl/secretbox" +package secretbox import ( "golang.org/x/crypto/internal/alias" diff --git a/ocsp/ocsp.go b/ocsp/ocsp.go index bf2259537d..e6c645e7ce 100644 --- a/ocsp/ocsp.go +++ b/ocsp/ocsp.go @@ -5,7 +5,7 @@ // Package ocsp parses OCSP responses as specified in RFC 2560. OCSP responses // are signed messages attesting to the validity of a certificate for a small // period of time. This is used to manage revocation for X.509 certificates. -package ocsp // import "golang.org/x/crypto/ocsp" +package ocsp import ( "crypto" diff --git a/ocsp/ocsp_test.go b/ocsp/ocsp_test.go index a23d29bd1f..6dc273ec28 100644 --- a/ocsp/ocsp_test.go +++ b/ocsp/ocsp_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build go1.7 - package ocsp import ( diff --git a/openpgp/armor/armor.go b/openpgp/armor/armor.go index 8907183ec0..e664d127cb 100644 --- a/openpgp/armor/armor.go +++ b/openpgp/armor/armor.go @@ -10,14 +10,15 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package armor // import "golang.org/x/crypto/openpgp/armor" +package armor import ( "bufio" "bytes" "encoding/base64" - "golang.org/x/crypto/openpgp/errors" "io" + + "golang.org/x/crypto/openpgp/errors" ) // A Block represents an OpenPGP armored structure. diff --git a/openpgp/clearsign/clearsign.go b/openpgp/clearsign/clearsign.go index 644b2e078b..cea48efdcd 100644 --- a/openpgp/clearsign/clearsign.go +++ b/openpgp/clearsign/clearsign.go @@ -13,7 +13,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package clearsign // import "golang.org/x/crypto/openpgp/clearsign" +package clearsign import ( "bufio" diff --git a/openpgp/elgamal/elgamal.go b/openpgp/elgamal/elgamal.go index 743b35a120..f922bdbcaa 100644 --- a/openpgp/elgamal/elgamal.go +++ b/openpgp/elgamal/elgamal.go @@ -16,7 +16,7 @@ // https://golang.org/issue/44226), and ElGamal in the OpenPGP ecosystem has // compatibility and security issues (see https://eprint.iacr.org/2021/923). // Moreover, this package doesn't protect against side-channel attacks. -package elgamal // import "golang.org/x/crypto/openpgp/elgamal" +package elgamal import ( "crypto/rand" diff --git a/openpgp/errors/errors.go b/openpgp/errors/errors.go index 1d7a0ea05a..a328749471 100644 --- a/openpgp/errors/errors.go +++ b/openpgp/errors/errors.go @@ -9,7 +9,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package errors // import "golang.org/x/crypto/openpgp/errors" +package errors import ( "strconv" diff --git a/openpgp/packet/packet.go b/openpgp/packet/packet.go index 0a19794a8e..a84a1a214e 100644 --- a/openpgp/packet/packet.go +++ b/openpgp/packet/packet.go @@ -10,7 +10,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package packet // import "golang.org/x/crypto/openpgp/packet" +package packet import ( "bufio" diff --git a/openpgp/read.go b/openpgp/read.go index 48a8931468..cff3db9196 100644 --- a/openpgp/read.go +++ b/openpgp/read.go @@ -9,7 +9,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package openpgp // import "golang.org/x/crypto/openpgp" +package openpgp import ( "crypto" diff --git a/openpgp/s2k/s2k.go b/openpgp/s2k/s2k.go index f53244a1c7..fa1a919079 100644 --- a/openpgp/s2k/s2k.go +++ b/openpgp/s2k/s2k.go @@ -10,7 +10,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package s2k // import "golang.org/x/crypto/openpgp/s2k" +package s2k import ( "crypto" diff --git a/otr/otr.go b/otr/otr.go index 29121e9bb6..6210c1ae61 100644 --- a/otr/otr.go +++ b/otr/otr.go @@ -8,7 +8,7 @@ // The version of OTR implemented by this package has been deprecated // (https://bugs.otr.im/lib/libotr/issues/140). An implementation of OTRv3 is // available at https://github.com/coyim/otr3. -package otr // import "golang.org/x/crypto/otr" +package otr import ( "bytes" diff --git a/pbkdf2/pbkdf2.go b/pbkdf2/pbkdf2.go index 904b57e01d..28cd99c7f3 100644 --- a/pbkdf2/pbkdf2.go +++ b/pbkdf2/pbkdf2.go @@ -16,7 +16,7 @@ Hash Functions SHA-1, SHA-224, SHA-256, SHA-384 and SHA-512 for HMAC. To choose, you can pass the `New` functions from the different SHA packages to pbkdf2.Key. */ -package pbkdf2 // import "golang.org/x/crypto/pbkdf2" +package pbkdf2 import ( "crypto/hmac" diff --git a/poly1305/poly1305_compat.go b/poly1305/poly1305_compat.go index dd975a32c9..cb9207f300 100644 --- a/poly1305/poly1305_compat.go +++ b/poly1305/poly1305_compat.go @@ -21,7 +21,7 @@ // For encryption, use the full ChaCha20-Poly1305 construction implemented by // golang.org/x/crypto/chacha20poly1305. For authentication, use a general // purpose MAC such as HMAC implemented by crypto/hmac. -package poly1305 // import "golang.org/x/crypto/poly1305" +package poly1305 import "golang.org/x/crypto/internal/poly1305" diff --git a/ripemd160/ripemd160.go b/ripemd160/ripemd160.go index cf3eeb158a..b6d33ef074 100644 --- a/ripemd160/ripemd160.go +++ b/ripemd160/ripemd160.go @@ -7,7 +7,7 @@ // Deprecated: RIPEMD-160 is a legacy hash and should not be used for new // applications. Also, this package does not and will not provide an optimized // implementation. Instead, use a modern hash like SHA-256 (from crypto/sha256). -package ripemd160 // import "golang.org/x/crypto/ripemd160" +package ripemd160 // RIPEMD-160 is designed by Hans Dobbertin, Antoon Bosselaers, and Bart // Preneel with specifications available at: diff --git a/salsa20/salsa/hsalsa20.go b/salsa20/salsa/hsalsa20.go index 3fd05b2751..3685b34458 100644 --- a/salsa20/salsa/hsalsa20.go +++ b/salsa20/salsa/hsalsa20.go @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. // Package salsa provides low-level access to functions in the Salsa family. -package salsa // import "golang.org/x/crypto/salsa20/salsa" +package salsa import "math/bits" diff --git a/salsa20/salsa20.go b/salsa20/salsa20.go index 8f4f896c70..e75c9342a8 100644 --- a/salsa20/salsa20.go +++ b/salsa20/salsa20.go @@ -19,7 +19,7 @@ This package also implements XSalsa20: a version of Salsa20 with a 24-byte nonce as specified in https://cr.yp.to/snuffle/xsalsa-20081128.pdf. Simply passing a 24-byte slice as the nonce triggers XSalsa20. */ -package salsa20 // import "golang.org/x/crypto/salsa20" +package salsa20 // TODO(agl): implement XORKeyStream12 and XORKeyStream8 - the reduced round variants of Salsa20. diff --git a/scrypt/scrypt.go b/scrypt/scrypt.go index c971a99fa6..76fa40fb20 100644 --- a/scrypt/scrypt.go +++ b/scrypt/scrypt.go @@ -5,7 +5,7 @@ // Package scrypt implements the scrypt key derivation function as defined in // Colin Percival's paper "Stronger Key Derivation via Sequential Memory-Hard // Functions" (https://www.tarsnap.com/scrypt/scrypt.pdf). -package scrypt // import "golang.org/x/crypto/scrypt" +package scrypt import ( "crypto/sha256" diff --git a/sha3/doc.go b/sha3/doc.go index decd8cf9bf..7e02309070 100644 --- a/sha3/doc.go +++ b/sha3/doc.go @@ -59,4 +59,4 @@ // They produce output of the same length, with the same security strengths // against all attacks. This means, in particular, that SHA3-256 only has // 128-bit collision resistance, because its output length is 32 bytes. -package sha3 // import "golang.org/x/crypto/sha3" +package sha3 diff --git a/sha3/hashes.go b/sha3/hashes.go index 5eae6cb922..c544b29e5f 100644 --- a/sha3/hashes.go +++ b/sha3/hashes.go @@ -9,6 +9,7 @@ package sha3 // bytes. import ( + "crypto" "hash" ) @@ -40,6 +41,13 @@ func New512() hash.Hash { return new512() } +func init() { + crypto.RegisterHash(crypto.SHA3_224, New224) + crypto.RegisterHash(crypto.SHA3_256, New256) + crypto.RegisterHash(crypto.SHA3_384, New384) + crypto.RegisterHash(crypto.SHA3_512, New512) +} + func new224Generic() *state { return &state{rate: 144, outputLen: 28, dsbyte: 0x06} } diff --git a/sha3/register.go b/sha3/register.go deleted file mode 100644 index addfd5049b..0000000000 --- a/sha3/register.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.4 - -package sha3 - -import ( - "crypto" -) - -func init() { - crypto.RegisterHash(crypto.SHA3_224, New224) - crypto.RegisterHash(crypto.SHA3_256, New256) - crypto.RegisterHash(crypto.SHA3_384, New384) - crypto.RegisterHash(crypto.SHA3_512, New512) -} diff --git a/ssh/agent/client.go b/ssh/agent/client.go index fecba8eb38..106708d289 100644 --- a/ssh/agent/client.go +++ b/ssh/agent/client.go @@ -10,7 +10,7 @@ // References: // // [PROTOCOL.agent]: https://tools.ietf.org/html/draft-miller-ssh-agent-00 -package agent // import "golang.org/x/crypto/ssh/agent" +package agent import ( "bytes" diff --git a/ssh/doc.go b/ssh/doc.go index edbe63340d..f5d352fe3a 100644 --- a/ssh/doc.go +++ b/ssh/doc.go @@ -20,4 +20,4 @@ References: This package does not fall under the stability promise of the Go language itself, so its API may be changed when pressing needs arise. */ -package ssh // import "golang.org/x/crypto/ssh" +package ssh diff --git a/ssh/test/doc.go b/ssh/test/doc.go index 198f0ca1e2..444b29966b 100644 --- a/ssh/test/doc.go +++ b/ssh/test/doc.go @@ -4,4 +4,4 @@ // Package test contains integration tests for the // golang.org/x/crypto/ssh package. -package test // import "golang.org/x/crypto/ssh/test" +package test diff --git a/ssh/testdata/doc.go b/ssh/testdata/doc.go index fcae47ca68..ae7bd8b89e 100644 --- a/ssh/testdata/doc.go +++ b/ssh/testdata/doc.go @@ -5,4 +5,4 @@ // This package contains test data shared between the various subpackages of // the golang.org/x/crypto/ssh package. Under no circumstance should // this data be used for production code. -package testdata // import "golang.org/x/crypto/ssh/testdata" +package testdata diff --git a/twofish/twofish.go b/twofish/twofish.go index e4eeae17f4..6d0a3028d3 100644 --- a/twofish/twofish.go +++ b/twofish/twofish.go @@ -9,7 +9,7 @@ // implementation. Instead, use AES (from crypto/aes, if necessary in an AEAD // mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package twofish // import "golang.org/x/crypto/twofish" +package twofish // Twofish is defined in https://www.schneier.com/paper-twofish-paper.pdf [TWOFISH] diff --git a/xtea/cipher.go b/xtea/cipher.go index a4c2fd02b3..7b4f8aaa6b 100644 --- a/xtea/cipher.go +++ b/xtea/cipher.go @@ -12,7 +12,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package xtea // import "golang.org/x/crypto/xtea" +package xtea // For details, see http://www.cix.co.uk/~klockstone/xtea.pdf diff --git a/xts/xts.go b/xts/xts.go index 8c16a83014..d64f536f9d 100644 --- a/xts/xts.go +++ b/xts/xts.go @@ -21,7 +21,7 @@ // // Note that XTS is usually not appropriate for any use besides disk encryption. // Most users should use an AEAD mode like GCM (from crypto/cipher.NewGCM) instead. -package xts // import "golang.org/x/crypto/xts" +package xts import ( "crypto/cipher" From 9fadb0b165bd3b96d2a21e89d60ad458db3aeee0 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Thu, 4 Jul 2024 19:17:23 +0000 Subject: [PATCH 59/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I0024d2803498fe98af80f14a4476a504c9aa6b5b Reviewed-on: https://go-review.googlesource.com/c/crypto/+/596835 Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Reviewed-by: Than McIntosh Auto-Submit: Gopher Robot --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index bef246f5ec..7e3b4eca74 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.21.0 - golang.org/x/term v0.21.0 + golang.org/x/sys v0.22.0 + golang.org/x/term v0.22.0 ) require golang.org/x/text v0.16.0 // indirect diff --git a/go.sum b/go.sum index b694c49e61..32e3e58b33 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= +golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= From d66d9c31b4ae80d173d1187a9e40c188788dbdbc Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 8 Jul 2024 16:01:01 +0000 Subject: [PATCH 60/95] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: I552ff9800e32294b25cc04ccc8fca3404ae3b93c Reviewed-on: https://go-review.googlesource.com/c/crypto/+/597095 Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Carlos Amedee Reviewed-by: Roland Shoemaker --- x509roots/fallback/bundle.go | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index e56011afa0..35d7c1c550 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -1578,40 +1578,6 @@ XR4EzzffHqhmsYzmIGrv/EhOdJhCrylvLmrH+33RZjEizIYAfmaDDEL0vTSSwxrq T8p+ck0LcIymSLumoRT2+1hEmRSuqguTaaApJUqlyyvdimYHFngVV3Eb7PVHhPOe MTd61X8kreS8/f3MboPoDKi3QWwH3b08hpcv0g== -----END CERTIFICATE----- -# CN=GLOBALTRUST 2020,O=e-commerce monitoring GmbH,C=AT -# 9a296a5182d1d451a2e37f439b74daafa267523329f90f9a0d2007c334e23c9a ------BEGIN CERTIFICATE----- -MIIFgjCCA2qgAwIBAgILWku9WvtPilv6ZeUwDQYJKoZIhvcNAQELBQAwTTELMAkG -A1UEBhMCQVQxIzAhBgNVBAoTGmUtY29tbWVyY2UgbW9uaXRvcmluZyBHbWJIMRkw -FwYDVQQDExBHTE9CQUxUUlVTVCAyMDIwMB4XDTIwMDIxMDAwMDAwMFoXDTQwMDYx -MDAwMDAwMFowTTELMAkGA1UEBhMCQVQxIzAhBgNVBAoTGmUtY29tbWVyY2UgbW9u -aXRvcmluZyBHbWJIMRkwFwYDVQQDExBHTE9CQUxUUlVTVCAyMDIwMIICIjANBgkq -hkiG9w0BAQEFAAOCAg8AMIICCgKCAgEAri5WrRsc7/aVj6B3GyvTY4+ETUWiD59b -RatZe1E0+eyLinjF3WuvvcTfk0Uev5E4C64OFudBc/jbu9G4UeDLgztzOG53ig9Z -YybNpyrOVPu44sB8R85gfD+yc/LAGbaKkoc1DZAoouQVBGM+uq/ufF7MpotQsjj3 -QWPKzv9pj2gOlTblzLmMCcpL3TGQlsjMH/1WljTbjhzqLL6FLmPdqqmV0/0plRPw -yJiT2S0WR5ARg6I6IqIoV6Lr/sCMKKCmfecqQjuCgGOlYx8ZzHyyZqjC0203b+J+ -BlHZRYQfEs4kUmSFC0iAToexIiIwquuuvuAC4EDosEKAA1GqtH6qRNdDYfOiaxaJ -SaSjpCuKAsR49GiKweR6NrFvG5Ybd0mN1MkGco/PU+PcF4UgStyYJ9ORJitHHmkH -r96i5OTUawuzXnzUJIBHKWk7buis/UDr2O1xcSvy6Fgd60GXIsUf1DnQJ4+H4xj0 -4KlGDfV0OoIu0G4skaMxXDtG6nsEEFZegB31pWXogvziB4xiRfUg3kZwhqG8k9Me -dKZssCz3AwyIDMvUclOGvGBG85hqwvG/Q/lwIHfKN0F5VVJjjVsSn8VoxIidrPIw -q7ejMZdnrY8XD2zHc+0klGvIg5rQmjdJBKuxFshsSUktq6HQjJLyQUp5ISXbY9e2 -nKd+Qmn7OmMCAwEAAaNjMGEwDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMC -AQYwHQYDVR0OBBYEFNwuH9FhN3nkq9XVsxJxaD1qaJwiMB8GA1UdIwQYMBaAFNwu -H9FhN3nkq9XVsxJxaD1qaJwiMA0GCSqGSIb3DQEBCwUAA4ICAQCR8EICaEDuw2jA -VC/f7GLDw56KoDEoqoOOpFaWEhCGVrqXctJUMHytGdUdaG/7FELYjQ7ztdGl4wJC -XtzoRlgHNQIw4Lx0SsFDKv/bGtCwr2zD/cuz9X9tAy5ZVp0tLTWMstZDFyySCstd -6IwPS3BD0IL/qMy/pJTAvoe9iuOTe8aPmxadJ2W8esVCgmxcB9CpwYhgROmYhRZf -+I/KARDOJcP5YBugxZfD0yyIMaK9MOzQ0MAS8cE54+X1+NZK3TTN+2/BT+MAi1bi -kvcoskJ3ciNnxz8RFbLEAwW+uxF7Cr+obuf/WEPPm2eggAe2HcqtbepBEX4tdJP7 -wry+UUTF72glJ4DjyKDUEuzZpTcdN3y0kcra1LGWge9oXHYQSa9+pTeAsRxSvTOB -TI/53WXZFM2KJVj04sWDpQmQ1GwUY7VA3+vA/MRYfg0UFodUJ25W5HCEuGwyEn6C -MUO+1918oa2u1qsgEu8KwxCMSZY13At1XrFP1U80DhEgB3VDRemjEdqso5nCtnkn -4rnvyOL2NSl6dPrFf4IFYqYK6miyeUcGbvJXqBUzxvd4Sj1Ce2t+/vdG6tHrju+I -aFvowdlxfv1k7/9nR4hYJS8+hge9+6jlgqispdNpQ80xiEmEU5LAsTkbOYMBMMTy -qfrQA71yN2BWHzZ8vTmR9W0Nv3vXkg== ------END CERTIFICATE----- # CN=GTS Root R1,O=Google Trust Services LLC,C=US # d947432abde7b7fa90fc2e6b59101b1280e0e1c7e4e40fa3c6887fff57a7f4cf -----BEGIN CERTIFICATE----- From f2bc3a617ab63149578b696bd2db66ae298b3ba1 Mon Sep 17 00:00:00 2001 From: Dmitri Shuralyov Date: Tue, 16 Jul 2024 10:13:29 -0400 Subject: [PATCH 61/95] x509roots/fallback/internal/goissue52287: delete By now Go 1.19 isn't supported, so there's no need to work around go.dev/issue/52287 in this module anymore. For golang/go#57792. For golang/go#52287. Change-Id: I3999cdb9ca419a2ab897c9143a4ec31f59da7d80 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/598495 TryBot-Result: Gopher Robot Run-TryBot: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Auto-Submit: Dmitri Shuralyov Reviewed-by: Roland Shoemaker --- x509roots/fallback/internal/goissue52287/goissue52287.go | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 x509roots/fallback/internal/goissue52287/goissue52287.go diff --git a/x509roots/fallback/internal/goissue52287/goissue52287.go b/x509roots/fallback/internal/goissue52287/goissue52287.go deleted file mode 100644 index d946a527db..0000000000 --- a/x509roots/fallback/internal/goissue52287/goissue52287.go +++ /dev/null @@ -1,8 +0,0 @@ -// Copyright 2023 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package goissue52287 is an empty internal package. -// It exists only to work around go.dev/issue/52287 and -// can be removed after Go 1.19 stops being supported. -package goissue52287 From 80fd97208db0a6f1c2dccfc63ccde57b4e994875 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 16 Jul 2024 11:35:18 -0400 Subject: [PATCH 62/95] LICENSE: update per Google Legal Very minor tweaks: - Remove (c) pseudosymbol. - Remove "All Rights Reserved." - Change "Google Inc." (no longer exists) to "Google LLC". [git-generate] echo ' ,s/\(c\) // ,s/ All rights reserved.// ,s/Google Inc./Google LLC/ w q ' | sam -d LICENSE Change-Id: I6e885650c5701597f57dbf00c2abdcc7b393a703 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/598520 LUCI-TryBot-Result: Go LUCI Reviewed-by: Ian Lance Taylor Auto-Submit: Russ Cox --- LICENSE | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index 6a66aea5ea..2a7cf70da6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. From e983fa27418787af5c51d172b508cd85bc6644d0 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Mon, 24 Jun 2024 18:56:28 -0400 Subject: [PATCH 63/95] sha3: Avo port of keccakf_amd64.s This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: BASE="d66d9c31b4ae80d173d1187a9e40c188788dbdbc" go tool asm -o /dev/null -debug \ <(git cat-file -p "$BASE":sha3/keccakf_amd64.s) \ > /tmp/reference.s go tool asm -o /dev/null -debug \ sha3/keccakf_amd64.s \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: I1c0ea516531355263b83d3b66a37df090e293cea Reviewed-on: https://go-review.googlesource.com/c/crypto/+/594655 Reviewed-by: Cherry Mui Reviewed-by: Filippo Valsorda Reviewed-by: Russell Webb Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Auto-Submit: Filippo Valsorda --- sha3/_asm/go.mod | 15 + sha3/_asm/go.sum | 12 + sha3/_asm/keccakf_amd64_asm.go | 438 +++ sha3/keccakf_amd64.s | 5787 +++++++++++++++++++++++++++++--- 4 files changed, 5873 insertions(+), 379 deletions(-) create mode 100644 sha3/_asm/go.mod create mode 100644 sha3/_asm/go.sum create mode 100644 sha3/_asm/keccakf_amd64_asm.go diff --git a/sha3/_asm/go.mod b/sha3/_asm/go.mod new file mode 100644 index 0000000000..265a88d077 --- /dev/null +++ b/sha3/_asm/go.mod @@ -0,0 +1,15 @@ +module sha3/_asm + +go 1.22 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.25.0 +) + +require ( + golang.org/x/mod v0.19.0 // indirect + golang.org/x/sync v0.7.0 // indirect + golang.org/x/sys v0.22.0 // indirect + golang.org/x/tools v0.23.0 // indirect +) diff --git a/sha3/_asm/go.sum b/sha3/_asm/go.sum new file mode 100644 index 0000000000..a2552b8eb9 --- /dev/null +++ b/sha3/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= +golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= +golang.org/x/mod v0.19.0 h1:fEdghXQSo20giMthA7cd28ZC+jts4amQ3YMXiP5oMQ8= +golang.org/x/mod v0.19.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.23.0 h1:SGsXPZ+2l4JsgaCKkx+FQ9YZ5XEtA1GZYuoDjenLjvg= +golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI= diff --git a/sha3/_asm/keccakf_amd64_asm.go b/sha3/_asm/keccakf_amd64_asm.go new file mode 100644 index 0000000000..78e931f757 --- /dev/null +++ b/sha3/_asm/keccakf_amd64_asm.go @@ -0,0 +1,438 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This code was translated into a form compatible with 6a from the public +// domain sources at https://github.com/gvanas/KeccakCodePackage + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/sha3" +) + +//go:generate go run . -out ../keccakf_amd64.s -pkg sha3 + +// Round Constants for use in the ι step. +var RoundConstants = [24]uint64{ + 0x0000000000000001, + 0x0000000000008082, + 0x800000000000808A, + 0x8000000080008000, + 0x000000000000808B, + 0x0000000080000001, + 0x8000000080008081, + 0x8000000000008009, + 0x000000000000008A, + 0x0000000000000088, + 0x0000000080008009, + 0x000000008000000A, + 0x000000008000808B, + 0x800000000000008B, + 0x8000000000008089, + 0x8000000000008003, + 0x8000000000008002, + 0x8000000000000080, + 0x000000000000800A, + 0x800000008000000A, + 0x8000000080008081, + 0x8000000000008080, + 0x0000000080000001, + 0x8000000080008008, +} + +var ( + // Temporary registers + rT1 GPPhysical = RAX + + // Round vars + rpState = Mem{Base: RDI} + rpStack = Mem{Base: RSP} + + rDa = RBX + rDe = RCX + rDi = RDX + rDo = R8 + rDu = R9 + + rBa = R10 + rBe = R11 + rBi = R12 + rBo = R13 + rBu = R14 + + rCa = RSI + rCe = RBP + rCi = rBi + rCo = rBo + rCu = R15 +) + +const ( + _ba = iota * 8 + _be + _bi + _bo + _bu + _ga + _ge + _gi + _go + _gu + _ka + _ke + _ki + _ko + _ku + _ma + _me + _mi + _mo + _mu + _sa + _se + _si + _so + _su +) + +func main() { + Package("golang.org/x/crypto/sha3") + ConstraintExpr("amd64,!purego,gc") + keccakF1600() + Generate() +} + +func MOVQ_RBI_RCE() { MOVQ(rBi, rCe) } +func XORQ_RT1_RCA() { XORQ(rT1, rCa) } +func XORQ_RT1_RCE() { XORQ(rT1, rCe) } +func XORQ_RBA_RCU() { XORQ(rBa, rCu) } +func XORQ_RBE_RCU() { XORQ(rBe, rCu) } +func XORQ_RDU_RCU() { XORQ(rDu, rCu) } +func XORQ_RDA_RCA() { XORQ(rDa, rCa) } +func XORQ_RDE_RCE() { XORQ(rDe, rCe) } + +type ArgMacro func() + +func mKeccakRound( + iState, oState Mem, + rc U64, + B_RBI_RCE, G_RT1_RCA, G_RT1_RCE, G_RBA_RCU, + K_RT1_RCA, K_RT1_RCE, K_RBA_RCU, M_RT1_RCA, + M_RT1_RCE, M_RBE_RCU, S_RDU_RCU, S_RDA_RCA, + S_RDE_RCE ArgMacro, +) { + Comment("Prepare round") + MOVQ(rCe, rDa) + ROLQ(Imm(1), rDa) + + MOVQ(iState.Offset(_bi), rCi) + XORQ(iState.Offset(_gi), rDi) + XORQ(rCu, rDa) + XORQ(iState.Offset(_ki), rCi) + XORQ(iState.Offset(_mi), rDi) + XORQ(rDi, rCi) + + MOVQ(rCi, rDe) + ROLQ(Imm(1), rDe) + + MOVQ(iState.Offset(_bo), rCo) + XORQ(iState.Offset(_go), rDo) + XORQ(rCa, rDe) + XORQ(iState.Offset(_ko), rCo) + XORQ(iState.Offset(_mo), rDo) + XORQ(rDo, rCo) + + MOVQ(rCo, rDi) + ROLQ(Imm(1), rDi) + + MOVQ(rCu, rDo) + XORQ(rCe, rDi) + ROLQ(Imm(1), rDo) + + MOVQ(rCa, rDu) + XORQ(rCi, rDo) + ROLQ(Imm(1), rDu) + + Comment("Result b") + MOVQ(iState.Offset(_ba), rBa) + MOVQ(iState.Offset(_ge), rBe) + XORQ(rCo, rDu) + MOVQ(iState.Offset(_ki), rBi) + MOVQ(iState.Offset(_mo), rBo) + MOVQ(iState.Offset(_su), rBu) + XORQ(rDe, rBe) + ROLQ(Imm(44), rBe) + XORQ(rDi, rBi) + XORQ(rDa, rBa) + ROLQ(Imm(43), rBi) + + MOVQ(rBe, rCa) + MOVQ(rc, rT1) + ORQ(rBi, rCa) + XORQ(rBa, rT1) + XORQ(rT1, rCa) + MOVQ(rCa, oState.Offset(_ba)) + + XORQ(rDu, rBu) + ROLQ(Imm(14), rBu) + MOVQ(rBa, rCu) + ANDQ(rBe, rCu) + XORQ(rBu, rCu) + MOVQ(rCu, oState.Offset(_bu)) + + XORQ(rDo, rBo) + ROLQ(Imm(21), rBo) + MOVQ(rBo, rT1) + ANDQ(rBu, rT1) + XORQ(rBi, rT1) + MOVQ(rT1, oState.Offset(_bi)) + + NOTQ(rBi) + ORQ(rBa, rBu) + ORQ(rBo, rBi) + XORQ(rBo, rBu) + XORQ(rBe, rBi) + MOVQ(rBu, oState.Offset(_bo)) + MOVQ(rBi, oState.Offset(_be)) + B_RBI_RCE() + + Comment("Result g") + MOVQ(iState.Offset(_gu), rBe) + XORQ(rDu, rBe) + MOVQ(iState.Offset(_ka), rBi) + ROLQ(Imm(20), rBe) + XORQ(rDa, rBi) + ROLQ(Imm(3), rBi) + MOVQ(iState.Offset(_bo), rBa) + MOVQ(rBe, rT1) + ORQ(rBi, rT1) + XORQ(rDo, rBa) + MOVQ(iState.Offset(_me), rBo) + MOVQ(iState.Offset(_si), rBu) + ROLQ(Imm(28), rBa) + XORQ(rBa, rT1) + MOVQ(rT1, oState.Offset(_ga)) + G_RT1_RCA() + + XORQ(rDe, rBo) + ROLQ(Imm(45), rBo) + MOVQ(rBi, rT1) + ANDQ(rBo, rT1) + XORQ(rBe, rT1) + MOVQ(rT1, oState.Offset(_ge)) + G_RT1_RCE() + + XORQ(rDi, rBu) + ROLQ(Imm(61), rBu) + MOVQ(rBu, rT1) + ORQ(rBa, rT1) + XORQ(rBo, rT1) + MOVQ(rT1, oState.Offset(_go)) + + ANDQ(rBe, rBa) + XORQ(rBu, rBa) + MOVQ(rBa, oState.Offset(_gu)) + NOTQ(rBu) + G_RBA_RCU() + + ORQ(rBu, rBo) + XORQ(rBi, rBo) + MOVQ(rBo, oState.Offset(_gi)) + + Comment("Result k") + MOVQ(iState.Offset(_be), rBa) + MOVQ(iState.Offset(_gi), rBe) + MOVQ(iState.Offset(_ko), rBi) + MOVQ(iState.Offset(_mu), rBo) + MOVQ(iState.Offset(_sa), rBu) + XORQ(rDi, rBe) + ROLQ(Imm(6), rBe) + XORQ(rDo, rBi) + ROLQ(Imm(25), rBi) + MOVQ(rBe, rT1) + ORQ(rBi, rT1) + XORQ(rDe, rBa) + ROLQ(Imm(1), rBa) + XORQ(rBa, rT1) + MOVQ(rT1, oState.Offset(_ka)) + K_RT1_RCA() + + XORQ(rDu, rBo) + ROLQ(Imm(8), rBo) + MOVQ(rBi, rT1) + ANDQ(rBo, rT1) + XORQ(rBe, rT1) + MOVQ(rT1, oState.Offset(_ke)) + K_RT1_RCE() + + XORQ(rDa, rBu) + ROLQ(Imm(18), rBu) + NOTQ(rBo) + MOVQ(rBo, rT1) + ANDQ(rBu, rT1) + XORQ(rBi, rT1) + MOVQ(rT1, oState.Offset(_ki)) + + MOVQ(rBu, rT1) + ORQ(rBa, rT1) + XORQ(rBo, rT1) + MOVQ(rT1, oState.Offset(_ko)) + + ANDQ(rBe, rBa) + XORQ(rBu, rBa) + MOVQ(rBa, oState.Offset(_ku)) + K_RBA_RCU() + + Comment("Result m") + MOVQ(iState.Offset(_ga), rBe) + XORQ(rDa, rBe) + MOVQ(iState.Offset(_ke), rBi) + ROLQ(Imm(36), rBe) + XORQ(rDe, rBi) + MOVQ(iState.Offset(_bu), rBa) + ROLQ(Imm(10), rBi) + MOVQ(rBe, rT1) + MOVQ(iState.Offset(_mi), rBo) + ANDQ(rBi, rT1) + XORQ(rDu, rBa) + MOVQ(iState.Offset(_so), rBu) + ROLQ(Imm(27), rBa) + XORQ(rBa, rT1) + MOVQ(rT1, oState.Offset(_ma)) + M_RT1_RCA() + + XORQ(rDi, rBo) + ROLQ(Imm(15), rBo) + MOVQ(rBi, rT1) + ORQ(rBo, rT1) + XORQ(rBe, rT1) + MOVQ(rT1, oState.Offset(_me)) + M_RT1_RCE() + + XORQ(rDo, rBu) + ROLQ(Imm(56), rBu) + NOTQ(rBo) + MOVQ(rBo, rT1) + ORQ(rBu, rT1) + XORQ(rBi, rT1) + MOVQ(rT1, oState.Offset(_mi)) + + ORQ(rBa, rBe) + XORQ(rBu, rBe) + MOVQ(rBe, oState.Offset(_mu)) + + ANDQ(rBa, rBu) + XORQ(rBo, rBu) + MOVQ(rBu, oState.Offset(_mo)) + M_RBE_RCU() + + Comment("Result s") + MOVQ(iState.Offset(_bi), rBa) + MOVQ(iState.Offset(_go), rBe) + MOVQ(iState.Offset(_ku), rBi) + XORQ(rDi, rBa) + MOVQ(iState.Offset(_ma), rBo) + ROLQ(Imm(62), rBa) + XORQ(rDo, rBe) + MOVQ(iState.Offset(_se), rBu) + ROLQ(Imm(55), rBe) + + XORQ(rDu, rBi) + MOVQ(rBa, rDu) + XORQ(rDe, rBu) + ROLQ(Imm(2), rBu) + ANDQ(rBe, rDu) + XORQ(rBu, rDu) + MOVQ(rDu, oState.Offset(_su)) + + ROLQ(Imm(39), rBi) + S_RDU_RCU() + NOTQ(rBe) + XORQ(rDa, rBo) + MOVQ(rBe, rDa) + ANDQ(rBi, rDa) + XORQ(rBa, rDa) + MOVQ(rDa, oState.Offset(_sa)) + S_RDA_RCA() + + ROLQ(Imm(41), rBo) + MOVQ(rBi, rDe) + ORQ(rBo, rDe) + XORQ(rBe, rDe) + MOVQ(rDe, oState.Offset(_se)) + S_RDE_RCE() + + MOVQ(rBo, rDi) + MOVQ(rBu, rDo) + ANDQ(rBu, rDi) + ORQ(rBa, rDo) + XORQ(rBi, rDi) + XORQ(rBo, rDo) + MOVQ(rDi, oState.Offset(_si)) + MOVQ(rDo, oState.Offset(_so)) +} + +// keccakF1600 applies the Keccak permutation to a 1600b-wide +// state represented as a slice of 25 uint64s. +func keccakF1600() { + Implement("keccakF1600") + AllocLocal(200) + + Load(Param("a"), rpState.Base) + + Comment("Convert the user state into an internal state") + NOTQ(rpState.Offset(_be)) + NOTQ(rpState.Offset(_bi)) + NOTQ(rpState.Offset(_go)) + NOTQ(rpState.Offset(_ki)) + NOTQ(rpState.Offset(_mi)) + NOTQ(rpState.Offset(_sa)) + + Comment("Execute the KeccakF permutation") + MOVQ(rpState.Offset(_ba), rCa) + MOVQ(rpState.Offset(_be), rCe) + MOVQ(rpState.Offset(_bu), rCu) + + XORQ(rpState.Offset(_ga), rCa) + XORQ(rpState.Offset(_ge), rCe) + XORQ(rpState.Offset(_gu), rCu) + + XORQ(rpState.Offset(_ka), rCa) + XORQ(rpState.Offset(_ke), rCe) + XORQ(rpState.Offset(_ku), rCu) + + XORQ(rpState.Offset(_ma), rCa) + XORQ(rpState.Offset(_me), rCe) + XORQ(rpState.Offset(_mu), rCu) + + XORQ(rpState.Offset(_sa), rCa) + XORQ(rpState.Offset(_se), rCe) + MOVQ(rpState.Offset(_si), rDi) + MOVQ(rpState.Offset(_so), rDo) + XORQ(rpState.Offset(_su), rCu) + + for i, rc := range RoundConstants[:len(RoundConstants)-1] { + var iState, oState Mem + if i%2 == 0 { + iState, oState = rpState, rpStack + } else { + iState, oState = rpStack, rpState + } + mKeccakRound(iState, oState, U64(rc), MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) + } + mKeccakRound(rpStack, rpState, U64(RoundConstants[len(RoundConstants)-1]), NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP) + + Comment("Revert the internal state to the user state") + NOTQ(rpState.Offset(_be)) + NOTQ(rpState.Offset(_bi)) + NOTQ(rpState.Offset(_go)) + NOTQ(rpState.Offset(_ki)) + NOTQ(rpState.Offset(_mi)) + NOTQ(rpState.Offset(_sa)) + + RET() +} diff --git a/sha3/keccakf_amd64.s b/sha3/keccakf_amd64.s index 1f53938861..99e2f16e97 100644 --- a/sha3/keccakf_amd64.s +++ b/sha3/keccakf_amd64.s @@ -1,390 +1,5419 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run keccakf_amd64_asm.go -out ../keccakf_amd64.s -pkg sha3. DO NOT EDIT. //go:build amd64 && !purego && gc -// This code was translated into a form compatible with 6a from the public -// domain sources at https://github.com/gvanas/KeccakCodePackage - -// Offsets in state -#define _ba (0*8) -#define _be (1*8) -#define _bi (2*8) -#define _bo (3*8) -#define _bu (4*8) -#define _ga (5*8) -#define _ge (6*8) -#define _gi (7*8) -#define _go (8*8) -#define _gu (9*8) -#define _ka (10*8) -#define _ke (11*8) -#define _ki (12*8) -#define _ko (13*8) -#define _ku (14*8) -#define _ma (15*8) -#define _me (16*8) -#define _mi (17*8) -#define _mo (18*8) -#define _mu (19*8) -#define _sa (20*8) -#define _se (21*8) -#define _si (22*8) -#define _so (23*8) -#define _su (24*8) - -// Temporary registers -#define rT1 AX - -// Round vars -#define rpState DI -#define rpStack SP - -#define rDa BX -#define rDe CX -#define rDi DX -#define rDo R8 -#define rDu R9 - -#define rBa R10 -#define rBe R11 -#define rBi R12 -#define rBo R13 -#define rBu R14 - -#define rCa SI -#define rCe BP -#define rCi rBi -#define rCo rBo -#define rCu R15 - -#define MOVQ_RBI_RCE MOVQ rBi, rCe -#define XORQ_RT1_RCA XORQ rT1, rCa -#define XORQ_RT1_RCE XORQ rT1, rCe -#define XORQ_RBA_RCU XORQ rBa, rCu -#define XORQ_RBE_RCU XORQ rBe, rCu -#define XORQ_RDU_RCU XORQ rDu, rCu -#define XORQ_RDA_RCA XORQ rDa, rCa -#define XORQ_RDE_RCE XORQ rDe, rCe - -#define mKeccakRound(iState, oState, rc, B_RBI_RCE, G_RT1_RCA, G_RT1_RCE, G_RBA_RCU, K_RT1_RCA, K_RT1_RCE, K_RBA_RCU, M_RT1_RCA, M_RT1_RCE, M_RBE_RCU, S_RDU_RCU, S_RDA_RCA, S_RDE_RCE) \ - /* Prepare round */ \ - MOVQ rCe, rDa; \ - ROLQ $1, rDa; \ - \ - MOVQ _bi(iState), rCi; \ - XORQ _gi(iState), rDi; \ - XORQ rCu, rDa; \ - XORQ _ki(iState), rCi; \ - XORQ _mi(iState), rDi; \ - XORQ rDi, rCi; \ - \ - MOVQ rCi, rDe; \ - ROLQ $1, rDe; \ - \ - MOVQ _bo(iState), rCo; \ - XORQ _go(iState), rDo; \ - XORQ rCa, rDe; \ - XORQ _ko(iState), rCo; \ - XORQ _mo(iState), rDo; \ - XORQ rDo, rCo; \ - \ - MOVQ rCo, rDi; \ - ROLQ $1, rDi; \ - \ - MOVQ rCu, rDo; \ - XORQ rCe, rDi; \ - ROLQ $1, rDo; \ - \ - MOVQ rCa, rDu; \ - XORQ rCi, rDo; \ - ROLQ $1, rDu; \ - \ - /* Result b */ \ - MOVQ _ba(iState), rBa; \ - MOVQ _ge(iState), rBe; \ - XORQ rCo, rDu; \ - MOVQ _ki(iState), rBi; \ - MOVQ _mo(iState), rBo; \ - MOVQ _su(iState), rBu; \ - XORQ rDe, rBe; \ - ROLQ $44, rBe; \ - XORQ rDi, rBi; \ - XORQ rDa, rBa; \ - ROLQ $43, rBi; \ - \ - MOVQ rBe, rCa; \ - MOVQ rc, rT1; \ - ORQ rBi, rCa; \ - XORQ rBa, rT1; \ - XORQ rT1, rCa; \ - MOVQ rCa, _ba(oState); \ - \ - XORQ rDu, rBu; \ - ROLQ $14, rBu; \ - MOVQ rBa, rCu; \ - ANDQ rBe, rCu; \ - XORQ rBu, rCu; \ - MOVQ rCu, _bu(oState); \ - \ - XORQ rDo, rBo; \ - ROLQ $21, rBo; \ - MOVQ rBo, rT1; \ - ANDQ rBu, rT1; \ - XORQ rBi, rT1; \ - MOVQ rT1, _bi(oState); \ - \ - NOTQ rBi; \ - ORQ rBa, rBu; \ - ORQ rBo, rBi; \ - XORQ rBo, rBu; \ - XORQ rBe, rBi; \ - MOVQ rBu, _bo(oState); \ - MOVQ rBi, _be(oState); \ - B_RBI_RCE; \ - \ - /* Result g */ \ - MOVQ _gu(iState), rBe; \ - XORQ rDu, rBe; \ - MOVQ _ka(iState), rBi; \ - ROLQ $20, rBe; \ - XORQ rDa, rBi; \ - ROLQ $3, rBi; \ - MOVQ _bo(iState), rBa; \ - MOVQ rBe, rT1; \ - ORQ rBi, rT1; \ - XORQ rDo, rBa; \ - MOVQ _me(iState), rBo; \ - MOVQ _si(iState), rBu; \ - ROLQ $28, rBa; \ - XORQ rBa, rT1; \ - MOVQ rT1, _ga(oState); \ - G_RT1_RCA; \ - \ - XORQ rDe, rBo; \ - ROLQ $45, rBo; \ - MOVQ rBi, rT1; \ - ANDQ rBo, rT1; \ - XORQ rBe, rT1; \ - MOVQ rT1, _ge(oState); \ - G_RT1_RCE; \ - \ - XORQ rDi, rBu; \ - ROLQ $61, rBu; \ - MOVQ rBu, rT1; \ - ORQ rBa, rT1; \ - XORQ rBo, rT1; \ - MOVQ rT1, _go(oState); \ - \ - ANDQ rBe, rBa; \ - XORQ rBu, rBa; \ - MOVQ rBa, _gu(oState); \ - NOTQ rBu; \ - G_RBA_RCU; \ - \ - ORQ rBu, rBo; \ - XORQ rBi, rBo; \ - MOVQ rBo, _gi(oState); \ - \ - /* Result k */ \ - MOVQ _be(iState), rBa; \ - MOVQ _gi(iState), rBe; \ - MOVQ _ko(iState), rBi; \ - MOVQ _mu(iState), rBo; \ - MOVQ _sa(iState), rBu; \ - XORQ rDi, rBe; \ - ROLQ $6, rBe; \ - XORQ rDo, rBi; \ - ROLQ $25, rBi; \ - MOVQ rBe, rT1; \ - ORQ rBi, rT1; \ - XORQ rDe, rBa; \ - ROLQ $1, rBa; \ - XORQ rBa, rT1; \ - MOVQ rT1, _ka(oState); \ - K_RT1_RCA; \ - \ - XORQ rDu, rBo; \ - ROLQ $8, rBo; \ - MOVQ rBi, rT1; \ - ANDQ rBo, rT1; \ - XORQ rBe, rT1; \ - MOVQ rT1, _ke(oState); \ - K_RT1_RCE; \ - \ - XORQ rDa, rBu; \ - ROLQ $18, rBu; \ - NOTQ rBo; \ - MOVQ rBo, rT1; \ - ANDQ rBu, rT1; \ - XORQ rBi, rT1; \ - MOVQ rT1, _ki(oState); \ - \ - MOVQ rBu, rT1; \ - ORQ rBa, rT1; \ - XORQ rBo, rT1; \ - MOVQ rT1, _ko(oState); \ - \ - ANDQ rBe, rBa; \ - XORQ rBu, rBa; \ - MOVQ rBa, _ku(oState); \ - K_RBA_RCU; \ - \ - /* Result m */ \ - MOVQ _ga(iState), rBe; \ - XORQ rDa, rBe; \ - MOVQ _ke(iState), rBi; \ - ROLQ $36, rBe; \ - XORQ rDe, rBi; \ - MOVQ _bu(iState), rBa; \ - ROLQ $10, rBi; \ - MOVQ rBe, rT1; \ - MOVQ _mi(iState), rBo; \ - ANDQ rBi, rT1; \ - XORQ rDu, rBa; \ - MOVQ _so(iState), rBu; \ - ROLQ $27, rBa; \ - XORQ rBa, rT1; \ - MOVQ rT1, _ma(oState); \ - M_RT1_RCA; \ - \ - XORQ rDi, rBo; \ - ROLQ $15, rBo; \ - MOVQ rBi, rT1; \ - ORQ rBo, rT1; \ - XORQ rBe, rT1; \ - MOVQ rT1, _me(oState); \ - M_RT1_RCE; \ - \ - XORQ rDo, rBu; \ - ROLQ $56, rBu; \ - NOTQ rBo; \ - MOVQ rBo, rT1; \ - ORQ rBu, rT1; \ - XORQ rBi, rT1; \ - MOVQ rT1, _mi(oState); \ - \ - ORQ rBa, rBe; \ - XORQ rBu, rBe; \ - MOVQ rBe, _mu(oState); \ - \ - ANDQ rBa, rBu; \ - XORQ rBo, rBu; \ - MOVQ rBu, _mo(oState); \ - M_RBE_RCU; \ - \ - /* Result s */ \ - MOVQ _bi(iState), rBa; \ - MOVQ _go(iState), rBe; \ - MOVQ _ku(iState), rBi; \ - XORQ rDi, rBa; \ - MOVQ _ma(iState), rBo; \ - ROLQ $62, rBa; \ - XORQ rDo, rBe; \ - MOVQ _se(iState), rBu; \ - ROLQ $55, rBe; \ - \ - XORQ rDu, rBi; \ - MOVQ rBa, rDu; \ - XORQ rDe, rBu; \ - ROLQ $2, rBu; \ - ANDQ rBe, rDu; \ - XORQ rBu, rDu; \ - MOVQ rDu, _su(oState); \ - \ - ROLQ $39, rBi; \ - S_RDU_RCU; \ - NOTQ rBe; \ - XORQ rDa, rBo; \ - MOVQ rBe, rDa; \ - ANDQ rBi, rDa; \ - XORQ rBa, rDa; \ - MOVQ rDa, _sa(oState); \ - S_RDA_RCA; \ - \ - ROLQ $41, rBo; \ - MOVQ rBi, rDe; \ - ORQ rBo, rDe; \ - XORQ rBe, rDe; \ - MOVQ rDe, _se(oState); \ - S_RDE_RCE; \ - \ - MOVQ rBo, rDi; \ - MOVQ rBu, rDo; \ - ANDQ rBu, rDi; \ - ORQ rBa, rDo; \ - XORQ rBi, rDi; \ - XORQ rBo, rDo; \ - MOVQ rDi, _si(oState); \ - MOVQ rDo, _so(oState) \ - // func keccakF1600(a *[25]uint64) -TEXT ·keccakF1600(SB), 0, $200-8 - MOVQ a+0(FP), rpState +TEXT ·keccakF1600(SB), $200-8 + MOVQ a+0(FP), DI // Convert the user state into an internal state - NOTQ _be(rpState) - NOTQ _bi(rpState) - NOTQ _go(rpState) - NOTQ _ki(rpState) - NOTQ _mi(rpState) - NOTQ _sa(rpState) + NOTQ 8(DI) + NOTQ 16(DI) + NOTQ 64(DI) + NOTQ 96(DI) + NOTQ 136(DI) + NOTQ 160(DI) // Execute the KeccakF permutation - MOVQ _ba(rpState), rCa - MOVQ _be(rpState), rCe - MOVQ _bu(rpState), rCu - - XORQ _ga(rpState), rCa - XORQ _ge(rpState), rCe - XORQ _gu(rpState), rCu - - XORQ _ka(rpState), rCa - XORQ _ke(rpState), rCe - XORQ _ku(rpState), rCu - - XORQ _ma(rpState), rCa - XORQ _me(rpState), rCe - XORQ _mu(rpState), rCu - - XORQ _sa(rpState), rCa - XORQ _se(rpState), rCe - MOVQ _si(rpState), rDi - MOVQ _so(rpState), rDo - XORQ _su(rpState), rCu - - mKeccakRound(rpState, rpStack, $0x0000000000000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x0000000000008082, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x800000000000808a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000080008000, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000000000808b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x0000000080000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000080008081, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000008009, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000000000008a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x0000000000000088, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x0000000080008009, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x000000008000000a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000008000808b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x800000000000008b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000000008089, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000008003, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000000008002, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000000080, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000000000800a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x800000008000000a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000080008081, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000008080, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x0000000080000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000080008008, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP) + MOVQ (DI), SI + MOVQ 8(DI), BP + MOVQ 32(DI), R15 + XORQ 40(DI), SI + XORQ 48(DI), BP + XORQ 72(DI), R15 + XORQ 80(DI), SI + XORQ 88(DI), BP + XORQ 112(DI), R15 + XORQ 120(DI), SI + XORQ 128(DI), BP + XORQ 152(DI), R15 + XORQ 160(DI), SI + XORQ 168(DI), BP + MOVQ 176(DI), DX + MOVQ 184(DI), R8 + XORQ 192(DI), R15 - // Revert the internal state to the user state - NOTQ _be(rpState) - NOTQ _bi(rpState) - NOTQ _go(rpState) - NOTQ _ki(rpState) - NOTQ _mi(rpState) - NOTQ _sa(rpState) + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000000000001, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000000008082, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x800000000000808a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000080008000, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000000000808b, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000080000001, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000080008081, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008009, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000000000008a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000000000088, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000080008009, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000008000000a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000008000808b, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x800000000000008b, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008089, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008003, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008002, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000000080, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000000000800a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x800000008000000a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000080008081, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008080, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000080000001, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000080008008, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + NOP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + NOP + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + NOP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + NOP + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + NOP + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + NOP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + NOP + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + NOP + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + NOP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + NOP + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + NOP + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + NOP + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + NOP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Revert the internal state to the user state + NOTQ 8(DI) + NOTQ 16(DI) + NOTQ 64(DI) + NOTQ 96(DI) + NOTQ 136(DI) + NOTQ 160(DI) RET From 68797222744d17ebda05804c6a5912bd129b8112 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Tue, 16 Jul 2024 14:05:34 -0400 Subject: [PATCH 64/95] ssh: remove go 1.21+ dependency on slices Fixes golang/go#68469 Change-Id: Ieea3c444b0458d169a6ff224e59b3b815264de89 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/598775 LUCI-TryBot-Result: Go LUCI Reviewed-by: Nicola Murino Auto-Submit: Nicola Murino Reviewed-by: Cherry Mui Reviewed-by: Carlos Amedee --- ssh/server_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ssh/server_test.go b/ssh/server_test.go index 9057a9b5f0..b6d8ab3333 100644 --- a/ssh/server_test.go +++ b/ssh/server_test.go @@ -9,7 +9,7 @@ import ( "fmt" "io" "net" - "slices" + "reflect" "strings" "sync/atomic" "testing" @@ -294,7 +294,7 @@ func TestBannerError(t *testing.T) { "banner from PublicKeyCallback", "banner from KeyboardInteractiveCallback", } - if !slices.Equal(banners, wantBanners) { + if !reflect.DeepEqual(banners, wantBanners) { t.Errorf("got banners:\n%q\nwant banners:\n%q", banners, wantBanners) } } From bb80217080b0e04c6e73e5dcd9f3a9bb11fe23f6 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sun, 21 Jul 2024 11:43:44 +0200 Subject: [PATCH 65/95] ssh: don't use dsa keys in integration tests DSA has been disabled by default since OpenSSH 9.8, so tests fail with newer versions of OpenSSH Change-Id: I57b9abde8845cd05116a637a21cbbb8af740b2e0 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/599955 Reviewed-by: Dmitri Shuralyov Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker --- ssh/agent/client_test.go | 10 +++++----- ssh/test/agent_unix_test.go | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ssh/agent/client_test.go b/ssh/agent/client_test.go index ae03df1a63..f0ffd59592 100644 --- a/ssh/agent/client_test.go +++ b/ssh/agent/client_test.go @@ -165,9 +165,9 @@ func testAgentInterface(t *testing.T, agent ExtendedAgent, key interface{}, cert sig, err := agent.Sign(pubKey, data) if err != nil { t.Logf("sign failed with key type %q", pubKey.Type()) - // In integration tests ssh-dss and ssh-rsa (SHA1 signatures) may be - // disabled for security reasons, we check SHA-2 variants later. - if pubKey.Type() != ssh.KeyAlgoDSA && pubKey.Type() != ssh.KeyAlgoRSA && pubKey.Type() != ssh.CertAlgoRSAv01 { + // In integration tests ssh-rsa (SHA1 signatures) may be disabled for + // security reasons, we check SHA-2 variants later. + if pubKey.Type() != ssh.KeyAlgoRSA && pubKey.Type() != ssh.CertAlgoRSAv01 { t.Fatalf("Sign(%s): %v", pubKey.Type(), err) } } else { @@ -251,7 +251,7 @@ func TestMalformedRequests(t *testing.T) { } func TestAgent(t *testing.T) { - for _, keyType := range []string{"rsa", "dsa", "ecdsa", "ed25519"} { + for _, keyType := range []string{"rsa", "ecdsa", "ed25519"} { testOpenSSHAgent(t, testPrivateKeys[keyType], nil, 0) testKeyringAgent(t, testPrivateKeys[keyType], nil, 0) } @@ -409,7 +409,7 @@ func testLockAgent(agent Agent, t *testing.T) { if err := agent.Add(AddedKey{PrivateKey: testPrivateKeys["rsa"], Comment: "comment 1"}); err != nil { t.Errorf("Add: %v", err) } - if err := agent.Add(AddedKey{PrivateKey: testPrivateKeys["dsa"], Comment: "comment dsa"}); err != nil { + if err := agent.Add(AddedKey{PrivateKey: testPrivateKeys["ecdsa"], Comment: "comment ecdsa"}); err != nil { t.Errorf("Add: %v", err) } if keys, err := agent.List(); err != nil { diff --git a/ssh/test/agent_unix_test.go b/ssh/test/agent_unix_test.go index a9c4893f7d..9257bfe1bc 100644 --- a/ssh/test/agent_unix_test.go +++ b/ssh/test/agent_unix_test.go @@ -20,17 +20,17 @@ func TestAgentForward(t *testing.T) { defer conn.Close() keyring := agent.NewKeyring() - if err := keyring.Add(agent.AddedKey{PrivateKey: testPrivateKeys["dsa"]}); err != nil { + if err := keyring.Add(agent.AddedKey{PrivateKey: testPrivateKeys["ecdsa"]}); err != nil { t.Fatalf("Error adding key: %s", err) } if err := keyring.Add(agent.AddedKey{ - PrivateKey: testPrivateKeys["dsa"], + PrivateKey: testPrivateKeys["ecdsa"], ConfirmBeforeUse: true, LifetimeSecs: 3600, }); err != nil { t.Fatalf("Error adding key with constraints: %s", err) } - pub := testPublicKeys["dsa"] + pub := testPublicKeys["ecdsa"] sess, err := conn.NewSession() if err != nil { From 3375612bf41a8cdb0cdfdc21e6ca2c7ae0f764b5 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Tue, 16 Jul 2024 15:48:14 -0400 Subject: [PATCH 66/95] ssh: add support for unpadded RSA signatures The original SSH RFC 4253 explicitly disallows padding. This applies to ssh-rsa signatures. The updated SSH RFC 8332 which defines the SHA2 RSA signature variants explicitly calls out the existence of signers who produce short signatures and specifies that verifiers may allow this behavior. In practice, PuTTY 0.81 and prior versions, as well as SSH.NET prior to 2024.1.0 always generated short signatures. Furthermore, PuTTY is embedded in other software like WinSCP and FileZilla, which are updated on their own schedules as well. This leads to occasional unexplained login errors, when using RSA keys. OpenSSH server allows these short signatures for all RSA algorithms. Fixes golang/go#68286 Change-Id: Ia60ece21bf9c111c490fac0c066443ed5ff7dd29 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/598534 Reviewed-by: Nicola Murino Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI --- ssh/keys.go | 44 +++++++++++++++++++++++++++++++++++++++++++- ssh/keys_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/ssh/keys.go b/ssh/keys.go index 7967665f17..98e6706d5d 100644 --- a/ssh/keys.go +++ b/ssh/keys.go @@ -488,7 +488,49 @@ func (r *rsaPublicKey) Verify(data []byte, sig *Signature) error { h := hash.New() h.Write(data) digest := h.Sum(nil) - return rsa.VerifyPKCS1v15((*rsa.PublicKey)(r), hash, digest, sig.Blob) + + // Signatures in PKCS1v15 must match the key's modulus in + // length. However with SSH, some signers provide RSA + // signatures which are missing the MSB 0's of the bignum + // represented. With ssh-rsa signatures, this is encouraged by + // the spec (even though e.g. OpenSSH will give the full + // length unconditionally). With rsa-sha2-* signatures, the + // verifier is allowed to support these, even though they are + // out of spec. See RFC 4253 Section 6.6 for ssh-rsa and RFC + // 8332 Section 3 for rsa-sha2-* details. + // + // In practice: + // * OpenSSH always allows "short" signatures: + // https://github.com/openssh/openssh-portable/blob/V_9_8_P1/ssh-rsa.c#L526 + // but always generates padded signatures: + // https://github.com/openssh/openssh-portable/blob/V_9_8_P1/ssh-rsa.c#L439 + // + // * PuTTY versions 0.81 and earlier will generate short + // signatures for all RSA signature variants. Note that + // PuTTY is embedded in other software, such as WinSCP and + // FileZilla. At the time of writing, a patch has been + // applied to PuTTY to generate padded signatures for + // rsa-sha2-*, but not yet released: + // https://git.tartarus.org/?p=simon/putty.git;a=commitdiff;h=a5bcf3d384e1bf15a51a6923c3724cbbee022d8e + // + // * SSH.NET versions 2024.0.0 and earlier will generate short + // signatures for all RSA signature variants, fixed in 2024.1.0: + // https://github.com/sshnet/SSH.NET/releases/tag/2024.1.0 + // + // As a result, we pad these up to the key size by inserting + // leading 0's. + // + // Note that support for short signatures with rsa-sha2-* may + // be removed in the future due to such signatures not being + // allowed by the spec. + blob := sig.Blob + keySize := (*rsa.PublicKey)(r).Size() + if len(blob) < keySize { + padded := make([]byte, keySize) + copy(padded[keySize-len(blob):], blob) + blob = padded + } + return rsa.VerifyPKCS1v15((*rsa.PublicKey)(r), hash, digest, blob) } func (r *rsaPublicKey) CryptoPublicKey() crypto.PublicKey { diff --git a/ssh/keys_test.go b/ssh/keys_test.go index 7b14429e17..7d5b86ff0d 100644 --- a/ssh/keys_test.go +++ b/ssh/keys_test.go @@ -154,6 +154,44 @@ func TestKeySignWithAlgorithmVerify(t *testing.T) { } } +func TestKeySignWithShortSignature(t *testing.T) { + signer := testSigners["rsa"].(AlgorithmSigner) + pub := signer.PublicKey() + // Note: data obtained by empirically trying until a result + // starting with 0 appeared + tests := []struct { + algorithm string + data []byte + }{ + { + algorithm: KeyAlgoRSA, + data: []byte("sign me92"), + }, + { + algorithm: KeyAlgoRSASHA256, + data: []byte("sign me294"), + }, + { + algorithm: KeyAlgoRSASHA512, + data: []byte("sign me60"), + }, + } + + for _, tt := range tests { + sig, err := signer.SignWithAlgorithm(rand.Reader, tt.data, tt.algorithm) + if err != nil { + t.Fatalf("Sign(%T): %v", signer, err) + } + if sig.Blob[0] != 0 { + t.Errorf("%s: Expected signature with a leading 0", tt.algorithm) + } + sig.Blob = sig.Blob[1:] + if err := pub.Verify(tt.data, sig); err != nil { + t.Errorf("publicKey.Verify(%s): %v", tt.algorithm, err) + } + } +} + func TestParseRSAPrivateKey(t *testing.T) { key := testPrivateKeys["rsa"] From 5bcd010f1cdaf2257509bfb7b43eaad62b7928fd Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Tue, 6 Aug 2024 15:29:05 +0000 Subject: [PATCH 67/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: Iae75e5fcbcfe3709820dd66638a763f662f8d939 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/603396 Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: David Chase --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index 7e3b4eca74..e206587663 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.22.0 - golang.org/x/term v0.22.0 + golang.org/x/sys v0.23.0 + golang.org/x/term v0.23.0 ) -require golang.org/x/text v0.16.0 // indirect +require golang.org/x/text v0.17.0 // indirect diff --git a/go.sum b/go.sum index 32e3e58b33..c7231381e6 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= -golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= -golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= +golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= +golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= +golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= +golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= From b2d3a6a4b4d36521cd7f653879cf6981e7c5c340 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sun, 4 Aug 2024 16:01:34 +0200 Subject: [PATCH 68/95] ssh/agent: ensure to not add duplicated keys When adding a new key, if we already have a Signer with the same public key, we now replace it with the new one instead of duplicating it. Before this change we had this: $ ssh-add -l 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA) 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA-CERT) $ ssh-add /home/nicola/ssh_certs/id_rsa Identity added: /home/nicola/ssh_certs/id_rsa (nicola@p1) Certificate added: /home/nicola/ssh_certs/id_rsa-cert.pub (myid) $ ssh-add -l 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA) 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA-CERT) 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA) 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA-CERT) Change-Id: Iad1b1a6dc94f68f53f05d7d1172f0017839976fc Reviewed-on: https://go-review.googlesource.com/c/crypto/+/602955 Reviewed-by: Filippo Valsorda Auto-Submit: Nicola Murino Reviewed-by: David Chase Reviewed-by: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- ssh/agent/keyring.go | 9 ++++++++ ssh/agent/keyring_test.go | 46 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/ssh/agent/keyring.go b/ssh/agent/keyring.go index 21bfa870fa..c1b4361087 100644 --- a/ssh/agent/keyring.go +++ b/ssh/agent/keyring.go @@ -175,6 +175,15 @@ func (r *keyring) Add(key AddedKey) error { p.expire = &t } + // If we already have a Signer with the same public key, replace it with the + // new one. + for idx, k := range r.keys { + if bytes.Equal(k.signer.PublicKey().Marshal(), p.signer.PublicKey().Marshal()) { + r.keys[idx] = p + return nil + } + } + r.keys = append(r.keys, p) return nil diff --git a/ssh/agent/keyring_test.go b/ssh/agent/keyring_test.go index e5d50e7e0d..e9c90a3131 100644 --- a/ssh/agent/keyring_test.go +++ b/ssh/agent/keyring_test.go @@ -29,6 +29,10 @@ func validateListedKeys(t *testing.T, a Agent, expectedKeys []string) { t.Fatalf("failed to list keys: %v", err) return } + if len(listedKeys) != len(expectedKeys) { + t.Fatalf("expeted %d key, got %d", len(expectedKeys), len(listedKeys)) + return + } actualKeys := make(map[string]bool) for _, key := range listedKeys { actualKeys[key.Comment] = true @@ -74,3 +78,45 @@ func TestKeyringAddingAndRemoving(t *testing.T) { } validateListedKeys(t, k, []string{}) } + +func TestAddDuplicateKey(t *testing.T) { + keyNames := []string{"rsa", "user"} + + k := NewKeyring() + for _, keyName := range keyNames { + addTestKey(t, k, keyName) + } + validateListedKeys(t, k, keyNames) + // Add the keys again. + for _, keyName := range keyNames { + addTestKey(t, k, keyName) + } + validateListedKeys(t, k, keyNames) + // Add an existing key with an updated comment. + keyName := keyNames[0] + addedKey := AddedKey{ + PrivateKey: testPrivateKeys[keyName], + Comment: "comment updated", + } + err := k.Add(addedKey) + if err != nil { + t.Fatalf("failed to add key %q: %v", keyName, err) + } + // Check the that key is found and the comment was updated. + keys, err := k.List() + if err != nil { + t.Fatalf("failed to list keys: %v", err) + } + if len(keys) != len(keyNames) { + t.Fatalf("expected %d keys, got %d", len(keyNames), len(keys)) + } + isFound := false + for _, key := range keys { + if key.Comment == addedKey.Comment { + isFound = true + } + } + if !isFound { + t.Fatal("key with the updated comment not found") + } +} From bf5f14f5457a10932761a9abe55eb6e2d05e092a Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 2 Sep 2024 16:01:11 +0000 Subject: [PATCH 69/95] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: I95cf0b3e86f1e013d486a0bbd050a8b4bea5d6e9 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/610060 Reviewed-by: Roland Shoemaker Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- x509roots/fallback/bundle.go | 107 +++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index 35d7c1c550..80ce10fe3c 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -2794,6 +2794,79 @@ I50mD1hp/Ed+stCNi5O/KU9DaXR2Z0vPB4zmAve14bRDtUstFJ/53CYNv6ZHdAbY iNE6KTCEztI5gGIbqMdXSbxqVVFnFUq+NQfk1XWYN3kwFNspnWzFacxHVaIw98xc f8LDmBxrThaA63p4ZUWiABqvDA1VZDRIuJK58bRQKfJPIx/abKwfROHdI3hRW8cW -----END CERTIFICATE----- +# CN=SecureSign Root CA12,O=Cybertrust Japan Co.\, Ltd.,C=JP +# 3f034bb5704d44b2d08545a02057de93ebf3905fce721acbc730c06ddaee904e +-----BEGIN CERTIFICATE----- +MIIDcjCCAlqgAwIBAgIUZvnHwa/swlG07VOX5uaCwysckBYwDQYJKoZIhvcNAQEL +BQAwUTELMAkGA1UEBhMCSlAxIzAhBgNVBAoTGkN5YmVydHJ1c3QgSmFwYW4gQ28u +LCBMdGQuMR0wGwYDVQQDExRTZWN1cmVTaWduIFJvb3QgQ0ExMjAeFw0yMDA0MDgw +NTM2NDZaFw00MDA0MDgwNTM2NDZaMFExCzAJBgNVBAYTAkpQMSMwIQYDVQQKExpD +eWJlcnRydXN0IEphcGFuIENvLiwgTHRkLjEdMBsGA1UEAxMUU2VjdXJlU2lnbiBS +b290IENBMTIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC6OcE3emhF +KxS06+QT61d1I02PJC0W6K6OyX2kVzsqdiUzg2zqMoqUm048luT9Ub+ZyZN+v/mt +p7JIKwccJ/VMvHASd6SFVLX9kHrko+RRWAPNEHl57muTH2SOa2SroxPjcf59q5zd +J1M3s6oYwlkm7Fsf0uZlfO+TvdhYXAvA42VvPMfKWeP+bl+sg779XSVOKik71gur +FzJ4pOE+lEa+Ym6b3kaosRbnhW70CEBFEaCeVESE99g2zvVQR9wsMJvuwPWW0v4J +hscGWa5Pro4RmHvzC1KqYiaqId+OJTN5lxZJjfU+1UefNzFJM3IFTQy2VYzxV4+K +h9GtxRESOaCtAgMBAAGjQjBAMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQD +AgEGMB0GA1UdDgQWBBRXNPN0zwRL1SXm8UC2LEzZLemgrTANBgkqhkiG9w0BAQsF +AAOCAQEAPrvbFxbS8hQBICw4g0utvsqFepq2m2um4fylOqyttCg6r9cBg0krY6Ld +mmQOmFxv3Y67ilQiLUoT865AQ9tPkbeGGuwAtEGBpE/6aouIs3YIcipJQMPTw4WJ +mBClnW8Zt7vPemVV2zfrPIpyMpcemik+rY3moxtt9XUa5rBouVui7mlHJzWhhpmA +8zNL4WukJsPvdFlseqJkth5Ew1DgDzk9qTPxpfPSvWKErI4cqc1avTc7bgoitPQV +55FYxTpE05Uo2cBl6XLK0A+9H7MV2anjpEcJnuDLN/v9vZfVvhgaaaI5gdka9at/ +yOPiZwud9AzqVN/Ssq+xIvEg37xEHA== +-----END CERTIFICATE----- +# CN=SecureSign Root CA14,O=Cybertrust Japan Co.\, Ltd.,C=JP +# 4b009c1034494f9ab56bba3ba1d62731fc4d20d8955adcec10a925607261e338 +-----BEGIN CERTIFICATE----- +MIIFcjCCA1qgAwIBAgIUZNtaDCBO6Ncpd8hQJ6JaJ90t8sswDQYJKoZIhvcNAQEM +BQAwUTELMAkGA1UEBhMCSlAxIzAhBgNVBAoTGkN5YmVydHJ1c3QgSmFwYW4gQ28u +LCBMdGQuMR0wGwYDVQQDExRTZWN1cmVTaWduIFJvb3QgQ0ExNDAeFw0yMDA0MDgw +NzA2MTlaFw00NTA0MDgwNzA2MTlaMFExCzAJBgNVBAYTAkpQMSMwIQYDVQQKExpD +eWJlcnRydXN0IEphcGFuIENvLiwgTHRkLjEdMBsGA1UEAxMUU2VjdXJlU2lnbiBS +b290IENBMTQwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDF0nqh1oq/ +FjHQmNE6lPxauG4iwWL3pwon71D2LrGeaBLwbCRjOfHw3xDG3rdSINVSW0KZnvOg +vlIfX8xnbacuUKLBl422+JX1sLrcneC+y9/3OPJH9aaakpUqYllQC6KxNedlsmGy +6pJxaeQp8E+BgQQ8sqVb1MWoWWd7VRxJq3qdwudzTe/NCcLEVxLbAQ4jeQkHO6Lo +/IrPj8BGJJw4J+CDnRugv3gVEOuGTgpa/d/aLIJ+7sr2KeH6caH3iGicnPCNvg9J +kdjqOvn90Ghx2+m1K06Ckm9mH+Dw3EzsytHqunQG+bOEkJTRX45zGRBdAuVwpcAQ +0BB8b8VYSbSwbprafZX1zNoCr7gsfXmPvkPx+SgojQlD+Ajda8iLLCSxjVIHvXib +y8posqTdDEx5YMaZ0ZPxMBoH064iwurO8YQJzOAUbn8/ftKChazcqRZOhaBgy/ac +18izju3Gm5h1DVXoX+WViwKkrkMpKBGk5hIwAUt1ax5mnXkvpXYvHUC0bcl9eQjs +0Wq2XSqypWa9a4X0dFbD9ed1Uigspf9mR6XU/v6eVL9lfgHWMI+lNpyiUBzuOIAB +SMbHdPTGrMNASRZhdCyvjG817XsYAFs2PJxQDcqSMxDxJklt33UkN4Ii1+iW/RVL +ApY+B3KVfqs9TC7XyvDf4Fg/LS8EmjijAQIDAQABo0IwQDAPBgNVHRMBAf8EBTAD +AQH/MA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUBpOjCl4oaTeqYR3r6/wtbyPk +86AwDQYJKoZIhvcNAQEMBQADggIBAJaAcgkGfpzMkwQWu6A6jZJOtxEaCnFxEM0E +rX+lRVAQZk5KQaID2RFPeje5S+LGjzJmdSX7684/AykmjbgWHfYfM25I5uj4V7Ib +ed87hwriZLoAymzvftAj63iP/2SbNDefNWWipAA9EiOWWF3KY4fGoweITedpdopT +zfFP7ELyk+OZpDc8h7hi2/DsHzc/N19DzFGdtfCXwreFamgLRB7lUe6TzktuhsHS +DCRZNhqfLJGP4xjblJUK7ZGqDpncllPjYYPGFrojutzdfhrGe0K22VoF3Jpf1d+4 +2kd92jjbrDnVHmtsKheMYc2xbXIBw8MgAGJoFjHVdqqGuw6qnsb58Nn4DSEC5MUo +FlkRudlpcyqSeLiSV5sI8jrlL5WwWLdrIBRtFO8KvH7YVdiI2i/6GaX7i+B/OfVy +K4XELKzvGUWSTLNhB9xNH27SgRNcmvMSZ4PPmz+Ln52kuaiWA3rF7iDeM9ovnhp6 +dB7h7sxaOgTdsxoEqBRjrLdHEoOabPXm6RUVkRqEGQ6UROcSjiVbgGcZ3GOTEAtl +Lor6CZpO2oYofaphNdgOpygau1LgePhsumywbrmHXumZNTfxPWQrqaA0k89jL9WB +365jJ6UeTo3cKXhZ+PmhIIynJkBugnLNeLLIjzwec+fBH7/PzqUqm9tEZDKgu39c +JRNItX+S +-----END CERTIFICATE----- +# CN=SecureSign Root CA15,O=Cybertrust Japan Co.\, Ltd.,C=JP +# e778f0f095fe843729cd1a0082179e5314a9c291442805e1fb1d8fb6b8886c3a +-----BEGIN CERTIFICATE----- +MIICIzCCAamgAwIBAgIUFhXHw9hJp75pDIqI7fBw+d23PocwCgYIKoZIzj0EAwMw +UTELMAkGA1UEBhMCSlAxIzAhBgNVBAoTGkN5YmVydHJ1c3QgSmFwYW4gQ28uLCBM +dGQuMR0wGwYDVQQDExRTZWN1cmVTaWduIFJvb3QgQ0ExNTAeFw0yMDA0MDgwODMy +NTZaFw00NTA0MDgwODMyNTZaMFExCzAJBgNVBAYTAkpQMSMwIQYDVQQKExpDeWJl +cnRydXN0IEphcGFuIENvLiwgTHRkLjEdMBsGA1UEAxMUU2VjdXJlU2lnbiBSb290 +IENBMTUwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAAQLUHSNZDKZmbPSYAi4Io5GdCx4 +wCtELW1fHcmuS1Iggz24FG1Th2CeX2yF2wYUleDHKP+dX+Sq8bOLbe1PL0vJSpSR +ZHX+AezB2Ot6lHhWGENfa4HL9rzatAy2KZMIaY+jQjBAMA8GA1UdEwEB/wQFMAMB +Af8wDgYDVR0PAQH/BAQDAgEGMB0GA1UdDgQWBBTrQciu/NWeUUj1vYv0hyCTQSvT +9DAKBggqhkjOPQQDAwNoADBlAjEA2S6Jfl5OpBEHvVnCB96rMjhTKkZEBhd6zlHp +4P9mLQlO4E/0BdGF9jVg3PVys0Z9AjBEmEYagoUeYWmJSwdLZrWeqrqgHkHZAXQ6 +bkU6iYAZezKYVWOr62Nuk22rGwlgMU4= +-----END CERTIFICATE----- # CN=SecureSign RootCA11,O=Japan Certification Services\, Inc.,C=JP # bf0feefb9e3a581ad5f9e9db7589985743d261085c4d314f6f5d7259aa421612 -----BEGIN CERTIFICATE----- @@ -3062,6 +3135,40 @@ WL6ukK2YJ5f+AbGwUgC4TeQbIXQbfsDuXmkqJa9c1h3a0nnJ85cp4IaH3gRZD/FZ e9eiPZaGzPImNC1qkp2aGtAw4l1OBLBfiyB+d8E9lYLRRpo7PHi4b6HQDWSieB4p TpPDpFQUWw== -----END CERTIFICATE----- +# CN=TWCA CYBER Root CA,OU=Root CA,O=TAIWAN-CA,C=TW +# 3f63bb2814be174ec8b6439cf08d6d56f0b7c405883a5648a334424d6b3ec558 +-----BEGIN CERTIFICATE----- +MIIFjTCCA3WgAwIBAgIQQAE0jMIAAAAAAAAAATzyxjANBgkqhkiG9w0BAQwFADBQ +MQswCQYDVQQGEwJUVzESMBAGA1UEChMJVEFJV0FOLUNBMRAwDgYDVQQLEwdSb290 +IENBMRswGQYDVQQDExJUV0NBIENZQkVSIFJvb3QgQ0EwHhcNMjIxMTIyMDY1NDI5 +WhcNNDcxMTIyMTU1OTU5WjBQMQswCQYDVQQGEwJUVzESMBAGA1UEChMJVEFJV0FO +LUNBMRAwDgYDVQQLEwdSb290IENBMRswGQYDVQQDExJUV0NBIENZQkVSIFJvb3Qg +Q0EwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDG+Moe2Qkgfh1sTs6P +40czRJzHyWmqOlt47nDSkvgEs1JSHWdyKKHfi12VCv7qze33Kc7wb3+szT3vsxxF +avcokPFhV8UMxKNQXd7UtcsZyoC5dc4pztKFIuwCY8xEMCDa6pFbVuYdHNWdZsc/ +34bKS1PE2Y2yHer43CdTo0fhYcx9tbD47nORxc5zb87uEB8aBs/pJ2DFTxnk684i +JkXXYJndzk834H/nY62wuFm40AZoNWDTNq5xQwTxaWV4fPMf88oon1oglWa0zbfu +j3ikRRjpJi+NmykosaS3Om251Bw4ckVYsV7r8Cibt4LK/c/WMw+f+5eesRycnupf +Xtuq3VTpMCEobY5583WSjCb+3MX2w7DfRFlDo7YDKPYIMKoNM+HvnKkHIuNZW0CP +2oi3aQiotyMuRAlZN1vH4xfyIutuOVLF3lSnmMlLIJXcRolftBL5hSmO68gnFSDA +S9TMfAxsNAwmmyYxpjyn9tnQS6Jk/zuZQXLB4HCX8SS7K8R0IrGsayIyJNN4KsDA +oS/xUgXJP+92ZuJF2A09rZXIx4kmyA+upwMu+8Ff+iDhcK2wZSA3M2Cw1a/XDBzC +kHDXShi8fgGwsOsVHkQGzaRP6AzRwyAQ4VRlnrZR0Bp2a0JaWHY06rc3Ga4udfmW +5cFZ95RXKSWNOkyrTZpB0F8mAwIDAQABo2MwYTAOBgNVHQ8BAf8EBAMCAQYwDwYD +VR0TAQH/BAUwAwEB/zAfBgNVHSMEGDAWgBSdhWEUfMFib5do5E83QOGt4A1WNzAd +BgNVHQ4EFgQUnYVhFHzBYm+XaORPN0DhreANVjcwDQYJKoZIhvcNAQEMBQADggIB +AGSPesRiDrWIzLjHhg6hShbNcAu3p4ULs3a2D6f/CIsLJc+o1IN1KriWiLb73y0t +tGlTITVX1olNc79pj3CjYcya2x6a4CD4bLubIp1dhDGaLIrdaqHXKGnK/nZVekZn +68xDiBaiA9a5F/gZbG0jAn/xX9AKKSM70aoK7akXJlQKTcKlTfjF/biBzysseKNn +TKkHmvPfXvt89YnNdJdhEGoHK4Fa0o635yDRIG4kqIQnoVesqlVYL9zZyvpoBJ7t +RCT5dEA7IzOrg1oYJkK2bVS1FmAwbLGg+LhBoF1JSdJlBTrq/p1hvIbZv97Tujqx +f36SNI7JAG7cmL3c7IAFrQI932XtCwP39xaEBDG6k5TY8hL4iuO/Qq+n1M0RFxbI +Qh0UqEL20kCGoE8jypZFVmAGzbdVAaYBlGX+bgUJurSkquLvWL69J1bY73NxW0Qz +8ppy6rBePm6pUlvscG21h483XjyMnM7k8M4MZ0HMzvaAq07MTFb1wWFZk7Q+ptq4 +NxKfKjLji7gh7MMrZQzvIt6IKTtM1/r+t+FHvpw+PoP7UV31aPcuIYXcv/Fa4nzX +xeSDwWrruoBa3lwtcHb4yOWHh8qgnaHlIhInD0Q9HWzq1MKLL295q39QpsQZp6F6 +t5b5wR9iWqJDB0BeJsas7a5wFsWqynKKTbDPAYsDP27X +-----END CERTIFICATE----- # CN=TWCA Global Root CA,OU=Root CA,O=TAIWAN-CA,C=TW # 59769007f7685d0fcd50872f9f95d5755a5b2b457d81f3692b610a98672f0e1b -----BEGIN CERTIFICATE----- From 38a0b5da75b2e8dab0b735296f86d9605e5f92cf Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Mon, 22 Jul 2024 16:52:53 -0400 Subject: [PATCH 70/95] argon2: Avo port of blamka_amd64.s This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="argon2/blamka_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: I3567eb80ef80dff248225f17470122c0a4e6951e Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600315 Reviewed-by: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Roland Shoemaker --- argon2/_asm/blamka_amd64.go | 287 ++++ argon2/_asm/go.mod | 15 + argon2/_asm/go.sum | 12 + argon2/blamka_amd64.s | 2972 ++++++++++++++++++++++++++++++++--- 4 files changed, 3074 insertions(+), 212 deletions(-) create mode 100644 argon2/_asm/blamka_amd64.go create mode 100644 argon2/_asm/go.mod create mode 100644 argon2/_asm/go.sum diff --git a/argon2/_asm/blamka_amd64.go b/argon2/_asm/blamka_amd64.go new file mode 100644 index 0000000000..17a1e7629a --- /dev/null +++ b/argon2/_asm/blamka_amd64.go @@ -0,0 +1,287 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/argon2" +) + +//go:generate go run . -out ../blamka_amd64.s -pkg argon2 + +func main() { + Package("golang.org/x/crypto/argon2") + ConstraintExpr("amd64,gc,!purego") + + blamkaSSE4() + mixBlocksSSE2() + xorBlocksSSE2() + Generate() +} + +func blamkaSSE4() { + Implement("blamkaSSE4") + Attributes(NOSPLIT) + AllocLocal(0) + + Load(Param("b"), RAX) + + c40 := c40_DATA() + c48 := c48_DATA() + + MOVOU(c40, X10) + MOVOU(c48, X11) + + BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11) + + BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11) + RET() +} + +func mixBlocksSSE2() { + Implement("mixBlocksSSE2") + Attributes(NOSPLIT) + AllocLocal(0) + + Load(Param("out"), RDX) + Load(Param("a"), RAX) + Load(Param("b"), RBX) + Load(Param("c"), RCX) + MOVQ(U32(128), RDI) + + Label("loop") + MOVOU(Mem{Base: AX}.Offset(0), X0) + MOVOU(Mem{Base: BX}.Offset(0), X1) + MOVOU(Mem{Base: CX}.Offset(0), X2) + PXOR(X1, X0) + PXOR(X2, X0) + MOVOU(X0, Mem{Base: DX}.Offset(0)) + ADDQ(Imm(16), RAX) + ADDQ(Imm(16), RBX) + ADDQ(Imm(16), RCX) + ADDQ(Imm(16), RDX) + SUBQ(Imm(2), RDI) + JA(LabelRef("loop")) + RET() +} + +func xorBlocksSSE2() { + Implement("xorBlocksSSE2") + Attributes(NOSPLIT) + AllocLocal(0) + + Load(Param("out"), RDX) + Load(Param("a"), RAX) + Load(Param("b"), RBX) + Load(Param("c"), RCX) + MOVQ(U32(128), RDI) + + Label("loop") + MOVOU(Mem{Base: AX}.Offset(0), X0) + MOVOU(Mem{Base: BX}.Offset(0), X1) + MOVOU(Mem{Base: CX}.Offset(0), X2) + MOVOU(Mem{Base: DX}.Offset(0), X3) + PXOR(X1, X0) + PXOR(X2, X0) + PXOR(X3, X0) + MOVOU(X0, Mem{Base: DX}.Offset(0)) + ADDQ(Imm(16), RAX) + ADDQ(Imm(16), RBX) + ADDQ(Imm(16), RCX) + ADDQ(Imm(16), RDX) + SUBQ(Imm(2), RDI) + JA(LabelRef("loop")) + RET() +} + +func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v6, t1) + PUNPCKLQDQ(v6, t2) + PUNPCKHQDQ(v7, v6) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(v7, t2) + MOVO(t1, v7) + MOVO(v2, t1) + PUNPCKHQDQ(t2, v7) + PUNPCKLQDQ(v3, t2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v3) +} + +func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v2, t1) + PUNPCKLQDQ(v2, t2) + PUNPCKHQDQ(v3, v2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(v3, t2) + MOVO(t1, v3) + MOVO(v6, t1) + PUNPCKHQDQ(t2, v3) + PUNPCKLQDQ(v7, t2) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v7) +} + +func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48 VecPhysical) { + MOVO(v0, t0) + PMULULQ(v2, t0) + PADDQ(v2, v0) + PADDQ(t0, v0) + PADDQ(t0, v0) + PXOR(v0, v6) + PSHUFD(Imm(0xB1), v6, v6) + MOVO(v4, t0) + PMULULQ(v6, t0) + PADDQ(v6, v4) + PADDQ(t0, v4) + PADDQ(t0, v4) + PXOR(v4, v2) + PSHUFB(c40, v2) + MOVO(v0, t0) + PMULULQ(v2, t0) + PADDQ(v2, v0) + PADDQ(t0, v0) + PADDQ(t0, v0) + PXOR(v0, v6) + PSHUFB(c48, v6) + MOVO(v4, t0) + PMULULQ(v6, t0) + PADDQ(v6, v4) + PADDQ(t0, v4) + PADDQ(t0, v4) + PXOR(v4, v2) + MOVO(v2, t0) + PADDQ(v2, t0) + PSRLQ(Imm(63), v2) + PXOR(t0, v2) + MOVO(v1, t0) + PMULULQ(v3, t0) + PADDQ(v3, v1) + PADDQ(t0, v1) + PADDQ(t0, v1) + PXOR(v1, v7) + PSHUFD(Imm(0xB1), v7, v7) + MOVO(v5, t0) + PMULULQ(v7, t0) + PADDQ(v7, v5) + PADDQ(t0, v5) + PADDQ(t0, v5) + PXOR(v5, v3) + PSHUFB(c40, v3) + MOVO(v1, t0) + PMULULQ(v3, t0) + PADDQ(v3, v1) + PADDQ(t0, v1) + PADDQ(t0, v1) + PXOR(v1, v7) + PSHUFB(c48, v7) + MOVO(v5, t0) + PMULULQ(v7, t0) + PADDQ(v7, v5) + PADDQ(t0, v5) + PADDQ(t0, v5) + PXOR(v5, v3) + MOVO(v3, t0) + PADDQ(v3, t0) + PSRLQ(Imm(63), v3) + PXOR(t0, v3) +} + +func LOAD_MSG_0(block GPPhysical, off int) { + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} + for i, r := range registers { + MOVOU(Mem{Base: block}.Offset(8*(off+(i*2))), r) + } +} + +func STORE_MSG_0(block GPPhysical, off int) { + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} + for i, r := range registers { + MOVOU(r, Mem{Base: block}.Offset(8*(off+(i*2)))) + } +} + +func LOAD_MSG_1(block GPPhysical, off int) { + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} + for i, r := range registers { + MOVOU(Mem{Base: block}.Offset(8*off+i*16*8), r) + } +} + +func STORE_MSG_1(block GPPhysical, off int) { + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} + for i, r := range registers { + MOVOU(r, Mem{Base: block}.Offset(8*off+i*16*8)) + } +} + +func BLAMKA_ROUND_0(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) { + LOAD_MSG_0(block, off) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48) + SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1) + STORE_MSG_0(block, off) +} + +func BLAMKA_ROUND_1(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) { + LOAD_MSG_1(block, off) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48) + SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1) + STORE_MSG_1(block, off) +} + +// ##------------------DATA SECTION-------------------## + +var c40_DATA_ptr, c48_DATA_ptr *Mem + +func c40_DATA() Mem { + if c40_DATA_ptr != nil { + return *c40_DATA_ptr + } + + c40_DATA := GLOBL("·c40", NOPTR|RODATA) + c40_DATA_ptr = &c40_DATA + DATA(0x00, U64(0x0201000706050403)) + DATA(0x08, U64(0x0a09080f0e0d0c0b)) + return c40_DATA +} +func c48_DATA() Mem { + if c48_DATA_ptr != nil { + return *c48_DATA_ptr + } + + c48_DATA := GLOBL("·c48", NOPTR|RODATA) + c48_DATA_ptr = &c48_DATA + DATA(0x00, U64(0x0100070605040302)) + DATA(0x08, U64(0x09080f0e0d0c0b0a)) + return c48_DATA +} diff --git a/argon2/_asm/go.mod b/argon2/_asm/go.mod new file mode 100644 index 0000000000..41a536dd77 --- /dev/null +++ b/argon2/_asm/go.mod @@ -0,0 +1,15 @@ +module argon2/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/argon2/_asm/go.sum b/argon2/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/argon2/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/argon2/blamka_amd64.s b/argon2/blamka_amd64.s index 6713accac0..c3895478ed 100644 --- a/argon2/blamka_amd64.s +++ b/argon2/blamka_amd64.s @@ -1,243 +1,2791 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blamka_amd64.go -out ../blamka_amd64.s -pkg argon2. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \ - MOVO v0, t0; \ - PMULULQ v2, t0; \ - PADDQ v2, v0; \ - PADDQ t0, v0; \ - PADDQ t0, v0; \ - PXOR v0, v6; \ - PSHUFD $0xB1, v6, v6; \ - MOVO v4, t0; \ - PMULULQ v6, t0; \ - PADDQ v6, v4; \ - PADDQ t0, v4; \ - PADDQ t0, v4; \ - PXOR v4, v2; \ - PSHUFB c40, v2; \ - MOVO v0, t0; \ - PMULULQ v2, t0; \ - PADDQ v2, v0; \ - PADDQ t0, v0; \ - PADDQ t0, v0; \ - PXOR v0, v6; \ - PSHUFB c48, v6; \ - MOVO v4, t0; \ - PMULULQ v6, t0; \ - PADDQ v6, v4; \ - PADDQ t0, v4; \ - PADDQ t0, v4; \ - PXOR v4, v2; \ - MOVO v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVO v1, t0; \ - PMULULQ v3, t0; \ - PADDQ v3, v1; \ - PADDQ t0, v1; \ - PADDQ t0, v1; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v7, v7; \ - MOVO v5, t0; \ - PMULULQ v7, t0; \ - PADDQ v7, v5; \ - PADDQ t0, v5; \ - PADDQ t0, v5; \ - PXOR v5, v3; \ - PSHUFB c40, v3; \ - MOVO v1, t0; \ - PMULULQ v3, t0; \ - PADDQ v3, v1; \ - PADDQ t0, v1; \ - PADDQ t0, v1; \ - PXOR v1, v7; \ - PSHUFB c48, v7; \ - MOVO v5, t0; \ - PMULULQ v7, t0; \ - PADDQ v7, v5; \ - PADDQ t0, v5; \ - PADDQ t0, v5; \ - PXOR v5, v3; \ - MOVO v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG_0(block, off) \ - MOVOU 8*(off+0)(block), X0; \ - MOVOU 8*(off+2)(block), X1; \ - MOVOU 8*(off+4)(block), X2; \ - MOVOU 8*(off+6)(block), X3; \ - MOVOU 8*(off+8)(block), X4; \ - MOVOU 8*(off+10)(block), X5; \ - MOVOU 8*(off+12)(block), X6; \ - MOVOU 8*(off+14)(block), X7 - -#define STORE_MSG_0(block, off) \ - MOVOU X0, 8*(off+0)(block); \ - MOVOU X1, 8*(off+2)(block); \ - MOVOU X2, 8*(off+4)(block); \ - MOVOU X3, 8*(off+6)(block); \ - MOVOU X4, 8*(off+8)(block); \ - MOVOU X5, 8*(off+10)(block); \ - MOVOU X6, 8*(off+12)(block); \ - MOVOU X7, 8*(off+14)(block) - -#define LOAD_MSG_1(block, off) \ - MOVOU 8*off+0*8(block), X0; \ - MOVOU 8*off+16*8(block), X1; \ - MOVOU 8*off+32*8(block), X2; \ - MOVOU 8*off+48*8(block), X3; \ - MOVOU 8*off+64*8(block), X4; \ - MOVOU 8*off+80*8(block), X5; \ - MOVOU 8*off+96*8(block), X6; \ - MOVOU 8*off+112*8(block), X7 - -#define STORE_MSG_1(block, off) \ - MOVOU X0, 8*off+0*8(block); \ - MOVOU X1, 8*off+16*8(block); \ - MOVOU X2, 8*off+32*8(block); \ - MOVOU X3, 8*off+48*8(block); \ - MOVOU X4, 8*off+64*8(block); \ - MOVOU X5, 8*off+80*8(block); \ - MOVOU X6, 8*off+96*8(block); \ - MOVOU X7, 8*off+112*8(block) - -#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \ - LOAD_MSG_0(block, off); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ - STORE_MSG_0(block, off) - -#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \ - LOAD_MSG_1(block, off); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ - STORE_MSG_1(block, off) - // func blamkaSSE4(b *block) -TEXT ·blamkaSSE4(SB), 4, $0-8 - MOVQ b+0(FP), AX - - MOVOU ·c40<>(SB), X10 - MOVOU ·c48<>(SB), X11 +// Requires: SSE2, SSSE3 +TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8 + MOVQ b+0(FP), AX + MOVOU ·c40<>+0(SB), X10 + MOVOU ·c48<>+0(SB), X11 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + MOVOU X6, 96(AX) + MOVOU X7, 112(AX) + MOVOU 128(AX), X0 + MOVOU 144(AX), X1 + MOVOU 160(AX), X2 + MOVOU 176(AX), X3 + MOVOU 192(AX), X4 + MOVOU 208(AX), X5 + MOVOU 224(AX), X6 + MOVOU 240(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 128(AX) + MOVOU X1, 144(AX) + MOVOU X2, 160(AX) + MOVOU X3, 176(AX) + MOVOU X4, 192(AX) + MOVOU X5, 208(AX) + MOVOU X6, 224(AX) + MOVOU X7, 240(AX) + MOVOU 256(AX), X0 + MOVOU 272(AX), X1 + MOVOU 288(AX), X2 + MOVOU 304(AX), X3 + MOVOU 320(AX), X4 + MOVOU 336(AX), X5 + MOVOU 352(AX), X6 + MOVOU 368(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 256(AX) + MOVOU X1, 272(AX) + MOVOU X2, 288(AX) + MOVOU X3, 304(AX) + MOVOU X4, 320(AX) + MOVOU X5, 336(AX) + MOVOU X6, 352(AX) + MOVOU X7, 368(AX) + MOVOU 384(AX), X0 + MOVOU 400(AX), X1 + MOVOU 416(AX), X2 + MOVOU 432(AX), X3 + MOVOU 448(AX), X4 + MOVOU 464(AX), X5 + MOVOU 480(AX), X6 + MOVOU 496(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 384(AX) + MOVOU X1, 400(AX) + MOVOU X2, 416(AX) + MOVOU X3, 432(AX) + MOVOU X4, 448(AX) + MOVOU X5, 464(AX) + MOVOU X6, 480(AX) + MOVOU X7, 496(AX) + MOVOU 512(AX), X0 + MOVOU 528(AX), X1 + MOVOU 544(AX), X2 + MOVOU 560(AX), X3 + MOVOU 576(AX), X4 + MOVOU 592(AX), X5 + MOVOU 608(AX), X6 + MOVOU 624(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 512(AX) + MOVOU X1, 528(AX) + MOVOU X2, 544(AX) + MOVOU X3, 560(AX) + MOVOU X4, 576(AX) + MOVOU X5, 592(AX) + MOVOU X6, 608(AX) + MOVOU X7, 624(AX) + MOVOU 640(AX), X0 + MOVOU 656(AX), X1 + MOVOU 672(AX), X2 + MOVOU 688(AX), X3 + MOVOU 704(AX), X4 + MOVOU 720(AX), X5 + MOVOU 736(AX), X6 + MOVOU 752(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 640(AX) + MOVOU X1, 656(AX) + MOVOU X2, 672(AX) + MOVOU X3, 688(AX) + MOVOU X4, 704(AX) + MOVOU X5, 720(AX) + MOVOU X6, 736(AX) + MOVOU X7, 752(AX) + MOVOU 768(AX), X0 + MOVOU 784(AX), X1 + MOVOU 800(AX), X2 + MOVOU 816(AX), X3 + MOVOU 832(AX), X4 + MOVOU 848(AX), X5 + MOVOU 864(AX), X6 + MOVOU 880(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 768(AX) + MOVOU X1, 784(AX) + MOVOU X2, 800(AX) + MOVOU X3, 816(AX) + MOVOU X4, 832(AX) + MOVOU X5, 848(AX) + MOVOU X6, 864(AX) + MOVOU X7, 880(AX) + MOVOU 896(AX), X0 + MOVOU 912(AX), X1 + MOVOU 928(AX), X2 + MOVOU 944(AX), X3 + MOVOU 960(AX), X4 + MOVOU 976(AX), X5 + MOVOU 992(AX), X6 + MOVOU 1008(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 896(AX) + MOVOU X1, 912(AX) + MOVOU X2, 928(AX) + MOVOU X3, 944(AX) + MOVOU X4, 960(AX) + MOVOU X5, 976(AX) + MOVOU X6, 992(AX) + MOVOU X7, 1008(AX) + MOVOU (AX), X0 + MOVOU 128(AX), X1 + MOVOU 256(AX), X2 + MOVOU 384(AX), X3 + MOVOU 512(AX), X4 + MOVOU 640(AX), X5 + MOVOU 768(AX), X6 + MOVOU 896(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, (AX) + MOVOU X1, 128(AX) + MOVOU X2, 256(AX) + MOVOU X3, 384(AX) + MOVOU X4, 512(AX) + MOVOU X5, 640(AX) + MOVOU X6, 768(AX) + MOVOU X7, 896(AX) + MOVOU 16(AX), X0 + MOVOU 144(AX), X1 + MOVOU 272(AX), X2 + MOVOU 400(AX), X3 + MOVOU 528(AX), X4 + MOVOU 656(AX), X5 + MOVOU 784(AX), X6 + MOVOU 912(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 16(AX) + MOVOU X1, 144(AX) + MOVOU X2, 272(AX) + MOVOU X3, 400(AX) + MOVOU X4, 528(AX) + MOVOU X5, 656(AX) + MOVOU X6, 784(AX) + MOVOU X7, 912(AX) + MOVOU 32(AX), X0 + MOVOU 160(AX), X1 + MOVOU 288(AX), X2 + MOVOU 416(AX), X3 + MOVOU 544(AX), X4 + MOVOU 672(AX), X5 + MOVOU 800(AX), X6 + MOVOU 928(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 32(AX) + MOVOU X1, 160(AX) + MOVOU X2, 288(AX) + MOVOU X3, 416(AX) + MOVOU X4, 544(AX) + MOVOU X5, 672(AX) + MOVOU X6, 800(AX) + MOVOU X7, 928(AX) + MOVOU 48(AX), X0 + MOVOU 176(AX), X1 + MOVOU 304(AX), X2 + MOVOU 432(AX), X3 + MOVOU 560(AX), X4 + MOVOU 688(AX), X5 + MOVOU 816(AX), X6 + MOVOU 944(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 48(AX) + MOVOU X1, 176(AX) + MOVOU X2, 304(AX) + MOVOU X3, 432(AX) + MOVOU X4, 560(AX) + MOVOU X5, 688(AX) + MOVOU X6, 816(AX) + MOVOU X7, 944(AX) + MOVOU 64(AX), X0 + MOVOU 192(AX), X1 + MOVOU 320(AX), X2 + MOVOU 448(AX), X3 + MOVOU 576(AX), X4 + MOVOU 704(AX), X5 + MOVOU 832(AX), X6 + MOVOU 960(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 64(AX) + MOVOU X1, 192(AX) + MOVOU X2, 320(AX) + MOVOU X3, 448(AX) + MOVOU X4, 576(AX) + MOVOU X5, 704(AX) + MOVOU X6, 832(AX) + MOVOU X7, 960(AX) + MOVOU 80(AX), X0 + MOVOU 208(AX), X1 + MOVOU 336(AX), X2 + MOVOU 464(AX), X3 + MOVOU 592(AX), X4 + MOVOU 720(AX), X5 + MOVOU 848(AX), X6 + MOVOU 976(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 80(AX) + MOVOU X1, 208(AX) + MOVOU X2, 336(AX) + MOVOU X3, 464(AX) + MOVOU X4, 592(AX) + MOVOU X5, 720(AX) + MOVOU X6, 848(AX) + MOVOU X7, 976(AX) + MOVOU 96(AX), X0 + MOVOU 224(AX), X1 + MOVOU 352(AX), X2 + MOVOU 480(AX), X3 + MOVOU 608(AX), X4 + MOVOU 736(AX), X5 + MOVOU 864(AX), X6 + MOVOU 992(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 96(AX) + MOVOU X1, 224(AX) + MOVOU X2, 352(AX) + MOVOU X3, 480(AX) + MOVOU X4, 608(AX) + MOVOU X5, 736(AX) + MOVOU X6, 864(AX) + MOVOU X7, 992(AX) + MOVOU 112(AX), X0 + MOVOU 240(AX), X1 + MOVOU 368(AX), X2 + MOVOU 496(AX), X3 + MOVOU 624(AX), X4 + MOVOU 752(AX), X5 + MOVOU 880(AX), X6 + MOVOU 1008(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 112(AX) + MOVOU X1, 240(AX) + MOVOU X2, 368(AX) + MOVOU X3, 496(AX) + MOVOU X4, 624(AX) + MOVOU X5, 752(AX) + MOVOU X6, 880(AX) + MOVOU X7, 1008(AX) + RET - BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11) +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 - BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11) - RET +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 -// func mixBlocksSSE2(out, a, b, c *block) -TEXT ·mixBlocksSSE2(SB), 4, $0-32 +// func mixBlocksSSE2(out *block, a *block, b *block, c *block) +// Requires: SSE2 +TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX MOVQ c+24(FP), CX - MOVQ $128, DI + MOVQ $0x00000080, DI loop: - MOVOU 0(AX), X0 - MOVOU 0(BX), X1 - MOVOU 0(CX), X2 + MOVOU (AX), X0 + MOVOU (BX), X1 + MOVOU (CX), X2 PXOR X1, X0 PXOR X2, X0 - MOVOU X0, 0(DX) - ADDQ $16, AX - ADDQ $16, BX - ADDQ $16, CX - ADDQ $16, DX - SUBQ $2, DI + MOVOU X0, (DX) + ADDQ $0x10, AX + ADDQ $0x10, BX + ADDQ $0x10, CX + ADDQ $0x10, DX + SUBQ $0x02, DI JA loop RET -// func xorBlocksSSE2(out, a, b, c *block) -TEXT ·xorBlocksSSE2(SB), 4, $0-32 +// func xorBlocksSSE2(out *block, a *block, b *block, c *block) +// Requires: SSE2 +TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX MOVQ c+24(FP), CX - MOVQ $128, DI + MOVQ $0x00000080, DI loop: - MOVOU 0(AX), X0 - MOVOU 0(BX), X1 - MOVOU 0(CX), X2 - MOVOU 0(DX), X3 + MOVOU (AX), X0 + MOVOU (BX), X1 + MOVOU (CX), X2 + MOVOU (DX), X3 PXOR X1, X0 PXOR X2, X0 PXOR X3, X0 - MOVOU X0, 0(DX) - ADDQ $16, AX - ADDQ $16, BX - ADDQ $16, CX - ADDQ $16, DX - SUBQ $2, DI + MOVOU X0, (DX) + ADDQ $0x10, AX + ADDQ $0x10, BX + ADDQ $0x10, CX + ADDQ $0x10, DX + SUBQ $0x02, DI JA loop RET From 38ed1bc0ecdc7fbeadbc7337dc44667688eac800 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Thu, 25 Jul 2024 22:25:00 -0400 Subject: [PATCH 71/95] blake2s: port blake2s_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="blake2s/blake2s_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: Ica8bf9f0b42dc93714aa54e783fa74ed19e6b9f4 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/601216 Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Filippo Valsorda --- blake2s/_asm/blake2s_amd64_asm.go | 525 ++++++ blake2s/_asm/go.mod | 15 + blake2s/_asm/go.sum | 12 + blake2s/blake2s_amd64.s | 2571 ++++++++++++++++++++++++----- 4 files changed, 2708 insertions(+), 415 deletions(-) create mode 100644 blake2s/_asm/blake2s_amd64_asm.go create mode 100644 blake2s/_asm/go.mod create mode 100644 blake2s/_asm/go.sum diff --git a/blake2s/_asm/blake2s_amd64_asm.go b/blake2s/_asm/blake2s_amd64_asm.go new file mode 100644 index 0000000000..48ddd6118a --- /dev/null +++ b/blake2s/_asm/blake2s_amd64_asm.go @@ -0,0 +1,525 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/blake2s" +) + +//go:generate go run . -out ../blake2s_amd64.s -pkg blake2s + +func main() { + Package("golang.org/x/crypto/blake2s") + ConstraintExpr("amd64,gc,!purego") + hashBlocksSSE2() + hashBlocksSSSE3() + hashBlocksSSE4() + Generate() +} + +func ROTL_SSE2(n uint64, t, v VecPhysical) { + MOVO(v, t) + PSLLL(Imm(n), t) + PSRLL(Imm(32-n), v) + PXOR(t, v) +} + +func ROTL_SSSE3(c, v VecPhysical) { + PSHUFB(c, v) +} + +func ROUND_SSE2(v0, v1, v2, v3 VecPhysical, m0, m1, m2, m3 Mem, t VecPhysical) { + PADDL(m0, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSE2(16, t, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(20, t, v1) + PADDL(m1, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSE2(24, t, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(25, t, v1) + PSHUFL(Imm(0x39), v1, v1) + PSHUFL(Imm(0x4E), v2, v2) + PSHUFL(Imm(0x93), v3, v3) + PADDL(m2, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSE2(16, t, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(20, t, v1) + PADDL(m3, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSE2(24, t, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(25, t, v1) + PSHUFL(Imm(0x39), v3, v3) + PSHUFL(Imm(0x4E), v2, v2) + PSHUFL(Imm(0x93), v1, v1) +} + +func ROUND_SSSE3(v0, v1, v2, v3 VecPhysical, m0, m1, m2, m3 Op, t, c16, c8 VecPhysical) { + PADDL(m0, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSSE3(c16, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(20, t, v1) + PADDL(m1, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSSE3(c8, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(25, t, v1) + PSHUFL(Imm(0x39), v1, v1) + PSHUFL(Imm(0x4E), v2, v2) + PSHUFL(Imm(0x93), v3, v3) + PADDL(m2, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSSE3(c16, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(20, t, v1) + PADDL(m3, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSSE3(c8, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(25, t, v1) + PSHUFL(Imm(0x39), v3, v3) + PSHUFL(Imm(0x4E), v2, v2) + PSHUFL(Imm(0x93), v1, v1) +} + +func LOAD_MSG_SSE4(m0, m1, m2, m3 VecPhysical, src GPPhysical, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 int) { + // Hack to get Avo to emit a MOVL instruction with a VecPhysical as the destination + Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{Mem{Base: src}.Offset(i0 * 4), m0}}) + PINSRD(Imm(1), Mem{Base: src}.Offset(i1*4), m0) + PINSRD(Imm(2), Mem{Base: src}.Offset(i2*4), m0) + PINSRD(Imm(3), Mem{Base: src}.Offset(i3*4), m0) + Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{Mem{Base: src}.Offset(i4 * 4), m1}}) + PINSRD(Imm(1), Mem{Base: src}.Offset(i5*4), m1) + PINSRD(Imm(2), Mem{Base: src}.Offset(i6*4), m1) + PINSRD(Imm(3), Mem{Base: src}.Offset(i7*4), m1) + Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{Mem{Base: src}.Offset(i8 * 4), m2}}) + PINSRD(Imm(1), Mem{Base: src}.Offset(i9*4), m2) + PINSRD(Imm(2), Mem{Base: src}.Offset(i10*4), m2) + PINSRD(Imm(3), Mem{Base: src}.Offset(i11*4), m2) + Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{Mem{Base: src}.Offset(i12 * 4), m3}}) + PINSRD(Imm(1), Mem{Base: src}.Offset(i13*4), m3) + PINSRD(Imm(2), Mem{Base: src}.Offset(i14*4), m3) + PINSRD(Imm(3), Mem{Base: src}.Offset(i15*4), m3) +} + +func PRECOMPUTE_MSG(dst GPPhysical, off int, src, R8, R9, R10, R11, R12, R13, R14, R15 GPPhysical) { + MOVQ(Mem{Base: src}.Offset(0*4), R8) + MOVQ(Mem{Base: src}.Offset(2*4), R9) + MOVQ(Mem{Base: src}.Offset(4*4), R10) + MOVQ(Mem{Base: src}.Offset(6*4), R11) + MOVQ(Mem{Base: src}.Offset(8*4), R12) + MOVQ(Mem{Base: src}.Offset(10*4), R13) + MOVQ(Mem{Base: src}.Offset(12*4), R14) + MOVQ(Mem{Base: src}.Offset(14*4), R15) + + MOVL(R8L, Mem{Base: dst}.Offset(0*4+off+0)) + MOVL(R8L, Mem{Base: dst}.Offset(9*4+off+64)) + MOVL(R8L, Mem{Base: dst}.Offset(5*4+off+128)) + MOVL(R8L, Mem{Base: dst}.Offset(14*4+off+192)) + MOVL(R8L, Mem{Base: dst}.Offset(4*4+off+256)) + MOVL(R8L, Mem{Base: dst}.Offset(2*4+off+320)) + MOVL(R8L, Mem{Base: dst}.Offset(8*4+off+384)) + MOVL(R8L, Mem{Base: dst}.Offset(12*4+off+448)) + MOVL(R8L, Mem{Base: dst}.Offset(3*4+off+512)) + MOVL(R8L, Mem{Base: dst}.Offset(15*4+off+576)) + SHRQ(Imm(32), R8) + MOVL(R8L, Mem{Base: dst}.Offset(4*4+off+0)) + MOVL(R8L, Mem{Base: dst}.Offset(8*4+off+64)) + MOVL(R8L, Mem{Base: dst}.Offset(14*4+off+128)) + MOVL(R8L, Mem{Base: dst}.Offset(5*4+off+192)) + MOVL(R8L, Mem{Base: dst}.Offset(12*4+off+256)) + MOVL(R8L, Mem{Base: dst}.Offset(11*4+off+320)) + MOVL(R8L, Mem{Base: dst}.Offset(1*4+off+384)) + MOVL(R8L, Mem{Base: dst}.Offset(6*4+off+448)) + MOVL(R8L, Mem{Base: dst}.Offset(10*4+off+512)) + MOVL(R8L, Mem{Base: dst}.Offset(3*4+off+576)) + + MOVL(R9L, Mem{Base: dst}.Offset(1*4+off+0)) + MOVL(R9L, Mem{Base: dst}.Offset(13*4+off+64)) + MOVL(R9L, Mem{Base: dst}.Offset(6*4+off+128)) + MOVL(R9L, Mem{Base: dst}.Offset(8*4+off+192)) + MOVL(R9L, Mem{Base: dst}.Offset(2*4+off+256)) + MOVL(R9L, Mem{Base: dst}.Offset(0*4+off+320)) + MOVL(R9L, Mem{Base: dst}.Offset(14*4+off+384)) + MOVL(R9L, Mem{Base: dst}.Offset(11*4+off+448)) + MOVL(R9L, Mem{Base: dst}.Offset(12*4+off+512)) + MOVL(R9L, Mem{Base: dst}.Offset(4*4+off+576)) + SHRQ(Imm(32), R9) + MOVL(R9L, Mem{Base: dst}.Offset(5*4+off+0)) + MOVL(R9L, Mem{Base: dst}.Offset(15*4+off+64)) + MOVL(R9L, Mem{Base: dst}.Offset(9*4+off+128)) + MOVL(R9L, Mem{Base: dst}.Offset(1*4+off+192)) + MOVL(R9L, Mem{Base: dst}.Offset(11*4+off+256)) + MOVL(R9L, Mem{Base: dst}.Offset(7*4+off+320)) + MOVL(R9L, Mem{Base: dst}.Offset(13*4+off+384)) + MOVL(R9L, Mem{Base: dst}.Offset(3*4+off+448)) + MOVL(R9L, Mem{Base: dst}.Offset(6*4+off+512)) + MOVL(R9L, Mem{Base: dst}.Offset(10*4+off+576)) + + MOVL(R10L, Mem{Base: dst}.Offset(2*4+off+0)) + MOVL(R10L, Mem{Base: dst}.Offset(1*4+off+64)) + MOVL(R10L, Mem{Base: dst}.Offset(15*4+off+128)) + MOVL(R10L, Mem{Base: dst}.Offset(10*4+off+192)) + MOVL(R10L, Mem{Base: dst}.Offset(6*4+off+256)) + MOVL(R10L, Mem{Base: dst}.Offset(8*4+off+320)) + MOVL(R10L, Mem{Base: dst}.Offset(3*4+off+384)) + MOVL(R10L, Mem{Base: dst}.Offset(13*4+off+448)) + MOVL(R10L, Mem{Base: dst}.Offset(14*4+off+512)) + MOVL(R10L, Mem{Base: dst}.Offset(5*4+off+576)) + SHRQ(Imm(32), R10) + MOVL(R10L, Mem{Base: dst}.Offset(6*4+off+0)) + MOVL(R10L, Mem{Base: dst}.Offset(11*4+off+64)) + MOVL(R10L, Mem{Base: dst}.Offset(2*4+off+128)) + MOVL(R10L, Mem{Base: dst}.Offset(9*4+off+192)) + MOVL(R10L, Mem{Base: dst}.Offset(1*4+off+256)) + MOVL(R10L, Mem{Base: dst}.Offset(13*4+off+320)) + MOVL(R10L, Mem{Base: dst}.Offset(4*4+off+384)) + MOVL(R10L, Mem{Base: dst}.Offset(8*4+off+448)) + MOVL(R10L, Mem{Base: dst}.Offset(15*4+off+512)) + MOVL(R10L, Mem{Base: dst}.Offset(7*4+off+576)) + + MOVL(R11L, Mem{Base: dst}.Offset(3*4+off+0)) + MOVL(R11L, Mem{Base: dst}.Offset(7*4+off+64)) + MOVL(R11L, Mem{Base: dst}.Offset(13*4+off+128)) + MOVL(R11L, Mem{Base: dst}.Offset(12*4+off+192)) + MOVL(R11L, Mem{Base: dst}.Offset(10*4+off+256)) + MOVL(R11L, Mem{Base: dst}.Offset(1*4+off+320)) + MOVL(R11L, Mem{Base: dst}.Offset(9*4+off+384)) + MOVL(R11L, Mem{Base: dst}.Offset(14*4+off+448)) + MOVL(R11L, Mem{Base: dst}.Offset(0*4+off+512)) + MOVL(R11L, Mem{Base: dst}.Offset(6*4+off+576)) + SHRQ(Imm(32), R11) + MOVL(R11L, Mem{Base: dst}.Offset(7*4+off+0)) + MOVL(R11L, Mem{Base: dst}.Offset(14*4+off+64)) + MOVL(R11L, Mem{Base: dst}.Offset(10*4+off+128)) + MOVL(R11L, Mem{Base: dst}.Offset(0*4+off+192)) + MOVL(R11L, Mem{Base: dst}.Offset(5*4+off+256)) + MOVL(R11L, Mem{Base: dst}.Offset(9*4+off+320)) + MOVL(R11L, Mem{Base: dst}.Offset(12*4+off+384)) + MOVL(R11L, Mem{Base: dst}.Offset(1*4+off+448)) + MOVL(R11L, Mem{Base: dst}.Offset(13*4+off+512)) + MOVL(R11L, Mem{Base: dst}.Offset(2*4+off+576)) + + MOVL(R12L, Mem{Base: dst}.Offset(8*4+off+0)) + MOVL(R12L, Mem{Base: dst}.Offset(5*4+off+64)) + MOVL(R12L, Mem{Base: dst}.Offset(4*4+off+128)) + MOVL(R12L, Mem{Base: dst}.Offset(15*4+off+192)) + MOVL(R12L, Mem{Base: dst}.Offset(14*4+off+256)) + MOVL(R12L, Mem{Base: dst}.Offset(3*4+off+320)) + MOVL(R12L, Mem{Base: dst}.Offset(11*4+off+384)) + MOVL(R12L, Mem{Base: dst}.Offset(10*4+off+448)) + MOVL(R12L, Mem{Base: dst}.Offset(7*4+off+512)) + MOVL(R12L, Mem{Base: dst}.Offset(1*4+off+576)) + SHRQ(Imm(32), R12) + MOVL(R12L, Mem{Base: dst}.Offset(12*4+off+0)) + MOVL(R12L, Mem{Base: dst}.Offset(2*4+off+64)) + MOVL(R12L, Mem{Base: dst}.Offset(11*4+off+128)) + MOVL(R12L, Mem{Base: dst}.Offset(4*4+off+192)) + MOVL(R12L, Mem{Base: dst}.Offset(0*4+off+256)) + MOVL(R12L, Mem{Base: dst}.Offset(15*4+off+320)) + MOVL(R12L, Mem{Base: dst}.Offset(10*4+off+384)) + MOVL(R12L, Mem{Base: dst}.Offset(7*4+off+448)) + MOVL(R12L, Mem{Base: dst}.Offset(5*4+off+512)) + MOVL(R12L, Mem{Base: dst}.Offset(9*4+off+576)) + + MOVL(R13L, Mem{Base: dst}.Offset(9*4+off+0)) + MOVL(R13L, Mem{Base: dst}.Offset(4*4+off+64)) + MOVL(R13L, Mem{Base: dst}.Offset(8*4+off+128)) + MOVL(R13L, Mem{Base: dst}.Offset(13*4+off+192)) + MOVL(R13L, Mem{Base: dst}.Offset(3*4+off+256)) + MOVL(R13L, Mem{Base: dst}.Offset(5*4+off+320)) + MOVL(R13L, Mem{Base: dst}.Offset(7*4+off+384)) + MOVL(R13L, Mem{Base: dst}.Offset(15*4+off+448)) + MOVL(R13L, Mem{Base: dst}.Offset(11*4+off+512)) + MOVL(R13L, Mem{Base: dst}.Offset(0*4+off+576)) + SHRQ(Imm(32), R13) + MOVL(R13L, Mem{Base: dst}.Offset(13*4+off+0)) + MOVL(R13L, Mem{Base: dst}.Offset(10*4+off+64)) + MOVL(R13L, Mem{Base: dst}.Offset(0*4+off+128)) + MOVL(R13L, Mem{Base: dst}.Offset(3*4+off+192)) + MOVL(R13L, Mem{Base: dst}.Offset(9*4+off+256)) + MOVL(R13L, Mem{Base: dst}.Offset(6*4+off+320)) + MOVL(R13L, Mem{Base: dst}.Offset(15*4+off+384)) + MOVL(R13L, Mem{Base: dst}.Offset(4*4+off+448)) + MOVL(R13L, Mem{Base: dst}.Offset(2*4+off+512)) + MOVL(R13L, Mem{Base: dst}.Offset(12*4+off+576)) + + MOVL(R14L, Mem{Base: dst}.Offset(10*4+off+0)) + MOVL(R14L, Mem{Base: dst}.Offset(12*4+off+64)) + MOVL(R14L, Mem{Base: dst}.Offset(1*4+off+128)) + MOVL(R14L, Mem{Base: dst}.Offset(6*4+off+192)) + MOVL(R14L, Mem{Base: dst}.Offset(13*4+off+256)) + MOVL(R14L, Mem{Base: dst}.Offset(4*4+off+320)) + MOVL(R14L, Mem{Base: dst}.Offset(0*4+off+384)) + MOVL(R14L, Mem{Base: dst}.Offset(2*4+off+448)) + MOVL(R14L, Mem{Base: dst}.Offset(8*4+off+512)) + MOVL(R14L, Mem{Base: dst}.Offset(14*4+off+576)) + SHRQ(Imm(32), R14) + MOVL(R14L, Mem{Base: dst}.Offset(14*4+off+0)) + MOVL(R14L, Mem{Base: dst}.Offset(3*4+off+64)) + MOVL(R14L, Mem{Base: dst}.Offset(7*4+off+128)) + MOVL(R14L, Mem{Base: dst}.Offset(2*4+off+192)) + MOVL(R14L, Mem{Base: dst}.Offset(15*4+off+256)) + MOVL(R14L, Mem{Base: dst}.Offset(12*4+off+320)) + MOVL(R14L, Mem{Base: dst}.Offset(6*4+off+384)) + MOVL(R14L, Mem{Base: dst}.Offset(0*4+off+448)) + MOVL(R14L, Mem{Base: dst}.Offset(9*4+off+512)) + MOVL(R14L, Mem{Base: dst}.Offset(11*4+off+576)) + + MOVL(R15L, Mem{Base: dst}.Offset(11*4+off+0)) + MOVL(R15L, Mem{Base: dst}.Offset(0*4+off+64)) + MOVL(R15L, Mem{Base: dst}.Offset(12*4+off+128)) + MOVL(R15L, Mem{Base: dst}.Offset(7*4+off+192)) + MOVL(R15L, Mem{Base: dst}.Offset(8*4+off+256)) + MOVL(R15L, Mem{Base: dst}.Offset(14*4+off+320)) + MOVL(R15L, Mem{Base: dst}.Offset(2*4+off+384)) + MOVL(R15L, Mem{Base: dst}.Offset(5*4+off+448)) + MOVL(R15L, Mem{Base: dst}.Offset(1*4+off+512)) + MOVL(R15L, Mem{Base: dst}.Offset(13*4+off+576)) + SHRQ(Imm(32), R15) + MOVL(R15L, Mem{Base: dst}.Offset(15*4+off+0)) + MOVL(R15L, Mem{Base: dst}.Offset(6*4+off+64)) + MOVL(R15L, Mem{Base: dst}.Offset(3*4+off+128)) + MOVL(R15L, Mem{Base: dst}.Offset(11*4+off+192)) + MOVL(R15L, Mem{Base: dst}.Offset(7*4+off+256)) + MOVL(R15L, Mem{Base: dst}.Offset(10*4+off+320)) + MOVL(R15L, Mem{Base: dst}.Offset(5*4+off+384)) + MOVL(R15L, Mem{Base: dst}.Offset(9*4+off+448)) + MOVL(R15L, Mem{Base: dst}.Offset(4*4+off+512)) + MOVL(R15L, Mem{Base: dst}.Offset(8*4+off+576)) +} + +func BLAKE2s_SSE2() { + PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15) + for i := 0; i < 10; i++ { + ROUND_SSE2(X4, X5, X6, X7, Mem{Base: BP}.Offset(16+64*i), Mem{Base: BP}.Offset(32+64*i), Mem{Base: BP}.Offset(48+64*i), Mem{Base: BP}.Offset(64+64*i), X8) + } +} + +func BLAKE2s_SSSE3() { + PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15) + for i := 0; i < 10; i++ { + ROUND_SSSE3(X4, X5, X6, X7, Mem{Base: BP}.Offset(16+64*i), Mem{Base: BP}.Offset(32+64*i), Mem{Base: BP}.Offset(48+64*i), Mem{Base: BP}.Offset(64+64*i), X8, X13, X14) + } +} + +func BLAKE2s_SSE4() { + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) +} + +func HASH_BLOCKS(h, c, flag, blocks_base, blocks_len Mem, BLAKE2s_FUNC func()) { + MOVQ(h, RAX) + MOVQ(c, RBX) + MOVL(flag, ECX) + MOVQ(blocks_base, RSI) + MOVQ(blocks_len, RDX) + + MOVQ(RSP, RBP) + ADDQ(Imm(15), RBP) + ANDQ(I32(^15), RBP) + + MOVQ(Mem{Base: BX}.Offset(0), R9) + MOVQ(R9, Mem{Base: BP}.Offset(0)) + MOVQ(RCX, Mem{Base: BP}.Offset(8)) + + MOVOU(Mem{Base: AX}.Offset(0), X0) + MOVOU(Mem{Base: AX}.Offset(16), X1) + + iv0 := iv0_DATA() + iv1 := iv1_DATA() + MOVOU(iv0, X2) + MOVOU(iv1, X3) + + counter := counter_DATA() + rol16 := rol16_DATA() + rol8 := rol8_DATA() + MOVOU(counter, X12) + MOVOU(rol16, X13) + MOVOU(rol8, X14) + MOVO(Mem{Base: BP}.Offset(0), X15) + + Label("loop") + MOVO(X0, X4) + MOVO(X1, X5) + MOVO(X2, X6) + MOVO(X3, X7) + + PADDQ(X12, X15) + PXOR(X15, X7) + + BLAKE2s_FUNC() + + PXOR(X4, X0) + PXOR(X5, X1) + PXOR(X6, X0) + PXOR(X7, X1) + + LEAQ(Mem{Base: SI}.Offset(64), RSI) + SUBQ(Imm(64), RDX) + JNE(LabelRef("loop")) + + MOVO(X15, Mem{Base: BP}.Offset(0)) + MOVQ(Mem{Base: BP}.Offset(0), R9) + MOVQ(R9, Mem{Base: BX}.Offset(0)) + + MOVOU(X0, Mem{Base: AX}.Offset(0)) + MOVOU(X1, Mem{Base: AX}.Offset(16)) +} + +func hashBlocksSSE2() { + Implement("hashBlocksSSE2") + Attributes(0) + AllocLocal(672) // frame = 656 + 16 byte alignment + + h := NewParamAddr("h", 0) + c := NewParamAddr("c", 8) + flag := NewParamAddr("flag", 16) + blocks_base := NewParamAddr("blocks_base", 24) + blocks_len := NewParamAddr("blocks_len", 32) + + HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_SSE2) + RET() +} + +func hashBlocksSSSE3() { + Implement("hashBlocksSSSE3") + Attributes(0) + AllocLocal(672) // frame = 656 + 16 byte alignment + + h := NewParamAddr("h", 0) + c := NewParamAddr("c", 8) + flag := NewParamAddr("flag", 16) + blocks_base := NewParamAddr("blocks_base", 24) + blocks_len := NewParamAddr("blocks_len", 32) + + HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_SSSE3) + RET() +} + +func hashBlocksSSE4() { + Implement("hashBlocksSSE4") + Attributes(0) + AllocLocal(32) // frame = 16 + 16 byte alignment + + h := NewParamAddr("h", 0) + c := NewParamAddr("c", 8) + flag := NewParamAddr("flag", 16) + blocks_base := NewParamAddr("blocks_base", 24) + blocks_len := NewParamAddr("blocks_len", 32) + + HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_SSE4) + RET() +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +var iv0_DATA_ptr, iv1_DATA_ptr, rol16_DATA_ptr, rol8_DATA_ptr, counter_DATA_ptr *Mem + +func iv0_DATA() Mem { + if iv0_DATA_ptr != nil { + return *iv0_DATA_ptr + } + + iv0_DATA := GLOBL("iv0", NOPTR|RODATA) + iv0_DATA_ptr = &iv0_DATA + DATA(0x00, U32(0x6a09e667)) + DATA(0x04, U32(0xbb67ae85)) + DATA(0x08, U32(0x3c6ef372)) + DATA(0x0c, U32(0xa54ff53a)) + return iv0_DATA +} + +func iv1_DATA() Mem { + if iv1_DATA_ptr != nil { + return *iv1_DATA_ptr + } + + iv1_DATA := GLOBL("iv1", NOPTR|RODATA) + iv1_DATA_ptr = &iv1_DATA + DATA(0x00, U32(0x510e527f)) + DATA(0x04, U32(0x9b05688c)) + DATA(0x08, U32(0x1f83d9ab)) + DATA(0x0c, U32(0x5be0cd19)) + return iv1_DATA +} + +func rol16_DATA() Mem { + if rol16_DATA_ptr != nil { + return *rol16_DATA_ptr + } + + rol16_DATA := GLOBL("rol16", NOPTR|RODATA) + rol16_DATA_ptr = &rol16_DATA + DATA(0x00, U64(0x0504070601000302)) + DATA(0x08, U64(0x0D0C0F0E09080B0A)) + return rol16_DATA +} + +func rol8_DATA() Mem { + if rol8_DATA_ptr != nil { + return *rol8_DATA_ptr + } + + rol8_DATA := GLOBL("rol8", NOPTR|RODATA) + rol8_DATA_ptr = &rol8_DATA + DATA(0x00, U64(0x0407060500030201)) + DATA(0x08, U64(0x0C0F0E0D080B0A09)) + return rol8_DATA +} + +func counter_DATA() Mem { + if counter_DATA_ptr != nil { + return *counter_DATA_ptr + } + + counter_DATA := GLOBL("counter", NOPTR|RODATA) + counter_DATA_ptr = &counter_DATA + DATA(0x00, U64(0x0000000000000040)) + DATA(0x08, U64(0x0000000000000000)) + return counter_DATA +} diff --git a/blake2s/_asm/go.mod b/blake2s/_asm/go.mod new file mode 100644 index 0000000000..9bb23e0eb1 --- /dev/null +++ b/blake2s/_asm/go.mod @@ -0,0 +1,15 @@ +module blake2s/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/blake2s/_asm/go.sum b/blake2s/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/blake2s/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/blake2s/blake2s_amd64.s b/blake2s/blake2s_amd64.s index fe4b818a33..57d510fc08 100644 --- a/blake2s/blake2s_amd64.s +++ b/blake2s/blake2s_amd64.s @@ -1,432 +1,2173 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2s_amd64_asm.go -out ../blake2s_amd64.s -pkg blake2s. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA iv0<>+0x00(SB)/4, $0x6a09e667 -DATA iv0<>+0x04(SB)/4, $0xbb67ae85 -DATA iv0<>+0x08(SB)/4, $0x3c6ef372 -DATA iv0<>+0x0c(SB)/4, $0xa54ff53a -GLOBL iv0<>(SB), (NOPTR+RODATA), $16 - -DATA iv1<>+0x00(SB)/4, $0x510e527f -DATA iv1<>+0x04(SB)/4, $0x9b05688c -DATA iv1<>+0x08(SB)/4, $0x1f83d9ab -DATA iv1<>+0x0c(SB)/4, $0x5be0cd19 -GLOBL iv1<>(SB), (NOPTR+RODATA), $16 - -DATA rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -GLOBL rol16<>(SB), (NOPTR+RODATA), $16 - -DATA rol8<>+0x00(SB)/8, $0x0407060500030201 -DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 -GLOBL rol8<>(SB), (NOPTR+RODATA), $16 - -DATA counter<>+0x00(SB)/8, $0x40 -DATA counter<>+0x08(SB)/8, $0x0 -GLOBL counter<>(SB), (NOPTR+RODATA), $16 - -#define ROTL_SSE2(n, t, v) \ - MOVO v, t; \ - PSLLL $n, t; \ - PSRLL $(32-n), v; \ - PXOR t, v - -#define ROTL_SSSE3(c, v) \ - PSHUFB c, v - -#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \ - PADDL m0, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(16, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m1, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(24, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v1, v1; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v3, v3; \ - PADDL m2, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(16, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m3, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(24, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v3, v3; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v1, v1 - -#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \ - PADDL m0, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c16, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m1, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c8, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v1, v1; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v3, v3; \ - PADDL m2, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c16, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m3, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c8, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v3, v3; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v1, v1 - - -#define LOAD_MSG_SSE4(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15) \ - MOVL i0*4(src), m0; \ - PINSRD $1, i1*4(src), m0; \ - PINSRD $2, i2*4(src), m0; \ - PINSRD $3, i3*4(src), m0; \ - MOVL i4*4(src), m1; \ - PINSRD $1, i5*4(src), m1; \ - PINSRD $2, i6*4(src), m1; \ - PINSRD $3, i7*4(src), m1; \ - MOVL i8*4(src), m2; \ - PINSRD $1, i9*4(src), m2; \ - PINSRD $2, i10*4(src), m2; \ - PINSRD $3, i11*4(src), m2; \ - MOVL i12*4(src), m3; \ - PINSRD $1, i13*4(src), m3; \ - PINSRD $2, i14*4(src), m3; \ - PINSRD $3, i15*4(src), m3 +// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) +// Requires: SSE2 +TEXT ·hashBlocksSSE2(SB), $672-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVL flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DX + MOVQ SP, BP + ADDQ $0x0f, BP + ANDQ $-16, BP + MOVQ (BX), R9 + MOVQ R9, (BP) + MOVQ CX, 8(BP) + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU iv0<>+0(SB), X2 + MOVOU iv1<>+0(SB), X3 + MOVOU counter<>+0(SB), X12 + MOVOU rol16<>+0(SB), X13 + MOVOU rol8<>+0(SB), X14 + MOVO (BP), X15 -#define PRECOMPUTE_MSG(dst, off, src, R8, R9, R10, R11, R12, R13, R14, R15) \ - MOVQ 0*4(src), R8; \ - MOVQ 2*4(src), R9; \ - MOVQ 4*4(src), R10; \ - MOVQ 6*4(src), R11; \ - MOVQ 8*4(src), R12; \ - MOVQ 10*4(src), R13; \ - MOVQ 12*4(src), R14; \ - MOVQ 14*4(src), R15; \ - \ - MOVL R8, 0*4+off+0(dst); \ - MOVL R8, 9*4+off+64(dst); \ - MOVL R8, 5*4+off+128(dst); \ - MOVL R8, 14*4+off+192(dst); \ - MOVL R8, 4*4+off+256(dst); \ - MOVL R8, 2*4+off+320(dst); \ - MOVL R8, 8*4+off+384(dst); \ - MOVL R8, 12*4+off+448(dst); \ - MOVL R8, 3*4+off+512(dst); \ - MOVL R8, 15*4+off+576(dst); \ - SHRQ $32, R8; \ - MOVL R8, 4*4+off+0(dst); \ - MOVL R8, 8*4+off+64(dst); \ - MOVL R8, 14*4+off+128(dst); \ - MOVL R8, 5*4+off+192(dst); \ - MOVL R8, 12*4+off+256(dst); \ - MOVL R8, 11*4+off+320(dst); \ - MOVL R8, 1*4+off+384(dst); \ - MOVL R8, 6*4+off+448(dst); \ - MOVL R8, 10*4+off+512(dst); \ - MOVL R8, 3*4+off+576(dst); \ - \ - MOVL R9, 1*4+off+0(dst); \ - MOVL R9, 13*4+off+64(dst); \ - MOVL R9, 6*4+off+128(dst); \ - MOVL R9, 8*4+off+192(dst); \ - MOVL R9, 2*4+off+256(dst); \ - MOVL R9, 0*4+off+320(dst); \ - MOVL R9, 14*4+off+384(dst); \ - MOVL R9, 11*4+off+448(dst); \ - MOVL R9, 12*4+off+512(dst); \ - MOVL R9, 4*4+off+576(dst); \ - SHRQ $32, R9; \ - MOVL R9, 5*4+off+0(dst); \ - MOVL R9, 15*4+off+64(dst); \ - MOVL R9, 9*4+off+128(dst); \ - MOVL R9, 1*4+off+192(dst); \ - MOVL R9, 11*4+off+256(dst); \ - MOVL R9, 7*4+off+320(dst); \ - MOVL R9, 13*4+off+384(dst); \ - MOVL R9, 3*4+off+448(dst); \ - MOVL R9, 6*4+off+512(dst); \ - MOVL R9, 10*4+off+576(dst); \ - \ - MOVL R10, 2*4+off+0(dst); \ - MOVL R10, 1*4+off+64(dst); \ - MOVL R10, 15*4+off+128(dst); \ - MOVL R10, 10*4+off+192(dst); \ - MOVL R10, 6*4+off+256(dst); \ - MOVL R10, 8*4+off+320(dst); \ - MOVL R10, 3*4+off+384(dst); \ - MOVL R10, 13*4+off+448(dst); \ - MOVL R10, 14*4+off+512(dst); \ - MOVL R10, 5*4+off+576(dst); \ - SHRQ $32, R10; \ - MOVL R10, 6*4+off+0(dst); \ - MOVL R10, 11*4+off+64(dst); \ - MOVL R10, 2*4+off+128(dst); \ - MOVL R10, 9*4+off+192(dst); \ - MOVL R10, 1*4+off+256(dst); \ - MOVL R10, 13*4+off+320(dst); \ - MOVL R10, 4*4+off+384(dst); \ - MOVL R10, 8*4+off+448(dst); \ - MOVL R10, 15*4+off+512(dst); \ - MOVL R10, 7*4+off+576(dst); \ - \ - MOVL R11, 3*4+off+0(dst); \ - MOVL R11, 7*4+off+64(dst); \ - MOVL R11, 13*4+off+128(dst); \ - MOVL R11, 12*4+off+192(dst); \ - MOVL R11, 10*4+off+256(dst); \ - MOVL R11, 1*4+off+320(dst); \ - MOVL R11, 9*4+off+384(dst); \ - MOVL R11, 14*4+off+448(dst); \ - MOVL R11, 0*4+off+512(dst); \ - MOVL R11, 6*4+off+576(dst); \ - SHRQ $32, R11; \ - MOVL R11, 7*4+off+0(dst); \ - MOVL R11, 14*4+off+64(dst); \ - MOVL R11, 10*4+off+128(dst); \ - MOVL R11, 0*4+off+192(dst); \ - MOVL R11, 5*4+off+256(dst); \ - MOVL R11, 9*4+off+320(dst); \ - MOVL R11, 12*4+off+384(dst); \ - MOVL R11, 1*4+off+448(dst); \ - MOVL R11, 13*4+off+512(dst); \ - MOVL R11, 2*4+off+576(dst); \ - \ - MOVL R12, 8*4+off+0(dst); \ - MOVL R12, 5*4+off+64(dst); \ - MOVL R12, 4*4+off+128(dst); \ - MOVL R12, 15*4+off+192(dst); \ - MOVL R12, 14*4+off+256(dst); \ - MOVL R12, 3*4+off+320(dst); \ - MOVL R12, 11*4+off+384(dst); \ - MOVL R12, 10*4+off+448(dst); \ - MOVL R12, 7*4+off+512(dst); \ - MOVL R12, 1*4+off+576(dst); \ - SHRQ $32, R12; \ - MOVL R12, 12*4+off+0(dst); \ - MOVL R12, 2*4+off+64(dst); \ - MOVL R12, 11*4+off+128(dst); \ - MOVL R12, 4*4+off+192(dst); \ - MOVL R12, 0*4+off+256(dst); \ - MOVL R12, 15*4+off+320(dst); \ - MOVL R12, 10*4+off+384(dst); \ - MOVL R12, 7*4+off+448(dst); \ - MOVL R12, 5*4+off+512(dst); \ - MOVL R12, 9*4+off+576(dst); \ - \ - MOVL R13, 9*4+off+0(dst); \ - MOVL R13, 4*4+off+64(dst); \ - MOVL R13, 8*4+off+128(dst); \ - MOVL R13, 13*4+off+192(dst); \ - MOVL R13, 3*4+off+256(dst); \ - MOVL R13, 5*4+off+320(dst); \ - MOVL R13, 7*4+off+384(dst); \ - MOVL R13, 15*4+off+448(dst); \ - MOVL R13, 11*4+off+512(dst); \ - MOVL R13, 0*4+off+576(dst); \ - SHRQ $32, R13; \ - MOVL R13, 13*4+off+0(dst); \ - MOVL R13, 10*4+off+64(dst); \ - MOVL R13, 0*4+off+128(dst); \ - MOVL R13, 3*4+off+192(dst); \ - MOVL R13, 9*4+off+256(dst); \ - MOVL R13, 6*4+off+320(dst); \ - MOVL R13, 15*4+off+384(dst); \ - MOVL R13, 4*4+off+448(dst); \ - MOVL R13, 2*4+off+512(dst); \ - MOVL R13, 12*4+off+576(dst); \ - \ - MOVL R14, 10*4+off+0(dst); \ - MOVL R14, 12*4+off+64(dst); \ - MOVL R14, 1*4+off+128(dst); \ - MOVL R14, 6*4+off+192(dst); \ - MOVL R14, 13*4+off+256(dst); \ - MOVL R14, 4*4+off+320(dst); \ - MOVL R14, 0*4+off+384(dst); \ - MOVL R14, 2*4+off+448(dst); \ - MOVL R14, 8*4+off+512(dst); \ - MOVL R14, 14*4+off+576(dst); \ - SHRQ $32, R14; \ - MOVL R14, 14*4+off+0(dst); \ - MOVL R14, 3*4+off+64(dst); \ - MOVL R14, 7*4+off+128(dst); \ - MOVL R14, 2*4+off+192(dst); \ - MOVL R14, 15*4+off+256(dst); \ - MOVL R14, 12*4+off+320(dst); \ - MOVL R14, 6*4+off+384(dst); \ - MOVL R14, 0*4+off+448(dst); \ - MOVL R14, 9*4+off+512(dst); \ - MOVL R14, 11*4+off+576(dst); \ - \ - MOVL R15, 11*4+off+0(dst); \ - MOVL R15, 0*4+off+64(dst); \ - MOVL R15, 12*4+off+128(dst); \ - MOVL R15, 7*4+off+192(dst); \ - MOVL R15, 8*4+off+256(dst); \ - MOVL R15, 14*4+off+320(dst); \ - MOVL R15, 2*4+off+384(dst); \ - MOVL R15, 5*4+off+448(dst); \ - MOVL R15, 1*4+off+512(dst); \ - MOVL R15, 13*4+off+576(dst); \ - SHRQ $32, R15; \ - MOVL R15, 15*4+off+0(dst); \ - MOVL R15, 6*4+off+64(dst); \ - MOVL R15, 3*4+off+128(dst); \ - MOVL R15, 11*4+off+192(dst); \ - MOVL R15, 7*4+off+256(dst); \ - MOVL R15, 10*4+off+320(dst); \ - MOVL R15, 5*4+off+384(dst); \ - MOVL R15, 9*4+off+448(dst); \ - MOVL R15, 4*4+off+512(dst); \ - MOVL R15, 8*4+off+576(dst) +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X12, X15 + PXOR X15, X7 + MOVQ (SI), R8 + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + MOVQ 32(SI), R12 + MOVQ 40(SI), R13 + MOVQ 48(SI), R14 + MOVQ 56(SI), R15 + MOVL R8, 16(BP) + MOVL R8, 116(BP) + MOVL R8, 164(BP) + MOVL R8, 264(BP) + MOVL R8, 288(BP) + MOVL R8, 344(BP) + MOVL R8, 432(BP) + MOVL R8, 512(BP) + MOVL R8, 540(BP) + MOVL R8, 652(BP) + SHRQ $0x20, R8 + MOVL R8, 32(BP) + MOVL R8, 112(BP) + MOVL R8, 200(BP) + MOVL R8, 228(BP) + MOVL R8, 320(BP) + MOVL R8, 380(BP) + MOVL R8, 404(BP) + MOVL R8, 488(BP) + MOVL R8, 568(BP) + MOVL R8, 604(BP) + MOVL R9, 20(BP) + MOVL R9, 132(BP) + MOVL R9, 168(BP) + MOVL R9, 240(BP) + MOVL R9, 280(BP) + MOVL R9, 336(BP) + MOVL R9, 456(BP) + MOVL R9, 508(BP) + MOVL R9, 576(BP) + MOVL R9, 608(BP) + SHRQ $0x20, R9 + MOVL R9, 36(BP) + MOVL R9, 140(BP) + MOVL R9, 180(BP) + MOVL R9, 212(BP) + MOVL R9, 316(BP) + MOVL R9, 364(BP) + MOVL R9, 452(BP) + MOVL R9, 476(BP) + MOVL R9, 552(BP) + MOVL R9, 632(BP) + MOVL R10, 24(BP) + MOVL R10, 84(BP) + MOVL R10, 204(BP) + MOVL R10, 248(BP) + MOVL R10, 296(BP) + MOVL R10, 368(BP) + MOVL R10, 412(BP) + MOVL R10, 516(BP) + MOVL R10, 584(BP) + MOVL R10, 612(BP) + SHRQ $0x20, R10 + MOVL R10, 40(BP) + MOVL R10, 124(BP) + MOVL R10, 152(BP) + MOVL R10, 244(BP) + MOVL R10, 276(BP) + MOVL R10, 388(BP) + MOVL R10, 416(BP) + MOVL R10, 496(BP) + MOVL R10, 588(BP) + MOVL R10, 620(BP) + MOVL R11, 28(BP) + MOVL R11, 108(BP) + MOVL R11, 196(BP) + MOVL R11, 256(BP) + MOVL R11, 312(BP) + MOVL R11, 340(BP) + MOVL R11, 436(BP) + MOVL R11, 520(BP) + MOVL R11, 528(BP) + MOVL R11, 616(BP) + SHRQ $0x20, R11 + MOVL R11, 44(BP) + MOVL R11, 136(BP) + MOVL R11, 184(BP) + MOVL R11, 208(BP) + MOVL R11, 292(BP) + MOVL R11, 372(BP) + MOVL R11, 448(BP) + MOVL R11, 468(BP) + MOVL R11, 580(BP) + MOVL R11, 600(BP) + MOVL R12, 48(BP) + MOVL R12, 100(BP) + MOVL R12, 160(BP) + MOVL R12, 268(BP) + MOVL R12, 328(BP) + MOVL R12, 348(BP) + MOVL R12, 444(BP) + MOVL R12, 504(BP) + MOVL R12, 556(BP) + MOVL R12, 596(BP) + SHRQ $0x20, R12 + MOVL R12, 64(BP) + MOVL R12, 88(BP) + MOVL R12, 188(BP) + MOVL R12, 224(BP) + MOVL R12, 272(BP) + MOVL R12, 396(BP) + MOVL R12, 440(BP) + MOVL R12, 492(BP) + MOVL R12, 548(BP) + MOVL R12, 628(BP) + MOVL R13, 52(BP) + MOVL R13, 96(BP) + MOVL R13, 176(BP) + MOVL R13, 260(BP) + MOVL R13, 284(BP) + MOVL R13, 356(BP) + MOVL R13, 428(BP) + MOVL R13, 524(BP) + MOVL R13, 572(BP) + MOVL R13, 592(BP) + SHRQ $0x20, R13 + MOVL R13, 68(BP) + MOVL R13, 120(BP) + MOVL R13, 144(BP) + MOVL R13, 220(BP) + MOVL R13, 308(BP) + MOVL R13, 360(BP) + MOVL R13, 460(BP) + MOVL R13, 480(BP) + MOVL R13, 536(BP) + MOVL R13, 640(BP) + MOVL R14, 56(BP) + MOVL R14, 128(BP) + MOVL R14, 148(BP) + MOVL R14, 232(BP) + MOVL R14, 324(BP) + MOVL R14, 352(BP) + MOVL R14, 400(BP) + MOVL R14, 472(BP) + MOVL R14, 560(BP) + MOVL R14, 648(BP) + SHRQ $0x20, R14 + MOVL R14, 72(BP) + MOVL R14, 92(BP) + MOVL R14, 172(BP) + MOVL R14, 216(BP) + MOVL R14, 332(BP) + MOVL R14, 384(BP) + MOVL R14, 424(BP) + MOVL R14, 464(BP) + MOVL R14, 564(BP) + MOVL R14, 636(BP) + MOVL R15, 60(BP) + MOVL R15, 80(BP) + MOVL R15, 192(BP) + MOVL R15, 236(BP) + MOVL R15, 304(BP) + MOVL R15, 392(BP) + MOVL R15, 408(BP) + MOVL R15, 484(BP) + MOVL R15, 532(BP) + MOVL R15, 644(BP) + SHRQ $0x20, R15 + MOVL R15, 76(BP) + MOVL R15, 104(BP) + MOVL R15, 156(BP) + MOVL R15, 252(BP) + MOVL R15, 300(BP) + MOVL R15, 376(BP) + MOVL R15, 420(BP) + MOVL R15, 500(BP) + MOVL R15, 544(BP) + MOVL R15, 624(BP) + PADDL 16(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 32(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 48(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 64(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 80(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 96(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 112(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 128(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 144(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 160(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 176(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 192(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 208(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 224(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 240(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 256(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 272(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 288(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 304(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 320(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 336(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 352(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 368(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 384(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 400(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 416(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 432(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 448(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 464(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 480(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 496(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 512(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 528(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 544(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 560(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 576(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 592(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 608(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 624(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 640(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + LEAQ 64(SI), SI + SUBQ $0x40, DX + JNE loop + MOVO X15, (BP) + MOVQ (BP), R9 + MOVQ R9, (BX) + MOVOU X0, (AX) + MOVOU X1, 16(AX) + RET -#define BLAKE2s_SSE2() \ - PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \ - ROUND_SSE2(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8) +DATA iv0<>+0(SB)/4, $0x6a09e667 +DATA iv0<>+4(SB)/4, $0xbb67ae85 +DATA iv0<>+8(SB)/4, $0x3c6ef372 +DATA iv0<>+12(SB)/4, $0xa54ff53a +GLOBL iv0<>(SB), RODATA|NOPTR, $16 -#define BLAKE2s_SSSE3() \ - PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \ - ROUND_SSSE3(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8, X13, X14) +DATA iv1<>+0(SB)/4, $0x510e527f +DATA iv1<>+4(SB)/4, $0x9b05688c +DATA iv1<>+8(SB)/4, $0x1f83d9ab +DATA iv1<>+12(SB)/4, $0x5be0cd19 +GLOBL iv1<>(SB), RODATA|NOPTR, $16 -#define BLAKE2s_SSE4() \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) +DATA counter<>+0(SB)/8, $0x0000000000000040 +DATA counter<>+8(SB)/8, $0x0000000000000000 +GLOBL counter<>(SB), RODATA|NOPTR, $16 -#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_FUNC) \ - MOVQ h, AX; \ - MOVQ c, BX; \ - MOVL flag, CX; \ - MOVQ blocks_base, SI; \ - MOVQ blocks_len, DX; \ - \ - MOVQ SP, BP; \ - ADDQ $15, BP; \ - ANDQ $~15, BP; \ - \ - MOVQ 0(BX), R9; \ - MOVQ R9, 0(BP); \ - MOVQ CX, 8(BP); \ - \ - MOVOU 0(AX), X0; \ - MOVOU 16(AX), X1; \ - MOVOU iv0<>(SB), X2; \ - MOVOU iv1<>(SB), X3 \ - \ - MOVOU counter<>(SB), X12; \ - MOVOU rol16<>(SB), X13; \ - MOVOU rol8<>(SB), X14; \ - MOVO 0(BP), X15; \ - \ - loop: \ - MOVO X0, X4; \ - MOVO X1, X5; \ - MOVO X2, X6; \ - MOVO X3, X7; \ - \ - PADDQ X12, X15; \ - PXOR X15, X7; \ - \ - BLAKE2s_FUNC(); \ - \ - PXOR X4, X0; \ - PXOR X5, X1; \ - PXOR X6, X0; \ - PXOR X7, X1; \ - \ - LEAQ 64(SI), SI; \ - SUBQ $64, DX; \ - JNE loop; \ - \ - MOVO X15, 0(BP); \ - MOVQ 0(BP), R9; \ - MOVQ R9, 0(BX); \ - \ - MOVOU X0, 0(AX); \ - MOVOU X1, 16(AX) +DATA rol16<>+0(SB)/8, $0x0504070601000302 +DATA rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +GLOBL rol16<>(SB), RODATA|NOPTR, $16 -// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) -TEXT ·hashBlocksSSE2(SB), 0, $672-48 // frame = 656 + 16 byte alignment - HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE2) - RET +DATA rol8<>+0(SB)/8, $0x0407060500030201 +DATA rol8<>+8(SB)/8, $0x0c0f0e0d080b0a09 +GLOBL rol8<>(SB), RODATA|NOPTR, $16 // func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) -TEXT ·hashBlocksSSSE3(SB), 0, $672-48 // frame = 656 + 16 byte alignment - HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSSE3) +// Requires: SSE2, SSSE3 +TEXT ·hashBlocksSSSE3(SB), $672-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVL flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DX + MOVQ SP, BP + ADDQ $0x0f, BP + ANDQ $-16, BP + MOVQ (BX), R9 + MOVQ R9, (BP) + MOVQ CX, 8(BP) + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU iv0<>+0(SB), X2 + MOVOU iv1<>+0(SB), X3 + MOVOU counter<>+0(SB), X12 + MOVOU rol16<>+0(SB), X13 + MOVOU rol8<>+0(SB), X14 + MOVO (BP), X15 + +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X12, X15 + PXOR X15, X7 + MOVQ (SI), R8 + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + MOVQ 32(SI), R12 + MOVQ 40(SI), R13 + MOVQ 48(SI), R14 + MOVQ 56(SI), R15 + MOVL R8, 16(BP) + MOVL R8, 116(BP) + MOVL R8, 164(BP) + MOVL R8, 264(BP) + MOVL R8, 288(BP) + MOVL R8, 344(BP) + MOVL R8, 432(BP) + MOVL R8, 512(BP) + MOVL R8, 540(BP) + MOVL R8, 652(BP) + SHRQ $0x20, R8 + MOVL R8, 32(BP) + MOVL R8, 112(BP) + MOVL R8, 200(BP) + MOVL R8, 228(BP) + MOVL R8, 320(BP) + MOVL R8, 380(BP) + MOVL R8, 404(BP) + MOVL R8, 488(BP) + MOVL R8, 568(BP) + MOVL R8, 604(BP) + MOVL R9, 20(BP) + MOVL R9, 132(BP) + MOVL R9, 168(BP) + MOVL R9, 240(BP) + MOVL R9, 280(BP) + MOVL R9, 336(BP) + MOVL R9, 456(BP) + MOVL R9, 508(BP) + MOVL R9, 576(BP) + MOVL R9, 608(BP) + SHRQ $0x20, R9 + MOVL R9, 36(BP) + MOVL R9, 140(BP) + MOVL R9, 180(BP) + MOVL R9, 212(BP) + MOVL R9, 316(BP) + MOVL R9, 364(BP) + MOVL R9, 452(BP) + MOVL R9, 476(BP) + MOVL R9, 552(BP) + MOVL R9, 632(BP) + MOVL R10, 24(BP) + MOVL R10, 84(BP) + MOVL R10, 204(BP) + MOVL R10, 248(BP) + MOVL R10, 296(BP) + MOVL R10, 368(BP) + MOVL R10, 412(BP) + MOVL R10, 516(BP) + MOVL R10, 584(BP) + MOVL R10, 612(BP) + SHRQ $0x20, R10 + MOVL R10, 40(BP) + MOVL R10, 124(BP) + MOVL R10, 152(BP) + MOVL R10, 244(BP) + MOVL R10, 276(BP) + MOVL R10, 388(BP) + MOVL R10, 416(BP) + MOVL R10, 496(BP) + MOVL R10, 588(BP) + MOVL R10, 620(BP) + MOVL R11, 28(BP) + MOVL R11, 108(BP) + MOVL R11, 196(BP) + MOVL R11, 256(BP) + MOVL R11, 312(BP) + MOVL R11, 340(BP) + MOVL R11, 436(BP) + MOVL R11, 520(BP) + MOVL R11, 528(BP) + MOVL R11, 616(BP) + SHRQ $0x20, R11 + MOVL R11, 44(BP) + MOVL R11, 136(BP) + MOVL R11, 184(BP) + MOVL R11, 208(BP) + MOVL R11, 292(BP) + MOVL R11, 372(BP) + MOVL R11, 448(BP) + MOVL R11, 468(BP) + MOVL R11, 580(BP) + MOVL R11, 600(BP) + MOVL R12, 48(BP) + MOVL R12, 100(BP) + MOVL R12, 160(BP) + MOVL R12, 268(BP) + MOVL R12, 328(BP) + MOVL R12, 348(BP) + MOVL R12, 444(BP) + MOVL R12, 504(BP) + MOVL R12, 556(BP) + MOVL R12, 596(BP) + SHRQ $0x20, R12 + MOVL R12, 64(BP) + MOVL R12, 88(BP) + MOVL R12, 188(BP) + MOVL R12, 224(BP) + MOVL R12, 272(BP) + MOVL R12, 396(BP) + MOVL R12, 440(BP) + MOVL R12, 492(BP) + MOVL R12, 548(BP) + MOVL R12, 628(BP) + MOVL R13, 52(BP) + MOVL R13, 96(BP) + MOVL R13, 176(BP) + MOVL R13, 260(BP) + MOVL R13, 284(BP) + MOVL R13, 356(BP) + MOVL R13, 428(BP) + MOVL R13, 524(BP) + MOVL R13, 572(BP) + MOVL R13, 592(BP) + SHRQ $0x20, R13 + MOVL R13, 68(BP) + MOVL R13, 120(BP) + MOVL R13, 144(BP) + MOVL R13, 220(BP) + MOVL R13, 308(BP) + MOVL R13, 360(BP) + MOVL R13, 460(BP) + MOVL R13, 480(BP) + MOVL R13, 536(BP) + MOVL R13, 640(BP) + MOVL R14, 56(BP) + MOVL R14, 128(BP) + MOVL R14, 148(BP) + MOVL R14, 232(BP) + MOVL R14, 324(BP) + MOVL R14, 352(BP) + MOVL R14, 400(BP) + MOVL R14, 472(BP) + MOVL R14, 560(BP) + MOVL R14, 648(BP) + SHRQ $0x20, R14 + MOVL R14, 72(BP) + MOVL R14, 92(BP) + MOVL R14, 172(BP) + MOVL R14, 216(BP) + MOVL R14, 332(BP) + MOVL R14, 384(BP) + MOVL R14, 424(BP) + MOVL R14, 464(BP) + MOVL R14, 564(BP) + MOVL R14, 636(BP) + MOVL R15, 60(BP) + MOVL R15, 80(BP) + MOVL R15, 192(BP) + MOVL R15, 236(BP) + MOVL R15, 304(BP) + MOVL R15, 392(BP) + MOVL R15, 408(BP) + MOVL R15, 484(BP) + MOVL R15, 532(BP) + MOVL R15, 644(BP) + SHRQ $0x20, R15 + MOVL R15, 76(BP) + MOVL R15, 104(BP) + MOVL R15, 156(BP) + MOVL R15, 252(BP) + MOVL R15, 300(BP) + MOVL R15, 376(BP) + MOVL R15, 420(BP) + MOVL R15, 500(BP) + MOVL R15, 544(BP) + MOVL R15, 624(BP) + PADDL 16(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 32(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 48(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 64(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 80(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 96(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 112(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 128(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 144(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 160(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 176(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 192(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 208(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 224(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 240(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 256(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 272(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 288(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 304(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 320(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 336(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 352(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 368(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 384(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 400(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 416(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 432(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 448(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 464(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 480(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 496(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 512(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 528(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 544(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 560(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 576(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 592(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 608(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 624(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 640(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + LEAQ 64(SI), SI + SUBQ $0x40, DX + JNE loop + MOVO X15, (BP) + MOVQ (BP), R9 + MOVQ R9, (BX) + MOVOU X0, (AX) + MOVOU X1, 16(AX) RET // func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) -TEXT ·hashBlocksSSE4(SB), 0, $32-48 // frame = 16 + 16 byte alignment - HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE4) +// Requires: SSE2, SSE4.1, SSSE3 +TEXT ·hashBlocksSSE4(SB), $32-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVL flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DX + MOVQ SP, BP + ADDQ $0x0f, BP + ANDQ $-16, BP + MOVQ (BX), R9 + MOVQ R9, (BP) + MOVQ CX, 8(BP) + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU iv0<>+0(SB), X2 + MOVOU iv1<>+0(SB), X3 + MOVOU counter<>+0(SB), X12 + MOVOU rol16<>+0(SB), X13 + MOVOU rol8<>+0(SB), X14 + MOVO (BP), X15 + +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X12, X15 + PXOR X15, X7 + MOVL (SI), X8 + PINSRD $0x01, 8(SI), X8 + PINSRD $0x02, 16(SI), X8 + PINSRD $0x03, 24(SI), X8 + MOVL 4(SI), X9 + PINSRD $0x01, 12(SI), X9 + PINSRD $0x02, 20(SI), X9 + PINSRD $0x03, 28(SI), X9 + MOVL 32(SI), X10 + PINSRD $0x01, 40(SI), X10 + PINSRD $0x02, 48(SI), X10 + PINSRD $0x03, 56(SI), X10 + MOVL 36(SI), X11 + PINSRD $0x01, 44(SI), X11 + PINSRD $0x02, 52(SI), X11 + PINSRD $0x03, 60(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 56(SI), X8 + PINSRD $0x01, 16(SI), X8 + PINSRD $0x02, 36(SI), X8 + PINSRD $0x03, 52(SI), X8 + MOVL 40(SI), X9 + PINSRD $0x01, 32(SI), X9 + PINSRD $0x02, 60(SI), X9 + PINSRD $0x03, 24(SI), X9 + MOVL 4(SI), X10 + PINSRD $0x01, (SI), X10 + PINSRD $0x02, 44(SI), X10 + PINSRD $0x03, 20(SI), X10 + MOVL 48(SI), X11 + PINSRD $0x01, 8(SI), X11 + PINSRD $0x02, 28(SI), X11 + PINSRD $0x03, 12(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 44(SI), X8 + PINSRD $0x01, 48(SI), X8 + PINSRD $0x02, 20(SI), X8 + PINSRD $0x03, 60(SI), X8 + MOVL 32(SI), X9 + PINSRD $0x01, (SI), X9 + PINSRD $0x02, 8(SI), X9 + PINSRD $0x03, 52(SI), X9 + MOVL 40(SI), X10 + PINSRD $0x01, 12(SI), X10 + PINSRD $0x02, 28(SI), X10 + PINSRD $0x03, 36(SI), X10 + MOVL 56(SI), X11 + PINSRD $0x01, 24(SI), X11 + PINSRD $0x02, 4(SI), X11 + PINSRD $0x03, 16(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 28(SI), X8 + PINSRD $0x01, 12(SI), X8 + PINSRD $0x02, 52(SI), X8 + PINSRD $0x03, 44(SI), X8 + MOVL 36(SI), X9 + PINSRD $0x01, 4(SI), X9 + PINSRD $0x02, 48(SI), X9 + PINSRD $0x03, 56(SI), X9 + MOVL 8(SI), X10 + PINSRD $0x01, 20(SI), X10 + PINSRD $0x02, 16(SI), X10 + PINSRD $0x03, 60(SI), X10 + MOVL 24(SI), X11 + PINSRD $0x01, 40(SI), X11 + PINSRD $0x02, (SI), X11 + PINSRD $0x03, 32(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 36(SI), X8 + PINSRD $0x01, 20(SI), X8 + PINSRD $0x02, 8(SI), X8 + PINSRD $0x03, 40(SI), X8 + MOVL (SI), X9 + PINSRD $0x01, 28(SI), X9 + PINSRD $0x02, 16(SI), X9 + PINSRD $0x03, 60(SI), X9 + MOVL 56(SI), X10 + PINSRD $0x01, 44(SI), X10 + PINSRD $0x02, 24(SI), X10 + PINSRD $0x03, 12(SI), X10 + MOVL 4(SI), X11 + PINSRD $0x01, 48(SI), X11 + PINSRD $0x02, 32(SI), X11 + PINSRD $0x03, 52(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 8(SI), X8 + PINSRD $0x01, 24(SI), X8 + PINSRD $0x02, (SI), X8 + PINSRD $0x03, 32(SI), X8 + MOVL 48(SI), X9 + PINSRD $0x01, 40(SI), X9 + PINSRD $0x02, 44(SI), X9 + PINSRD $0x03, 12(SI), X9 + MOVL 16(SI), X10 + PINSRD $0x01, 28(SI), X10 + PINSRD $0x02, 60(SI), X10 + PINSRD $0x03, 4(SI), X10 + MOVL 52(SI), X11 + PINSRD $0x01, 20(SI), X11 + PINSRD $0x02, 56(SI), X11 + PINSRD $0x03, 36(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 48(SI), X8 + PINSRD $0x01, 4(SI), X8 + PINSRD $0x02, 56(SI), X8 + PINSRD $0x03, 16(SI), X8 + MOVL 20(SI), X9 + PINSRD $0x01, 60(SI), X9 + PINSRD $0x02, 52(SI), X9 + PINSRD $0x03, 40(SI), X9 + MOVL (SI), X10 + PINSRD $0x01, 24(SI), X10 + PINSRD $0x02, 36(SI), X10 + PINSRD $0x03, 32(SI), X10 + MOVL 28(SI), X11 + PINSRD $0x01, 12(SI), X11 + PINSRD $0x02, 8(SI), X11 + PINSRD $0x03, 44(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 52(SI), X8 + PINSRD $0x01, 28(SI), X8 + PINSRD $0x02, 48(SI), X8 + PINSRD $0x03, 12(SI), X8 + MOVL 44(SI), X9 + PINSRD $0x01, 56(SI), X9 + PINSRD $0x02, 4(SI), X9 + PINSRD $0x03, 36(SI), X9 + MOVL 20(SI), X10 + PINSRD $0x01, 60(SI), X10 + PINSRD $0x02, 32(SI), X10 + PINSRD $0x03, 8(SI), X10 + MOVL (SI), X11 + PINSRD $0x01, 16(SI), X11 + PINSRD $0x02, 24(SI), X11 + PINSRD $0x03, 40(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 24(SI), X8 + PINSRD $0x01, 56(SI), X8 + PINSRD $0x02, 44(SI), X8 + PINSRD $0x03, (SI), X8 + MOVL 60(SI), X9 + PINSRD $0x01, 36(SI), X9 + PINSRD $0x02, 12(SI), X9 + PINSRD $0x03, 32(SI), X9 + MOVL 48(SI), X10 + PINSRD $0x01, 52(SI), X10 + PINSRD $0x02, 4(SI), X10 + PINSRD $0x03, 40(SI), X10 + MOVL 8(SI), X11 + PINSRD $0x01, 28(SI), X11 + PINSRD $0x02, 16(SI), X11 + PINSRD $0x03, 20(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 40(SI), X8 + PINSRD $0x01, 32(SI), X8 + PINSRD $0x02, 28(SI), X8 + PINSRD $0x03, 4(SI), X8 + MOVL 8(SI), X9 + PINSRD $0x01, 16(SI), X9 + PINSRD $0x02, 24(SI), X9 + PINSRD $0x03, 20(SI), X9 + MOVL 60(SI), X10 + PINSRD $0x01, 36(SI), X10 + PINSRD $0x02, 12(SI), X10 + PINSRD $0x03, 52(SI), X10 + MOVL 44(SI), X11 + PINSRD $0x01, 56(SI), X11 + PINSRD $0x02, 48(SI), X11 + PINSRD $0x03, (SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + LEAQ 64(SI), SI + SUBQ $0x40, DX + JNE loop + MOVO X15, (BP) + MOVQ (BP), R9 + MOVQ R9, (BX) + MOVOU X0, (AX) + MOVOU X1, 16(AX) RET From 0484c26df710dd85a038090191bda8c385afdd49 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Wed, 24 Jul 2024 23:04:06 -0400 Subject: [PATCH 72/95] blake2b: port blake2bAVX2_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="blake2b/blake2bAVX2_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: Ia2af1b82c871e26b89bd9a2d9fb187cc49e18341 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600836 Reviewed-by: Filippo Valsorda Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI --- blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go | 1228 +++++ blake2b/_asm/AVX2/go.mod | 16 + blake2b/_asm/AVX2/go.sum | 12 + blake2b/blake2bAVX2_amd64.s | 5167 +++++++++++++++++--- 4 files changed, 5747 insertions(+), 676 deletions(-) create mode 100644 blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go create mode 100644 blake2b/_asm/AVX2/go.mod create mode 100644 blake2b/_asm/AVX2/go.sum diff --git a/blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go b/blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go new file mode 100644 index 0000000000..c297c0ca63 --- /dev/null +++ b/blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go @@ -0,0 +1,1228 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/blake2b" +) + +//go:generate go run . -out ../../blake2bAVX2_amd64.s -pkg blake2b + +const ThatPeskyUnicodeDot = "\u00b7" + +func main() { + Package("golang.org/x/crypto/blake2b") + ConstraintExpr("amd64,gc,!purego") + hashBlocksAVX2() + hashBlocksAVX() + Generate() +} + +// Utility function to emit a BYTE instruction +func BYTE(imm Op) { + Instruction(&ir.Instruction{Opcode: "BYTE", Operands: []Op{imm}}) +} + +func VPERMQ_0x39_Y1_Y1() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xc9)) + BYTE(U8(0x39)) +} + +func VPERMQ_0x93_Y1_Y1() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xc9)) + BYTE(U8(0x93)) +} + +func VPERMQ_0x4E_Y2_Y2() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xd2)) + BYTE(U8(0x4e)) +} + +func VPERMQ_0x93_Y3_Y3() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xdb)) + BYTE(U8(0x93)) +} + +func VPERMQ_0x39_Y3_Y3() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xdb)) + BYTE(U8(0x39)) +} + +func ROUND_AVX2(m0, m1, m2, m3 Op, t, c40, c48 VecPhysical) { + VPADDQ(m0, Y0, Y0) + VPADDQ(Y1, Y0, Y0) + VPXOR(Y0, Y3, Y3) + VPSHUFD(I8(-79), Y3, Y3) + VPADDQ(Y3, Y2, Y2) + VPXOR(Y2, Y1, Y1) + VPSHUFB(c40, Y1, Y1) + VPADDQ(m1, Y0, Y0) + VPADDQ(Y1, Y0, Y0) + VPXOR(Y0, Y3, Y3) + VPSHUFB(c48, Y3, Y3) + VPADDQ(Y3, Y2, Y2) + VPXOR(Y2, Y1, Y1) + VPADDQ(Y1, Y1, t) + VPSRLQ(Imm(63), Y1, Y1) + VPXOR(t, Y1, Y1) + VPERMQ_0x39_Y1_Y1() + VPERMQ_0x4E_Y2_Y2() + VPERMQ_0x93_Y3_Y3() + VPADDQ(m2, Y0, Y0) + VPADDQ(Y1, Y0, Y0) + VPXOR(Y0, Y3, Y3) + VPSHUFD(I8(-79), Y3, Y3) + VPADDQ(Y3, Y2, Y2) + VPXOR(Y2, Y1, Y1) + VPSHUFB(c40, Y1, Y1) + VPADDQ(m3, Y0, Y0) + VPADDQ(Y1, Y0, Y0) + VPXOR(Y0, Y3, Y3) + VPSHUFB(c48, Y3, Y3) + VPADDQ(Y3, Y2, Y2) + VPXOR(Y2, Y1, Y1) + VPADDQ(Y1, Y1, t) + VPSRLQ(Imm(63), Y1, Y1) + VPXOR(t, Y1, Y1) + VPERMQ_0x39_Y3_Y3() + VPERMQ_0x4E_Y2_Y2() + VPERMQ_0x93_Y1_Y1() +} + +func VMOVQ_SI_X11_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x1E)) +} + +func VMOVQ_SI_X12_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x26)) +} + +func VMOVQ_SI_X13_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x2E)) +} + +func VMOVQ_SI_X14_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x36)) +} + +func VMOVQ_SI_X15_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x3E)) +} + +func VMOVQ_SI_X11(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x5E)) + BYTE(U8(n)) +} + +func VMOVQ_SI_X12(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x66)) + BYTE(U8(n)) +} + +func VMOVQ_SI_X13(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x6E)) + BYTE(U8(n)) +} + +func VMOVQ_SI_X14(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x76)) + BYTE(U8(n)) +} + +func VMOVQ_SI_X15(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x7E)) + BYTE(U8(n)) +} + +func VPINSRQ_1_SI_X11_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0xA1)) + BYTE(U8(0x22)) + BYTE(U8(0x1E)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X12_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x99)) + BYTE(U8(0x22)) + BYTE(U8(0x26)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X13_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x91)) + BYTE(U8(0x22)) + BYTE(U8(0x2E)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X14_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x89)) + BYTE(U8(0x22)) + BYTE(U8(0x36)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X15_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x81)) + BYTE(U8(0x22)) + BYTE(U8(0x3E)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X11(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0xA1)) + BYTE(U8(0x22)) + BYTE(U8(0x5E)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X12(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x99)) + BYTE(U8(0x22)) + BYTE(U8(0x66)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X13(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x91)) + BYTE(U8(0x22)) + BYTE(U8(0x6E)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X14(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x89)) + BYTE(U8(0x22)) + BYTE(U8(0x76)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X15(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x81)) + BYTE(U8(0x22)) + BYTE(U8(0x7E)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VMOVQ_R8_X15() { + BYTE(U8(0xC4)) + BYTE(U8(0x41)) + BYTE(U8(0xF9)) + BYTE(U8(0x6E)) + BYTE(U8(0xF8)) +} + +func VPINSRQ_1_R9_X15() { + BYTE(U8(0xC4)) + BYTE(U8(0x43)) + BYTE(U8(0x81)) + BYTE(U8(0x22)) + BYTE(U8(0xF9)) + BYTE(U8(0x01)) +} + +// load msg: +// +// Y12 = (i0, i1, i2, i3) +// +// i0, i1, i2, i3 must not be 0 +func LOAD_MSG_AVX2_Y12(i0, i1, i2, i3 uint8) { + VMOVQ_SI_X12(i0 * 8) + VMOVQ_SI_X11(i2 * 8) + VPINSRQ_1_SI_X12(i1 * 8) + VPINSRQ_1_SI_X11(i3 * 8) + VINSERTI128(Imm(1), X11, Y12, Y12) +} + +// load msg: +// +// Y13 = (i0, i1, i2, i3) +// +// i0, i1, i2, i3 must not be 0 +func LOAD_MSG_AVX2_Y13(i0, i1, i2, i3 uint8) { + VMOVQ_SI_X13(i0 * 8) + VMOVQ_SI_X11(i2 * 8) + VPINSRQ_1_SI_X13(i1 * 8) + VPINSRQ_1_SI_X11(i3 * 8) + VINSERTI128(Imm(1), X11, Y13, Y13) +} + +// load msg: +// +// Y14 = (i0, i1, i2, i3) +// +// i0, i1, i2, i3 must not be 0 +func LOAD_MSG_AVX2_Y14(i0, i1, i2, i3 uint8) { + VMOVQ_SI_X14(i0 * 8) + VMOVQ_SI_X11(i2 * 8) + VPINSRQ_1_SI_X14(i1 * 8) + VPINSRQ_1_SI_X11(i3 * 8) + VINSERTI128(Imm(1), X11, Y14, Y14) +} + +// load msg: +// +// Y15 = (i0, i1, i2, i3) +// +// i0, i1, i2, i3 must not be 0 +func LOAD_MSG_AVX2_Y15(i0, i1, i2, i3 uint8) { + VMOVQ_SI_X15(i0 * 8) + VMOVQ_SI_X11(i2 * 8) + VPINSRQ_1_SI_X15(i1 * 8) + VPINSRQ_1_SI_X11(i3 * 8) + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() { + VMOVQ_SI_X12_0() + VMOVQ_SI_X11(4 * 8) + VPINSRQ_1_SI_X12(2 * 8) + VPINSRQ_1_SI_X11(6 * 8) + VINSERTI128(Imm(1), X11, Y12, Y12) + LOAD_MSG_AVX2_Y13(1, 3, 5, 7) + LOAD_MSG_AVX2_Y14(8, 10, 12, 14) + LOAD_MSG_AVX2_Y15(9, 11, 13, 15) +} + +func LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() { + LOAD_MSG_AVX2_Y12(14, 4, 9, 13) + LOAD_MSG_AVX2_Y13(10, 8, 15, 6) + VMOVQ_SI_X11(11 * 8) + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(0*8), X14) + VPINSRQ_1_SI_X11(5 * 8) + VINSERTI128(Imm(1), X11, Y14, Y14) + LOAD_MSG_AVX2_Y15(12, 2, 7, 3) +} + +func LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() { + VMOVQ_SI_X11(5 * 8) + VMOVDQU(Mem{Base: SI}.Offset(11*8), X12) + VPINSRQ_1_SI_X11(15 * 8) + VINSERTI128(Imm(1), X11, Y12, Y12) + VMOVQ_SI_X13(8 * 8) + VMOVQ_SI_X11(2 * 8) + VPINSRQ_1_SI_X13_0() + VPINSRQ_1_SI_X11(13 * 8) + VINSERTI128(Imm(1), X11, Y13, Y13) + LOAD_MSG_AVX2_Y14(10, 3, 7, 9) + LOAD_MSG_AVX2_Y15(14, 6, 1, 4) +} + +func LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() { + LOAD_MSG_AVX2_Y12(7, 3, 13, 11) + LOAD_MSG_AVX2_Y13(9, 1, 12, 14) + LOAD_MSG_AVX2_Y14(2, 5, 4, 15) + VMOVQ_SI_X15(6 * 8) + VMOVQ_SI_X11_0() + VPINSRQ_1_SI_X15(10 * 8) + VPINSRQ_1_SI_X11(8 * 8) + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() { + LOAD_MSG_AVX2_Y12(9, 5, 2, 10) + VMOVQ_SI_X13_0() + VMOVQ_SI_X11(4 * 8) + VPINSRQ_1_SI_X13(7 * 8) + VPINSRQ_1_SI_X11(15 * 8) + VINSERTI128(Imm(1), X11, Y13, Y13) + LOAD_MSG_AVX2_Y14(14, 11, 6, 3) + LOAD_MSG_AVX2_Y15(1, 12, 8, 13) +} + +func LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() { + VMOVQ_SI_X12(2 * 8) + VMOVQ_SI_X11_0() + VPINSRQ_1_SI_X12(6 * 8) + VPINSRQ_1_SI_X11(8 * 8) + VINSERTI128(Imm(1), X11, Y12, Y12) + LOAD_MSG_AVX2_Y13(12, 10, 11, 3) + LOAD_MSG_AVX2_Y14(4, 7, 15, 1) + LOAD_MSG_AVX2_Y15(13, 5, 14, 9) +} + +func LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() { + LOAD_MSG_AVX2_Y12(12, 1, 14, 4) + LOAD_MSG_AVX2_Y13(5, 15, 13, 10) + VMOVQ_SI_X14_0() + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(8*8), X11) + VPINSRQ_1_SI_X14(6 * 8) + VINSERTI128(Imm(1), X11, Y14, Y14) + LOAD_MSG_AVX2_Y15(7, 3, 2, 11) +} + +func LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() { + LOAD_MSG_AVX2_Y12(13, 7, 12, 3) + LOAD_MSG_AVX2_Y13(11, 14, 1, 9) + LOAD_MSG_AVX2_Y14(5, 15, 8, 2) + VMOVQ_SI_X15_0() + VMOVQ_SI_X11(6 * 8) + VPINSRQ_1_SI_X15(4 * 8) + VPINSRQ_1_SI_X11(10 * 8) + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() { + VMOVQ_SI_X12(6 * 8) + VMOVQ_SI_X11(11 * 8) + VPINSRQ_1_SI_X12(14 * 8) + VPINSRQ_1_SI_X11_0() + VINSERTI128(Imm(1), X11, Y12, Y12) + LOAD_MSG_AVX2_Y13(15, 9, 3, 8) + VMOVQ_SI_X11(1 * 8) + VMOVDQU(Mem{Base: SI}.Offset(12*8), X14) + VPINSRQ_1_SI_X11(10 * 8) + VINSERTI128(Imm(1), X11, Y14, Y14) + VMOVQ_SI_X15(2 * 8) + VMOVDQU(Mem{Base: SI}.Offset(4*8), X11) + VPINSRQ_1_SI_X15(7 * 8) + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() { + LOAD_MSG_AVX2_Y12(10, 8, 7, 1) + VMOVQ_SI_X13(2 * 8) + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(5*8), X11) + VPINSRQ_1_SI_X13(4 * 8) + VINSERTI128(Imm(1), X11, Y13, Y13) + LOAD_MSG_AVX2_Y14(15, 9, 3, 13) + VMOVQ_SI_X15(11 * 8) + VMOVQ_SI_X11(12 * 8) + VPINSRQ_1_SI_X15(14 * 8) + VPINSRQ_1_SI_X11_0() + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func hashBlocksAVX2() { + Implement("hashBlocksAVX2") + Attributes(4) + AllocLocal(320) // frame size = 288 + 32 byte alignment + + Load(Param("h"), RAX) + Load(Param("c"), RBX) + Load(Param("flag"), RCX) + Load(Param("blocks").Base(), RSI) + Load(Param("blocks").Len(), RDI) + + MOVQ(RSP, RDX) + ADDQ(I32(31), RDX) + ANDQ(I32(^31), RDX) + + MOVQ(RCX, Mem{Base: DX}.Offset(16)) + XORQ(RCX, RCX) + MOVQ(RCX, Mem{Base: DX}.Offset(24)) + + AVX2_c40 := AVX2_c40_DATA() + AVX2_c48 := AVX2_c48_DATA() + VMOVDQU(AVX2_c40, Y4) + VMOVDQU(AVX2_c48, Y5) + + VMOVDQU(Mem{Base: AX}.Offset(0), Y8) + VMOVDQU(Mem{Base: AX}.Offset(32), Y9) + AVX2_iv0 := AVX2_iv0_DATA() + AVX2_iv1 := AVX2_iv1_DATA() + VMOVDQU(AVX2_iv0, Y6) + VMOVDQU(AVX2_iv1, Y7) + + MOVQ(Mem{Base: BX}.Offset(0), R8) + MOVQ(Mem{Base: BX}.Offset(8), R9) + MOVQ(R9, Mem{Base: DX}.Offset(8)) + + loop_AVX2() + noinc_AVX2() +} + +func loop_AVX2() { + Label("loop") + ADDQ(Imm(128), R8) + MOVQ(R8, Mem{Base: DX}.Offset(0)) + CMPQ(R8, Imm(128)) + JGE(LabelRef("noinc")) + INCQ(R9) + MOVQ(R9, Mem{Base: DX}.Offset(8)) +} + +// line 312 +func noinc_AVX2() { + Label("noinc") + VMOVDQA(Y8, Y0) + VMOVDQA(Y9, Y1) + VMOVDQA(Y6, Y2) + VPXOR(Mem{Base: DX}.Offset(0), Y7, Y3) + + LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() + VMOVDQA(Y12, Mem{Base: DX}.Offset(32)) + VMOVDQA(Y13, Mem{Base: DX}.Offset(64)) + VMOVDQA(Y14, Mem{Base: DX}.Offset(96)) + VMOVDQA(Y15, Mem{Base: DX}.Offset(128)) + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() + VMOVDQA(Y12, Mem{Base: DX}.Offset(160)) + VMOVDQA(Y13, Mem{Base: DX}.Offset(192)) + VMOVDQA(Y14, Mem{Base: DX}.Offset(224)) + VMOVDQA(Y15, Mem{Base: DX}.Offset(256)) + + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + + ROUND_AVX2(Mem{Base: DX}.Offset(32), Mem{Base: DX}.Offset(64), Mem{Base: DX}.Offset(96), Mem{Base: DX}.Offset(128), Y10, Y4, Y5) + ROUND_AVX2(Mem{Base: DX}.Offset(160), Mem{Base: DX}.Offset(192), Mem{Base: DX}.Offset(224), Mem{Base: DX}.Offset(256), Y10, Y4, Y5) + + VPXOR(Y0, Y8, Y8) + VPXOR(Y1, Y9, Y9) + VPXOR(Y2, Y8, Y8) + VPXOR(Y3, Y9, Y9) + + LEAQ(Mem{Base: SI}.Offset(128), RSI) + SUBQ(Imm(128), RDI) + JNE(LabelRef("loop")) + + MOVQ(R8, Mem{Base: BX}.Offset(0)) + MOVQ(R9, Mem{Base: BX}.Offset(8)) + + VMOVDQU(Y8, Mem{Base: AX}.Offset(0)) + VMOVDQU(Y9, Mem{Base: AX}.Offset(32)) + VZEROUPPER() + + RET() +} + +func VPUNPCKLQDQ_X2_X2_X15() { + BYTE(U8(0xC5)) + BYTE(U8(0x69)) + BYTE(U8(0x6C)) + BYTE(U8(0xFA)) +} + +func VPUNPCKLQDQ_X3_X3_X15() { + BYTE(U8(0xC5)) + BYTE(U8(0x61)) + BYTE(U8(0x6C)) + BYTE(U8(0xFB)) +} + +func VPUNPCKLQDQ_X7_X7_X15() { + BYTE(U8(0xC5)) + BYTE(U8(0x41)) + BYTE(U8(0x6C)) + BYTE(U8(0xFF)) +} + +func VPUNPCKLQDQ_X13_X13_X15() { + BYTE(U8(0xC4)) + BYTE(U8(0x41)) + BYTE(U8(0x11)) + BYTE(U8(0x6C)) + BYTE(U8(0xFD)) +} + +func VPUNPCKLQDQ_X14_X14_X15() { + BYTE(U8(0xC4)) + BYTE(U8(0x41)) + BYTE(U8(0x09)) + BYTE(U8(0x6C)) + BYTE(U8(0xFE)) +} + +func VPUNPCKHQDQ_X15_X2_X2() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x69)) + BYTE(U8(0x6D)) + BYTE(U8(0xD7)) +} + +func VPUNPCKHQDQ_X15_X3_X3() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x61)) + BYTE(U8(0x6D)) + BYTE(U8(0xDF)) +} + +func VPUNPCKHQDQ_X15_X6_X6() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x49)) + BYTE(U8(0x6D)) + BYTE(U8(0xF7)) +} + +func VPUNPCKHQDQ_X15_X7_X7() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x41)) + BYTE(U8(0x6D)) + BYTE(U8(0xFF)) +} + +func VPUNPCKHQDQ_X15_X3_X2() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x61)) + BYTE(U8(0x6D)) + BYTE(U8(0xD7)) +} + +func VPUNPCKHQDQ_X15_X7_X6() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x41)) + BYTE(U8(0x6D)) + BYTE(U8(0xF7)) +} + +func VPUNPCKHQDQ_X15_X13_X3() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x11)) + BYTE(U8(0x6D)) + BYTE(U8(0xDF)) +} + +func VPUNPCKHQDQ_X15_X13_X7() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x11)) + BYTE(U8(0x6D)) + BYTE(U8(0xFF)) +} + +func SHUFFLE_AVX() { + VMOVDQA(X6, X13) + VMOVDQA(X2, X14) + VMOVDQA(X4, X6) + VPUNPCKLQDQ_X13_X13_X15() + VMOVDQA(X5, X4) + VMOVDQA(X6, X5) + VPUNPCKHQDQ_X15_X7_X6() + VPUNPCKLQDQ_X7_X7_X15() + VPUNPCKHQDQ_X15_X13_X7() + VPUNPCKLQDQ_X3_X3_X15() + VPUNPCKHQDQ_X15_X2_X2() + VPUNPCKLQDQ_X14_X14_X15() + VPUNPCKHQDQ_X15_X3_X3() +} + +func SHUFFLE_AVX_INV() { + VMOVDQA(X2, X13) + VMOVDQA(X4, X14) + VPUNPCKLQDQ_X2_X2_X15() + VMOVDQA(X5, X4) + VPUNPCKHQDQ_X15_X3_X2() + VMOVDQA(X14, X5) + VPUNPCKLQDQ_X3_X3_X15() + VMOVDQA(X6, X14) + VPUNPCKHQDQ_X15_X13_X3() + VPUNPCKLQDQ_X7_X7_X15() + VPUNPCKHQDQ_X15_X6_X6() + VPUNPCKLQDQ_X14_X14_X15() + VPUNPCKHQDQ_X15_X7_X7() +} + +func HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7 VecPhysical, m0, m1, m2, m3 Op, t0, c40, c48 VecPhysical) { + VPADDQ(m0, v0, v0) + VPADDQ(v2, v0, v0) + VPADDQ(m1, v1, v1) + VPADDQ(v3, v1, v1) + VPXOR(v0, v6, v6) + VPXOR(v1, v7, v7) + VPSHUFD(I8(-79), v6, v6) + VPSHUFD(I8(-79), v7, v7) + VPADDQ(v6, v4, v4) + VPADDQ(v7, v5, v5) + VPXOR(v4, v2, v2) + VPXOR(v5, v3, v3) + VPSHUFB(c40, v2, v2) + VPSHUFB(c40, v3, v3) + VPADDQ(m2, v0, v0) + VPADDQ(v2, v0, v0) + VPADDQ(m3, v1, v1) + VPADDQ(v3, v1, v1) + VPXOR(v0, v6, v6) + VPXOR(v1, v7, v7) + VPSHUFB(c48, v6, v6) + VPSHUFB(c48, v7, v7) + VPADDQ(v6, v4, v4) + VPADDQ(v7, v5, v5) + VPXOR(v4, v2, v2) + VPXOR(v5, v3, v3) + VPADDQ(v2, v2, t0) + VPSRLQ(Imm(63), v2, v2) + VPXOR(t0, v2, v2) + VPADDQ(v3, v3, t0) + VPSRLQ(Imm(63), v3, v3) + VPXOR(t0, v3, v3) +} + +// load msg: +// +// X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7) +// +// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0 +func LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7 uint8) { + VMOVQ_SI_X12(i0 * 8) + VMOVQ_SI_X13(i2 * 8) + VMOVQ_SI_X14(i4 * 8) + VMOVQ_SI_X15(i6 * 8) + VPINSRQ_1_SI_X12(i1 * 8) + VPINSRQ_1_SI_X13(i3 * 8) + VPINSRQ_1_SI_X14(i5 * 8) + VPINSRQ_1_SI_X15(i7 * 8) +} + +// load msg: +// +// X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7) +func LOAD_MSG_AVX_0_2_4_6_1_3_5_7() { + VMOVQ_SI_X12_0() + VMOVQ_SI_X13(4 * 8) + VMOVQ_SI_X14(1 * 8) + VMOVQ_SI_X15(5 * 8) + VPINSRQ_1_SI_X12(2 * 8) + VPINSRQ_1_SI_X13(6 * 8) + VPINSRQ_1_SI_X14(3 * 8) + VPINSRQ_1_SI_X15(7 * 8) +} + +// load msg: +// +// X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3) +func LOAD_MSG_AVX_1_0_11_5_12_2_7_3() { + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(0*8), X12) + VMOVQ_SI_X13(11 * 8) + VMOVQ_SI_X14(12 * 8) + VMOVQ_SI_X15(7 * 8) + VPINSRQ_1_SI_X13(5 * 8) + VPINSRQ_1_SI_X14(2 * 8) + VPINSRQ_1_SI_X15(3 * 8) +} + +// load msg: +// +// X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13) +func LOAD_MSG_AVX_11_12_5_15_8_0_2_13() { + VMOVDQU(Mem{Base: SI}.Offset(11*8), X12) + VMOVQ_SI_X13(5 * 8) + VMOVQ_SI_X14(8 * 8) + VMOVQ_SI_X15(2 * 8) + VPINSRQ_1_SI_X13(15 * 8) + VPINSRQ_1_SI_X14_0() + VPINSRQ_1_SI_X15(13 * 8) +} + +// load msg: +// +// X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8) +func LOAD_MSG_AVX_2_5_4_15_6_10_0_8() { + VMOVQ_SI_X12(2 * 8) + VMOVQ_SI_X13(4 * 8) + VMOVQ_SI_X14(6 * 8) + VMOVQ_SI_X15_0() + VPINSRQ_1_SI_X12(5 * 8) + VPINSRQ_1_SI_X13(15 * 8) + VPINSRQ_1_SI_X14(10 * 8) + VPINSRQ_1_SI_X15(8 * 8) +} + +// load msg: +// +// X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15) +func LOAD_MSG_AVX_9_5_2_10_0_7_4_15() { + VMOVQ_SI_X12(9 * 8) + VMOVQ_SI_X13(2 * 8) + VMOVQ_SI_X14_0() + VMOVQ_SI_X15(4 * 8) + VPINSRQ_1_SI_X12(5 * 8) + VPINSRQ_1_SI_X13(10 * 8) + VPINSRQ_1_SI_X14(7 * 8) + VPINSRQ_1_SI_X15(15 * 8) +} + +// load msg: +// +// X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3) +func LOAD_MSG_AVX_2_6_0_8_12_10_11_3() { + VMOVQ_SI_X12(2 * 8) + VMOVQ_SI_X13_0() + VMOVQ_SI_X14(12 * 8) + VMOVQ_SI_X15(11 * 8) + VPINSRQ_1_SI_X12(6 * 8) + VPINSRQ_1_SI_X13(8 * 8) + VPINSRQ_1_SI_X14(10 * 8) + VPINSRQ_1_SI_X15(3 * 8) +} + +// load msg: +// +// X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11) +func LOAD_MSG_AVX_0_6_9_8_7_3_2_11() { + MOVQ(Mem{Base: SI}.Offset(0*8), X12) + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(8*8), X13) + MOVQ(Mem{Base: SI}.Offset(7*8), X14) + MOVQ(Mem{Base: SI}.Offset(2*8), X15) + VPINSRQ_1_SI_X12(6 * 8) + VPINSRQ_1_SI_X14(3 * 8) + VPINSRQ_1_SI_X15(11 * 8) +} + +// load msg: +// +// X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8) +func LOAD_MSG_AVX_6_14_11_0_15_9_3_8() { + MOVQ(Mem{Base: SI}.Offset(6*8), X12) + MOVQ(Mem{Base: SI}.Offset(11*8), X13) + MOVQ(Mem{Base: SI}.Offset(15*8), X14) + MOVQ(Mem{Base: SI}.Offset(3*8), X15) + VPINSRQ_1_SI_X12(14 * 8) + VPINSRQ_1_SI_X13_0() + VPINSRQ_1_SI_X14(9 * 8) + VPINSRQ_1_SI_X15(8 * 8) +} + +// load msg: +// +// X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10) +func LOAD_MSG_AVX_5_15_8_2_0_4_6_10() { + MOVQ(Mem{Base: SI}.Offset(5*8), X12) + MOVQ(Mem{Base: SI}.Offset(8*8), X13) + MOVQ(Mem{Base: SI}.Offset(0*8), X14) + MOVQ(Mem{Base: SI}.Offset(6*8), X15) + VPINSRQ_1_SI_X12(15 * 8) + VPINSRQ_1_SI_X13(2 * 8) + VPINSRQ_1_SI_X14(4 * 8) + VPINSRQ_1_SI_X15(10 * 8) +} + +// load msg: +// +// X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5) +func LOAD_MSG_AVX_12_13_1_10_2_7_4_5() { + VMOVDQU(Mem{Base: SI}.Offset(12*8), X12) + MOVQ(Mem{Base: SI}.Offset(1*8), X13) + MOVQ(Mem{Base: SI}.Offset(2*8), X14) + VPINSRQ_1_SI_X13(10 * 8) + VPINSRQ_1_SI_X14(7 * 8) + VMOVDQU(Mem{Base: SI}.Offset(4*8), X15) +} + +// load msg: +// +// X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0) +func LOAD_MSG_AVX_15_9_3_13_11_14_12_0() { + MOVQ(Mem{Base: SI}.Offset(15*8), X12) + MOVQ(Mem{Base: SI}.Offset(3*8), X13) + MOVQ(Mem{Base: SI}.Offset(11*8), X14) + MOVQ(Mem{Base: SI}.Offset(12*8), X15) + VPINSRQ_1_SI_X12(9 * 8) + VPINSRQ_1_SI_X13(13 * 8) + VPINSRQ_1_SI_X14(14 * 8) + VPINSRQ_1_SI_X15_0() +} + +func hashBlocksAVX() { + Implement("hashBlocksAVX") + Attributes(4) + AllocLocal(288) // frame size = 272 + 16 byte alignment + + Load(Param("h"), RAX) + Load(Param("c"), RBX) + Load(Param("flag"), RCX) + Load(Param("blocks").Base(), RSI) + Load(Param("blocks").Len(), RDI) + + MOVQ(RSP, R10) + ADDQ(Imm(15), R10) + ANDQ(I32(^15), R10) + + AVX_c40 := AVX_c40_DATA() + AVX_c48 := AVX_c48_DATA() + VMOVDQU(AVX_c40, X0) + VMOVDQU(AVX_c48, X1) + VMOVDQA(X0, X8) + VMOVDQA(X1, X9) + + AVX_iv3 := AVX_iv3_DATA() + VMOVDQU(AVX_iv3, X0) + VMOVDQA(X0, Mem{Base: R10}.Offset(0)) + XORQ(RCX, Mem{Base: R10}.Offset(0)) // 0(R10) = ·AVX_iv3 ^ (CX || 0) + + VMOVDQU(Mem{Base: AX}.Offset(0), X10) + VMOVDQU(Mem{Base: AX}.Offset(16), X11) + VMOVDQU(Mem{Base: AX}.Offset(32), X2) + VMOVDQU(Mem{Base: AX}.Offset(48), X3) + + MOVQ(Mem{Base: BX}.Offset(0), R8) + MOVQ(Mem{Base: BX}.Offset(8), R9) + + loop_AVX() + noinc_AVX() +} + +func loop_AVX() { + Label("loop") + ADDQ(Imm(128), R8) + CMPQ(R8, Imm(128)) + JGE(LabelRef("noinc")) + INCQ(R9) +} + +func noinc_AVX() { + Label("noinc") + VMOVQ_R8_X15() + VPINSRQ_1_R9_X15() + + AVX_iv0 := AVX_iv0_DATA() + AVX_iv1 := AVX_iv1_DATA() + AVX_iv2 := AVX_iv2_DATA() + VMOVDQA(X10, X0) + VMOVDQA(X11, X1) + VMOVDQU(AVX_iv0, X4) + VMOVDQU(AVX_iv1, X5) + VMOVDQU(AVX_iv2, X6) + + VPXOR(X15, X6, X6) + VMOVDQA(Mem{Base: R10}.Offset(0), X7) + + LOAD_MSG_AVX_0_2_4_6_1_3_5_7() + VMOVDQA(X12, Mem{Base: R10}.Offset(16)) + VMOVDQA(X13, Mem{Base: R10}.Offset(32)) + VMOVDQA(X14, Mem{Base: R10}.Offset(48)) + VMOVDQA(X15, Mem{Base: R10}.Offset(64)) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15) + VMOVDQA(X12, Mem{Base: R10}.Offset(80)) + VMOVDQA(X13, Mem{Base: R10}.Offset(96)) + VMOVDQA(X14, Mem{Base: R10}.Offset(112)) + VMOVDQA(X15, Mem{Base: R10}.Offset(128)) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6) + VMOVDQA(X12, Mem{Base: R10}.Offset(144)) + VMOVDQA(X13, Mem{Base: R10}.Offset(160)) + VMOVDQA(X14, Mem{Base: R10}.Offset(176)) + VMOVDQA(X15, Mem{Base: R10}.Offset(192)) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_1_0_11_5_12_2_7_3() + VMOVDQA(X12, Mem{Base: R10}.Offset(208)) + VMOVDQA(X13, Mem{Base: R10}.Offset(224)) + VMOVDQA(X14, Mem{Base: R10}.Offset(240)) + VMOVDQA(X15, Mem{Base: R10}.Offset(256)) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_11_12_5_15_8_0_2_13() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_2_5_4_15_6_10_0_8() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_9_5_2_10_0_7_4_15() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_2_6_0_8_12_10_11_3() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_0_6_9_8_7_3_2_11() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_5_15_8_2_0_4_6_10() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_6_14_11_0_15_9_3_8() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_12_13_1_10_2_7_4_5() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_15_9_3_13_11_14_12_0() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(16), Mem{Base: R10}.Offset(32), Mem{Base: R10}.Offset(48), Mem{Base: R10}.Offset(64), X15, X8, X9) + SHUFFLE_AVX() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(80), Mem{Base: R10}.Offset(96), Mem{Base: R10}.Offset(112), Mem{Base: R10}.Offset(128), X15, X8, X9) + SHUFFLE_AVX_INV() + + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(144), Mem{Base: R10}.Offset(160), Mem{Base: R10}.Offset(176), Mem{Base: R10}.Offset(192), X15, X8, X9) + SHUFFLE_AVX() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(208), Mem{Base: R10}.Offset(224), Mem{Base: R10}.Offset(240), Mem{Base: R10}.Offset(256), X15, X8, X9) + SHUFFLE_AVX_INV() + + VMOVDQU(Mem{Base: AX}.Offset(32), X14) + VMOVDQU(Mem{Base: AX}.Offset(48), X15) + VPXOR(X0, X10, X10) + VPXOR(X1, X11, X11) + VPXOR(X2, X14, X14) + VPXOR(X3, X15, X15) + VPXOR(X4, X10, X10) + VPXOR(X5, X11, X11) + VPXOR(X6, X14, X2) + VPXOR(X7, X15, X3) + VMOVDQU(X2, Mem{Base: AX}.Offset(32)) + VMOVDQU(X3, Mem{Base: AX}.Offset(48)) + + LEAQ(Mem{Base: SI}.Offset(128), RSI) + SUBQ(Imm(128), RDI) + JNE(LabelRef("loop")) + + VMOVDQU(X10, Mem{Base: AX}.Offset(0)) + VMOVDQU(X11, Mem{Base: AX}.Offset(16)) + + MOVQ(R8, Mem{Base: BX}.Offset(0)) + MOVQ(R9, Mem{Base: BX}.Offset(8)) + VZEROUPPER() + + RET() +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +var ( + AVX2_iv0_ptr, + AVX2_iv1_ptr, + AVX2_c40_ptr, + AVX2_c48_ptr, + + AVX_iv0_ptr, + AVX_iv1_ptr, + AVX_iv2_ptr, + AVX_iv3_ptr, + AVX_c40_ptr, + AVX_c48_ptr *Mem +) + +func AVX2_iv0_DATA() Mem { + if AVX2_iv0_ptr != nil { + return *AVX2_iv0_ptr + } + AVX2_iv0 := GLOBL(ThatPeskyUnicodeDot+"AVX2_iv0", NOPTR|RODATA) + DATA(0x00, U64(0x6a09e667f3bcc908)) + DATA(0x08, U64(0xbb67ae8584caa73b)) + DATA(0x10, U64(0x3c6ef372fe94f82b)) + DATA(0x18, U64(0xa54ff53a5f1d36f1)) + return AVX2_iv0 +} + +func AVX2_iv1_DATA() Mem { + if AVX2_iv1_ptr != nil { + return *AVX2_iv1_ptr + } + AVX2_iv1 := GLOBL(ThatPeskyUnicodeDot+"AVX2_iv1", NOPTR|RODATA) + DATA(0x00, U64(0x510e527fade682d1)) + DATA(0x08, U64(0x9b05688c2b3e6c1f)) + DATA(0x10, U64(0x1f83d9abfb41bd6b)) + DATA(0x18, U64(0x5be0cd19137e2179)) + return AVX2_iv1 +} + +func AVX2_c40_DATA() Mem { + if AVX2_c40_ptr != nil { + return *AVX2_c40_ptr + } + AVX2_c40 := GLOBL(ThatPeskyUnicodeDot+"AVX2_c40", NOPTR|RODATA) + DATA(0x00, U64(0x0201000706050403)) + DATA(0x08, U64(0x0a09080f0e0d0c0b)) + DATA(0x10, U64(0x0201000706050403)) + DATA(0x18, U64(0x0a09080f0e0d0c0b)) + return AVX2_c40 +} + +func AVX2_c48_DATA() Mem { + if AVX2_c48_ptr != nil { + return *AVX2_c48_ptr + } + AVX2_c48 := GLOBL(ThatPeskyUnicodeDot+"AVX2_c48", NOPTR|RODATA) + DATA(0x00, U64(0x0100070605040302)) + DATA(0x08, U64(0x09080f0e0d0c0b0a)) + DATA(0x10, U64(0x0100070605040302)) + DATA(0x18, U64(0x09080f0e0d0c0b0a)) + return AVX2_c48 +} + +func AVX_iv0_DATA() Mem { + if AVX_iv0_ptr != nil { + return *AVX_iv0_ptr + } + AVX_iv0 := GLOBL(ThatPeskyUnicodeDot+"AVX_iv0", NOPTR|RODATA) + DATA(0x00, U64(0x6a09e667f3bcc908)) + DATA(0x08, U64(0xbb67ae8584caa73b)) + return AVX_iv0 +} + +func AVX_iv1_DATA() Mem { + if AVX_iv1_ptr != nil { + return *AVX_iv1_ptr + } + AVX_iv1 := GLOBL(ThatPeskyUnicodeDot+"AVX_iv1", NOPTR|RODATA) + DATA(0x00, U64(0x3c6ef372fe94f82b)) + DATA(0x08, U64(0xa54ff53a5f1d36f1)) + return AVX_iv1 +} + +func AVX_iv2_DATA() Mem { + if AVX_iv2_ptr != nil { + return *AVX_iv2_ptr + } + AVX_iv2 := GLOBL(ThatPeskyUnicodeDot+"AVX_iv2", NOPTR|RODATA) + DATA(0x00, U64(0x510e527fade682d1)) + DATA(0x08, U64(0x9b05688c2b3e6c1f)) + return AVX_iv2 +} + +func AVX_iv3_DATA() Mem { + if AVX_iv3_ptr != nil { + return *AVX_iv3_ptr + } + AVX_iv3 := GLOBL(ThatPeskyUnicodeDot+"AVX_iv3", NOPTR|RODATA) + DATA(0x00, U64(0x1f83d9abfb41bd6b)) + DATA(0x08, U64(0x5be0cd19137e2179)) + return AVX_iv3 +} + +func AVX_c40_DATA() Mem { + if AVX_c40_ptr != nil { + return *AVX_c40_ptr + } + AVX_c40 := GLOBL(ThatPeskyUnicodeDot+"AVX_c40", NOPTR|RODATA) + DATA(0x00, U64(0x0201000706050403)) + DATA(0x08, U64(0x0a09080f0e0d0c0b)) + return AVX_c40 +} + +func AVX_c48_DATA() Mem { + if AVX_c48_ptr != nil { + return *AVX_c48_ptr + } + AVX_c48 := GLOBL(ThatPeskyUnicodeDot+"AVX_c48", NOPTR|RODATA) + DATA(0x00, U64(0x0100070605040302)) + DATA(0x08, U64(0x09080f0e0d0c0b0a)) + return AVX_c48 +} diff --git a/blake2b/_asm/AVX2/go.mod b/blake2b/_asm/AVX2/go.mod new file mode 100644 index 0000000000..c49f1b11ae --- /dev/null +++ b/blake2b/_asm/AVX2/go.mod @@ -0,0 +1,16 @@ +module blake2b/_asm/AVX2 + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 + +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/blake2b/_asm/AVX2/go.sum b/blake2b/_asm/AVX2/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/blake2b/_asm/AVX2/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/blake2b/blake2bAVX2_amd64.s b/blake2b/blake2bAVX2_amd64.s index 9ae8206c20..f75162e039 100644 --- a/blake2b/blake2bAVX2_amd64.s +++ b/blake2b/blake2bAVX2_amd64.s @@ -1,722 +1,4517 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2bAVX2_amd64_asm.go -out ../../blake2bAVX2_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16 - -#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39 -#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93 -#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e -#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93 -#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39 - -#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \ - VPADDQ m0, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m1, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y1_Y1; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y3_Y3; \ - VPADDQ m2, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m3, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y3_Y3; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y1_Y1 - -#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E -#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26 -#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E -#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36 -#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E - -#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n -#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n -#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n -#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n -#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n - -#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01 -#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01 -#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01 -#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01 -#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01 - -#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01 - -#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8 -#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01 - -// load msg: Y12 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y12, Y12 - -// load msg: Y13 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \ - VMOVQ_SI_X13(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X13(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y13, Y13 - -// load msg: Y14 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \ - VMOVQ_SI_X14(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X14(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y14, Y14 - -// load msg: Y15 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \ - VMOVQ_SI_X15(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X15(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X11(6*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \ - LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \ - LOAD_MSG_AVX2_Y15(9, 11, 13, 15) - -#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \ - LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \ - LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \ - VMOVQ_SI_X11(11*8); \ - VPSHUFD $0x4E, 0*8(SI), X14; \ - VPINSRQ_1_SI_X11(5*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(12, 2, 7, 3) - -#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \ - VMOVQ_SI_X11(5*8); \ - VMOVDQU 11*8(SI), X12; \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - VMOVQ_SI_X13(8*8); \ - VMOVQ_SI_X11(2*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X11(13*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \ - LOAD_MSG_AVX2_Y15(14, 6, 1, 4) - -#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \ - LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \ - LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \ - LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \ - VMOVQ_SI_X15(6*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X15(10*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \ - LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X13(7*8); \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \ - LOAD_MSG_AVX2_Y15(1, 12, 8, 13) - -#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \ - LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \ - LOAD_MSG_AVX2_Y15(13, 5, 14, 9) - -#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \ - LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \ - LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \ - VMOVQ_SI_X14_0; \ - VPSHUFD $0x4E, 8*8(SI), X11; \ - VPINSRQ_1_SI_X14(6*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(7, 3, 2, 11) - -#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \ - LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \ - LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \ - LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \ - VMOVQ_SI_X15_0; \ - VMOVQ_SI_X11(6*8); \ - VPINSRQ_1_SI_X15(4*8); \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \ - VMOVQ_SI_X12(6*8); \ - VMOVQ_SI_X11(11*8); \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \ - VMOVQ_SI_X11(1*8); \ - VMOVDQU 12*8(SI), X14; \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - VMOVQ_SI_X15(2*8); \ - VMOVDQU 4*8(SI), X11; \ - VPINSRQ_1_SI_X15(7*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \ - LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \ - VMOVQ_SI_X13(2*8); \ - VPSHUFD $0x4E, 5*8(SI), X11; \ - VPINSRQ_1_SI_X13(4*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \ - VMOVQ_SI_X15(11*8); \ - VMOVQ_SI_X11(12*8); \ - VPINSRQ_1_SI_X15(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y15, Y15 - // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, DX - ADDQ $31, DX - ANDQ $~31, DX - - MOVQ CX, 16(DX) - XORQ CX, CX - MOVQ CX, 24(DX) - - VMOVDQU ·AVX2_c40<>(SB), Y4 - VMOVDQU ·AVX2_c48<>(SB), Y5 - - VMOVDQU 0(AX), Y8 +// Requires: AVX, AVX2 +TEXT ·hashBlocksAVX2(SB), NOSPLIT, $320-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, DX + ADDQ $+31, DX + ANDQ $-32, DX + MOVQ CX, 16(DX) + XORQ CX, CX + MOVQ CX, 24(DX) + VMOVDQU ·AVX2_c40<>+0(SB), Y4 + VMOVDQU ·AVX2_c48<>+0(SB), Y5 + VMOVDQU (AX), Y8 VMOVDQU 32(AX), Y9 - VMOVDQU ·AVX2_iv0<>(SB), Y6 - VMOVDQU ·AVX2_iv1<>(SB), Y7 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 - MOVQ R9, 8(DX) + VMOVDQU ·AVX2_iv0<>+0(SB), Y6 + VMOVDQU ·AVX2_iv1<>+0(SB), Y7 + MOVQ (BX), R8 + MOVQ 8(BX), R9 + MOVQ R9, 8(DX) loop: - ADDQ $128, R8 - MOVQ R8, 0(DX) - CMPQ R8, $128 + ADDQ $0x80, R8 + MOVQ R8, (DX) + CMPQ R8, $0x80 JGE noinc INCQ R9 MOVQ R9, 8(DX) noinc: - VMOVDQA Y8, Y0 - VMOVDQA Y9, Y1 - VMOVDQA Y6, Y2 - VPXOR 0(DX), Y7, Y3 - - LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() - VMOVDQA Y12, 32(DX) - VMOVDQA Y13, 64(DX) - VMOVDQA Y14, 96(DX) - VMOVDQA Y15, 128(DX) - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() - VMOVDQA Y12, 160(DX) - VMOVDQA Y13, 192(DX) - VMOVDQA Y14, 224(DX) - VMOVDQA Y15, 256(DX) - - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - - ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5) - ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5) - - VPXOR Y0, Y8, Y8 - VPXOR Y1, Y9, Y9 - VPXOR Y2, Y8, Y8 - VPXOR Y3, Y9, Y9 - - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop - - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - - VMOVDQU Y8, 0(AX) - VMOVDQU Y9, 32(AX) + VMOVDQA Y8, Y0 + VMOVDQA Y9, Y1 + VMOVDQA Y6, Y2 + VPXOR (DX), Y7, Y3 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 32(DX) + VMOVDQA Y13, 64(DX) + VMOVDQA Y14, 96(DX) + VMOVDQA Y15, 128(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x48 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + VPSHUFD $0x4e, (SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x28 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 160(DX) + VMOVDQA Y13, 192(DX) + VMOVDQA Y14, 224(DX) + VMOVDQA Y15, 256(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + VMOVDQU 88(SI), X12 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + VPSHUFD $0x4e, 64(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x10 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + VMOVDQU 96(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + VMOVDQU 32(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + VPSHUFD $0x4e, 40(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 32(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 64(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 96(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 128(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 160(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 192(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 224(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 256(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPXOR Y0, Y8, Y8 + VPXOR Y1, Y9, Y9 + VPXOR Y2, Y8, Y8 + VPXOR Y3, Y9, Y9 + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VMOVDQU Y8, (AX) + VMOVDQU Y9, 32(AX) VZEROUPPER - RET -#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA -#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB -#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF -#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD -#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE - -#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF -#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF - -#define SHUFFLE_AVX() \ - VMOVDQA X6, X13; \ - VMOVDQA X2, X14; \ - VMOVDQA X4, X6; \ - VPUNPCKLQDQ_X13_X13_X15; \ - VMOVDQA X5, X4; \ - VMOVDQA X6, X5; \ - VPUNPCKHQDQ_X15_X7_X6; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X13_X7; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VPUNPCKHQDQ_X15_X2_X2; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X3_X3; \ - -#define SHUFFLE_AVX_INV() \ - VMOVDQA X2, X13; \ - VMOVDQA X4, X14; \ - VPUNPCKLQDQ_X2_X2_X15; \ - VMOVDQA X5, X4; \ - VPUNPCKHQDQ_X15_X3_X2; \ - VMOVDQA X14, X5; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VMOVDQA X6, X14; \ - VPUNPCKHQDQ_X15_X13_X3; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X6_X6; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X7_X7; \ - -#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - VPADDQ m0, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m1, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFD $-79, v6, v6; \ - VPSHUFD $-79, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPSHUFB c40, v2, v2; \ - VPSHUFB c40, v3, v3; \ - VPADDQ m2, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m3, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFB c48, v6, v6; \ - VPSHUFB c48, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPADDQ v2, v2, t0; \ - VPSRLQ $63, v2, v2; \ - VPXOR t0, v2, v2; \ - VPADDQ v3, v3, t0; \ - VPSRLQ $63, v3, v3; \ - VPXOR t0, v3, v3 - -// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7) -// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0 -#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X13(i2*8); \ - VMOVQ_SI_X14(i4*8); \ - VMOVQ_SI_X15(i6*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X13(i3*8); \ - VPINSRQ_1_SI_X14(i5*8); \ - VPINSRQ_1_SI_X15(i7*8) - -// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7) -#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(1*8); \ - VMOVQ_SI_X15(5*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X13(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(7*8) - -// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3) -#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \ - VPSHUFD $0x4E, 0*8(SI), X12; \ - VMOVQ_SI_X13(11*8); \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(7*8); \ - VPINSRQ_1_SI_X13(5*8); \ - VPINSRQ_1_SI_X14(2*8); \ - VPINSRQ_1_SI_X15(3*8) - -// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13) -#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \ - VMOVDQU 11*8(SI), X12; \ - VMOVQ_SI_X13(5*8); \ - VMOVQ_SI_X14(8*8); \ - VMOVQ_SI_X15(2*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14_0; \ - VPINSRQ_1_SI_X15(13*8) - -// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8) -#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(6*8); \ - VMOVQ_SI_X15_0; \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(8*8) +DATA ·AVX2_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +DATA ·AVX2_c40<>+16(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+24(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX2_c40<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15) -#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \ - VMOVQ_SI_X12(9*8); \ - VMOVQ_SI_X13(2*8); \ - VMOVQ_SI_X14_0; \ - VMOVQ_SI_X15(4*8); \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VPINSRQ_1_SI_X15(15*8) +DATA ·AVX2_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +DATA ·AVX2_c48<>+16(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+24(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX2_c48<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3) -#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(11*8); \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X13(8*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(3*8) +DATA ·AVX2_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX2_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +DATA ·AVX2_iv0<>+16(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX2_iv0<>+24(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX2_iv0<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11) -#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \ - MOVQ 0*8(SI), X12; \ - VPSHUFD $0x4E, 8*8(SI), X13; \ - MOVQ 7*8(SI), X14; \ - MOVQ 2*8(SI), X15; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(11*8) - -// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8) -#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \ - MOVQ 6*8(SI), X12; \ - MOVQ 11*8(SI), X13; \ - MOVQ 15*8(SI), X14; \ - MOVQ 3*8(SI), X15; \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X14(9*8); \ - VPINSRQ_1_SI_X15(8*8) - -// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10) -#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \ - MOVQ 5*8(SI), X12; \ - MOVQ 8*8(SI), X13; \ - MOVQ 0*8(SI), X14; \ - MOVQ 6*8(SI), X15; \ - VPINSRQ_1_SI_X12(15*8); \ - VPINSRQ_1_SI_X13(2*8); \ - VPINSRQ_1_SI_X14(4*8); \ - VPINSRQ_1_SI_X15(10*8) - -// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5) -#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \ - VMOVDQU 12*8(SI), X12; \ - MOVQ 1*8(SI), X13; \ - MOVQ 2*8(SI), X14; \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VMOVDQU 4*8(SI), X15 - -// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0) -#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \ - MOVQ 15*8(SI), X12; \ - MOVQ 3*8(SI), X13; \ - MOVQ 11*8(SI), X14; \ - MOVQ 12*8(SI), X15; \ - VPINSRQ_1_SI_X12(9*8); \ - VPINSRQ_1_SI_X13(13*8); \ - VPINSRQ_1_SI_X14(14*8); \ - VPINSRQ_1_SI_X15_0 +DATA ·AVX2_iv1<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX2_iv1<>+8(SB)/8, $0x9b05688c2b3e6c1f +DATA ·AVX2_iv1<>+16(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX2_iv1<>+24(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX2_iv1<>(SB), RODATA|NOPTR, $32 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - VMOVDQU ·AVX_c40<>(SB), X0 - VMOVDQU ·AVX_c48<>(SB), X1 +// Requires: AVX, SSE2 +TEXT ·hashBlocksAVX(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + VMOVDQU ·AVX_c40<>+0(SB), X0 + VMOVDQU ·AVX_c48<>+0(SB), X1 VMOVDQA X0, X8 VMOVDQA X1, X9 - - VMOVDQU ·AVX_iv3<>(SB), X0 - VMOVDQA X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0) - - VMOVDQU 0(AX), X10 + VMOVDQU ·AVX_iv3<>+0(SB), X0 + VMOVDQA X0, (R10) + XORQ CX, (R10) + VMOVDQU (AX), X10 VMOVDQU 16(AX), X11 VMOVDQU 32(AX), X2 VMOVDQU 48(AX), X3 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - VMOVQ_R8_X15 - VPINSRQ_1_R9_X15 - + BYTE $0xc4 + BYTE $0x41 + BYTE $0xf9 + BYTE $0x6e + BYTE $0xf8 + BYTE $0xc4 + BYTE $0x43 + BYTE $0x81 + BYTE $0x22 + BYTE $0xf9 + BYTE $0x01 VMOVDQA X10, X0 VMOVDQA X11, X1 - VMOVDQU ·AVX_iv0<>(SB), X4 - VMOVDQU ·AVX_iv1<>(SB), X5 - VMOVDQU ·AVX_iv2<>(SB), X6 - + VMOVDQU ·AVX_iv0<>+0(SB), X4 + VMOVDQU ·AVX_iv1<>+0(SB), X5 + VMOVDQU ·AVX_iv2<>+0(SB), X6 VPXOR X15, X6, X6 - VMOVDQA 0(R10), X7 - - LOAD_MSG_AVX_0_2_4_6_1_3_5_7() + VMOVDQA (R10), X7 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 VMOVDQA X12, 16(R10) VMOVDQA X13, 32(R10) VMOVDQA X14, 48(R10) VMOVDQA X15, 64(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 VMOVDQA X12, 80(R10) VMOVDQA X13, 96(R10) VMOVDQA X14, 112(R10) VMOVDQA X15, 128(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 VMOVDQA X12, 144(R10) VMOVDQA X13, 160(R10) VMOVDQA X14, 176(R10) VMOVDQA X15, 192(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_1_0_11_5_12_2_7_3() + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPSHUFD $0x4e, (SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 VMOVDQA X12, 208(R10) VMOVDQA X13, 224(R10) VMOVDQA X14, 240(R10) VMOVDQA X15, 256(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_11_12_5_15_8_0_2_13() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_2_5_4_15_6_10_0_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_9_5_2_10_0_7_4_15() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_2_6_0_8_12_10_11_3() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_0_6_9_8_7_3_2_11() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_5_15_8_2_0_4_6_10() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_6_14_11_0_15_9_3_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_12_13_1_10_2_7_4_5() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_15_9_3_13_11_14_12_0() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VMOVDQU 88(SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x36 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ (SI), X12 + VPSHUFD $0x4e, 64(SI), X13 + MOVQ 56(SI), X14 + MOVQ 16(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 40(SI), X12 + MOVQ 64(SI), X13 + MOVQ (SI), X14 + MOVQ 48(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + MOVQ 48(SI), X12 + MOVQ 88(SI), X13 + MOVQ 120(SI), X14 + MOVQ 24(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VMOVDQU 96(SI), X12 + MOVQ 8(SI), X13 + MOVQ 16(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + VMOVDQU 32(SI), X15 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 120(SI), X12 + MOVQ 24(SI), X13 + MOVQ 88(SI), X14 + MOVQ 96(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x3e + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 16(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 32(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 48(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 64(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 80(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 96(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 112(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 128(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 144(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 160(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 176(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 192(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 208(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 224(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 240(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 256(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff VMOVDQU 32(AX), X14 VMOVDQU 48(AX), X15 VPXOR X0, X10, X10 @@ -729,16 +4524,36 @@ noinc: VPXOR X7, X15, X3 VMOVDQU X2, 32(AX) VMOVDQU X3, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + VMOVDQU X10, (AX) + VMOVDQU X11, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VZEROUPPER + RET - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·AVX_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX_c40<>(SB), RODATA|NOPTR, $16 - VMOVDQU X10, 0(AX) - VMOVDQU X11, 16(AX) +DATA ·AVX_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX_c48<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - VZEROUPPER +DATA ·AVX_iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX_iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX_iv3<>(SB), RODATA|NOPTR, $16 - RET +DATA ·AVX_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·AVX_iv0<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX_iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX_iv1<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX_iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·AVX_iv2<>(SB), RODATA|NOPTR, $16 From 82942cf1d8d34067e576572f2e00014a78c1efd8 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Tue, 23 Jul 2024 16:15:38 -0400 Subject: [PATCH 73/95] blake2b: port blake2b_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="blake2b/blake2b_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: I6dd59fb0b0365674aa5e43b69a57ea60fbcc4ba1 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600456 Reviewed-by: Dmitri Shuralyov Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda --- blake2b/_asm/standard/blake2b_amd64_asm.go | 361 +++++ blake2b/_asm/standard/go.mod | 15 + blake2b/_asm/standard/go.sum | 12 + blake2b/blake2b_amd64.s | 1681 +++++++++++++++++--- 4 files changed, 1810 insertions(+), 259 deletions(-) create mode 100644 blake2b/_asm/standard/blake2b_amd64_asm.go create mode 100644 blake2b/_asm/standard/go.mod create mode 100644 blake2b/_asm/standard/go.sum diff --git a/blake2b/_asm/standard/blake2b_amd64_asm.go b/blake2b/_asm/standard/blake2b_amd64_asm.go new file mode 100644 index 0000000000..a34db3fca5 --- /dev/null +++ b/blake2b/_asm/standard/blake2b_amd64_asm.go @@ -0,0 +1,361 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/blake2b" +) + +//go:generate go run . -out ../../blake2b_amd64.s -pkg blake2b + +const ThatPeskyUnicodeDot = "\u00b7" + +var iv0_DATA_ptr, iv1_DATA_ptr, iv2_DATA_ptr, iv3_DATA_ptr, c40_DATA_ptr, c48_DATA_ptr *Mem + +func main() { + Package("golang.org/x/crypto/blake2b") + ConstraintExpr("amd64,gc,!purego") + hashBlocksSSE4() + Generate() +} + +func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v6, t1) + PUNPCKLQDQ(v6, t2) + PUNPCKHQDQ(v7, v6) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(v7, t2) + MOVO(t1, v7) + MOVO(v2, t1) + PUNPCKHQDQ(t2, v7) + PUNPCKLQDQ(v3, t2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v3) +} + +func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v2, t1) + PUNPCKLQDQ(v2, t2) + PUNPCKHQDQ(v3, v2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(v3, t2) + MOVO(t1, v3) + MOVO(v6, t1) + PUNPCKHQDQ(t2, v3) + PUNPCKLQDQ(v7, t2) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v7) +} + +func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7 VecPhysical, m0, m1, m2, m3 Op, t0, c40, c48 VecPhysical) { + PADDQ(m0, v0) + PADDQ(m1, v1) + PADDQ(v2, v0) + PADDQ(v3, v1) + PXOR(v0, v6) + PXOR(v1, v7) + PSHUFD(Imm(0xB1), v6, v6) + PSHUFD(Imm(0xB1), v7, v7) + PADDQ(v6, v4) + PADDQ(v7, v5) + PXOR(v4, v2) + PXOR(v5, v3) + PSHUFB(c40, v2) + PSHUFB(c40, v3) + PADDQ(m2, v0) + PADDQ(m3, v1) + PADDQ(v2, v0) + PADDQ(v3, v1) + PXOR(v0, v6) + PXOR(v1, v7) + PSHUFB(c48, v6) + PSHUFB(c48, v7) + PADDQ(v6, v4) + PADDQ(v7, v5) + PXOR(v4, v2) + PXOR(v5, v3) + MOVOU(v2, t0) + PADDQ(v2, t0) + PSRLQ(Imm(63), v2) + PXOR(t0, v2) + MOVOU(v3, t0) + PADDQ(v3, t0) + PSRLQ(Imm(63), v3) + PXOR(t0, v3) +} + +func LOAD_MSG(m0, m1, m2, m3 VecPhysical, src GPPhysical, i0, i1, i2, i3, i4, i5, i6, i7 int) { + MOVQ(Mem{Base: src}.Offset(i0*8), m0) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i1*8), m0) + MOVQ(Mem{Base: src}.Offset(i2*8), m1) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i3*8), m1) + MOVQ(Mem{Base: src}.Offset(i4*8), m2) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i5*8), m2) + MOVQ(Mem{Base: src}.Offset(i6*8), m3) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i7*8), m3) +} + +func hashBlocksSSE4() { + Implement("hashBlocksSSE4") + Attributes(4) + AllocLocal(288) // frame size = 272 + 16 byte alignment + + Load(Param("h"), RAX) + Load(Param("c"), RBX) + Load(Param("flag"), RCX) + Load(Param("blocks").Base(), RSI) + Load(Param("blocks").Len(), RDI) + + MOVQ(RSP, R10) + ADDQ(Imm(15), R10) + ANDQ(I32(-16), R10) + + iv3 := iv3_DATA() + MOVOU(iv3, X0) + MOVO(X0, Mem{Base: R10}.Offset(0)) + XORQ(RCX, Mem{Base: R10}.Offset(0)) // 0(R10) = ·iv3 ^ (CX || 0) + + c40 := c40_DATA() + c48 := c48_DATA() + MOVOU(c40, X13) + MOVOU(c48, X14) + + MOVOU(Mem{Base: AX}.Offset(0), X12) + MOVOU(Mem{Base: AX}.Offset(16), X15) + + MOVQ(Mem{Base: BX}.Offset(0), R8) + MOVQ(Mem{Base: BX}.Offset(8), R9) + + Label("loop") + ADDQ(Imm(128), R8) + CMPQ(R8, Imm(128)) + JGE(LabelRef("noinc")) + INCQ(R9) + + Label("noinc") + MOVQ(R8, X8) + PINSRQ(Imm(1), R9, X8) + + iv0 := iv0_DATA() + iv1 := iv1_DATA() + iv2 := iv2_DATA() + + MOVO(X12, X0) + MOVO(X15, X1) + MOVOU(Mem{Base: AX}.Offset(32), X2) + MOVOU(Mem{Base: AX}.Offset(48), X3) + MOVOU(iv0, X4) + MOVOU(iv1, X5) + MOVOU(iv2, X6) + + PXOR(X8, X6) + MOVO(Mem{Base: R10}.Offset(0), X7) + + LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) + MOVO(X8, Mem{Base: R10}.Offset(16)) + MOVO(X9, Mem{Base: R10}.Offset(32)) + MOVO(X10, Mem{Base: R10}.Offset(48)) + MOVO(X11, Mem{Base: R10}.Offset(64)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) + MOVO(X8, Mem{Base: R10}.Offset(80)) + MOVO(X9, Mem{Base: R10}.Offset(96)) + MOVO(X10, Mem{Base: R10}.Offset(112)) + MOVO(X11, Mem{Base: R10}.Offset(128)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) + MOVO(X8, Mem{Base: R10}.Offset(144)) + MOVO(X9, Mem{Base: R10}.Offset(160)) + MOVO(X10, Mem{Base: R10}.Offset(176)) + MOVO(X11, Mem{Base: R10}.Offset(192)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) + MOVO(X8, Mem{Base: R10}.Offset(208)) + MOVO(X9, Mem{Base: R10}.Offset(224)) + MOVO(X10, Mem{Base: R10}.Offset(240)) + MOVO(X11, Mem{Base: R10}.Offset(256)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(16), Mem{Base: R10}.Offset(32), Mem{Base: R10}.Offset(48), Mem{Base: R10}.Offset(64), X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(80), Mem{Base: R10}.Offset(96), Mem{Base: R10}.Offset(112), Mem{Base: R10}.Offset(128), X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(144), Mem{Base: R10}.Offset(160), Mem{Base: R10}.Offset(176), Mem{Base: R10}.Offset(192), X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(208), Mem{Base: R10}.Offset(224), Mem{Base: R10}.Offset(240), Mem{Base: R10}.Offset(256), X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + MOVOU(Mem{Base: AX}.Offset(32), X10) + MOVOU(Mem{Base: AX}.Offset(48), X11) + PXOR(X0, X12) + PXOR(X1, X15) + PXOR(X2, X10) + PXOR(X3, X11) + PXOR(X4, X12) + PXOR(X5, X15) + PXOR(X6, X10) + PXOR(X7, X11) + MOVOU(X10, Mem{Base: AX}.Offset(32)) + MOVOU(X11, Mem{Base: AX}.Offset(48)) + + LEAQ(Mem{Base: SI}.Offset(128), RSI) + SUBQ(Imm(128), RDI) + JNE(LabelRef("loop")) + + MOVOU(X12, Mem{Base: AX}.Offset(0)) + MOVOU(X15, Mem{Base: AX}.Offset(16)) + + MOVQ(R8, Mem{Base: BX}.Offset(0)) + MOVQ(R9, Mem{Base: BX}.Offset(8)) + + RET() +} + +// #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +func iv0_DATA() Mem { + if iv0_DATA_ptr != nil { + return *iv0_DATA_ptr + } + + iv0 := GLOBL(ThatPeskyUnicodeDot+"iv0", NOPTR|RODATA) + iv0_DATA_ptr = &iv0 + DATA(0x00, U64(0x6a09e667f3bcc908)) + DATA(0x08, U64(0xbb67ae8584caa73b)) + return iv0 +} + +func iv1_DATA() Mem { + if iv1_DATA_ptr != nil { + return *iv1_DATA_ptr + } + + iv1 := GLOBL(ThatPeskyUnicodeDot+"iv1", NOPTR|RODATA) + iv1_DATA_ptr = &iv1 + DATA(0x00, U64(0x3c6ef372fe94f82b)) + DATA(0x08, U64(0xa54ff53a5f1d36f1)) + return iv1 +} + +func iv2_DATA() Mem { + if iv2_DATA_ptr != nil { + return *iv2_DATA_ptr + } + + iv2 := GLOBL(ThatPeskyUnicodeDot+"iv2", NOPTR|RODATA) + iv2_DATA_ptr = &iv2 + DATA(0x00, U64(0x510e527fade682d1)) + DATA(0x08, U64(0x9b05688c2b3e6c1f)) + return iv2 +} + +func iv3_DATA() Mem { + if iv3_DATA_ptr != nil { + return *iv3_DATA_ptr + } + + iv3 := GLOBL(ThatPeskyUnicodeDot+"iv3", NOPTR|RODATA) + iv3_DATA_ptr = &iv3 + DATA(0x00, U64(0x1f83d9abfb41bd6b)) + DATA(0x08, U64(0x5be0cd19137e2179)) + return iv3 +} + +func c40_DATA() Mem { + if c40_DATA_ptr != nil { + return *c40_DATA_ptr + } + + c40 := GLOBL(ThatPeskyUnicodeDot+"c40", NOPTR|RODATA) + c40_DATA_ptr = &c40 + DATA(0x00, U64(0x0201000706050403)) + DATA(0x08, U64(0x0a09080f0e0d0c0b)) + return c40 +} + +func c48_DATA() Mem { + if c48_DATA_ptr != nil { + return *c48_DATA_ptr + } + + c48 := GLOBL(ThatPeskyUnicodeDot+"c48", NOPTR|RODATA) + c48_DATA_ptr = &c48 + DATA(0x00, U64(0x0100070605040302)) + DATA(0x08, U64(0x09080f0e0d0c0b0a)) + return c48 +} diff --git a/blake2b/_asm/standard/go.mod b/blake2b/_asm/standard/go.mod new file mode 100644 index 0000000000..8063f1b9c3 --- /dev/null +++ b/blake2b/_asm/standard/go.mod @@ -0,0 +1,15 @@ +module blake2b/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/blake2b/_asm/standard/go.sum b/blake2b/_asm/standard/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/blake2b/_asm/standard/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/blake2b/blake2b_amd64.s b/blake2b/blake2b_amd64.s index adfac00c15..9a0ce21244 100644 --- a/blake2b/blake2b_amd64.s +++ b/blake2b/blake2b_amd64.s @@ -1,278 +1,1441 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - PADDQ m0, v0; \ - PADDQ m1, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v6, v6; \ - PSHUFD $0xB1, v7, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - PSHUFB c40, v2; \ - PSHUFB c40, v3; \ - PADDQ m2, v0; \ - PADDQ m3, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFB c48, v6; \ - PSHUFB c48, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - MOVOU v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVOU v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \ - MOVQ i0*8(src), m0; \ - PINSRQ $1, i1*8(src), m0; \ - MOVQ i2*8(src), m1; \ - PINSRQ $1, i3*8(src), m1; \ - MOVQ i4*8(src), m2; \ - PINSRQ $1, i5*8(src), m2; \ - MOVQ i6*8(src), m3; \ - PINSRQ $1, i7*8(src), m3 - // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - MOVOU ·iv3<>(SB), X0 - MOVO X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0) - - MOVOU ·c40<>(SB), X13 - MOVOU ·c48<>(SB), X14 - - MOVOU 0(AX), X12 +// Requires: SSE2, SSE4.1, SSSE3 +TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + MOVOU ·iv3<>+0(SB), X0 + MOVO X0, (R10) + XORQ CX, (R10) + MOVOU ·c40<>+0(SB), X13 + MOVOU ·c48<>+0(SB), X14 + MOVOU (AX), X12 MOVOU 16(AX), X15 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - MOVQ R8, X8 - PINSRQ $1, R9, X8 - - MOVO X12, X0 - MOVO X15, X1 - MOVOU 32(AX), X2 - MOVOU 48(AX), X3 - MOVOU ·iv0<>(SB), X4 - MOVOU ·iv1<>(SB), X5 - MOVOU ·iv2<>(SB), X6 - - PXOR X8, X6 - MOVO 0(R10), X7 - - LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) - MOVO X8, 16(R10) - MOVO X9, 32(R10) - MOVO X10, 48(R10) - MOVO X11, 64(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) - MOVO X8, 80(R10) - MOVO X9, 96(R10) - MOVO X10, 112(R10) - MOVO X11, 128(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) - MOVO X8, 144(R10) - MOVO X9, 160(R10) - MOVO X10, 176(R10) - MOVO X11, 192(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) - MOVO X8, 208(R10) - MOVO X9, 224(R10) - MOVO X10, 240(R10) - MOVO X11, 256(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + MOVQ R8, X8 + PINSRQ $0x01, R9, X8 + MOVO X12, X0 + MOVO X15, X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU ·iv0<>+0(SB), X4 + MOVOU ·iv1<>+0(SB), X5 + MOVOU ·iv2<>+0(SB), X6 + PXOR X8, X6 + MOVO (R10), X7 + MOVQ (SI), X8 + PINSRQ $0x01, 16(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 48(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 40(SI), X11 + PINSRQ $0x01, 56(SI), X11 + MOVO X8, 16(R10) + MOVO X9, 32(R10) + MOVO X10, 48(R10) + MOVO X11, 64(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 64(SI), X8 + PINSRQ $0x01, 80(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 112(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 88(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 120(SI), X11 + MOVO X8, 80(R10) + MOVO X9, 96(R10) + MOVO X10, 112(R10) + MOVO X11, 128(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 112(SI), X8 + PINSRQ $0x01, 32(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 80(SI), X10 + PINSRQ $0x01, 64(SI), X10 + MOVQ 120(SI), X11 + PINSRQ $0x01, 48(SI), X11 + MOVO X8, 144(R10) + MOVO X9, 160(R10) + MOVO X10, 176(R10) + MOVO X11, 192(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 8(SI), X8 + PINSRQ $0x01, (SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, 40(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 16(SI), X10 + MOVQ 56(SI), X11 + PINSRQ $0x01, 24(SI), X11 + MOVO X8, 208(R10) + MOVO X9, 224(R10) + MOVO X10, 240(R10) + MOVO X11, 256(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 88(SI), X8 + PINSRQ $0x01, 96(SI), X8 + MOVQ 40(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 64(SI), X10 + PINSRQ $0x01, (SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 80(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 72(SI), X9 + MOVQ 112(SI), X10 + PINSRQ $0x01, 48(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 32(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 56(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 104(SI), X9 + PINSRQ $0x01, 88(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 8(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, 112(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 16(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 48(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ (SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 72(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 16(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 120(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 112(SI), X8 + PINSRQ $0x01, 88(SI), X8 + MOVQ 48(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 96(SI), X10 + MOVQ 64(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 16(SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ (SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ 88(SI), X11 + PINSRQ $0x01, 24(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 32(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 120(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 104(SI), X10 + PINSRQ $0x01, 40(SI), X10 + MOVQ 112(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 96(SI), X8 + PINSRQ $0x01, 8(SI), X8 + MOVQ 112(SI), X9 + PINSRQ $0x01, 32(SI), X9 + MOVQ 40(SI), X10 + PINSRQ $0x01, 120(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ (SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 56(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 88(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 104(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 40(SI), X8 + PINSRQ $0x01, 120(SI), X8 + MOVQ 64(SI), X9 + PINSRQ $0x01, 16(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 48(SI), X8 + PINSRQ $0x01, 112(SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, (SI), X9 + MOVQ 120(SI), X10 + PINSRQ $0x01, 72(SI), X10 + MOVQ 24(SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 96(SI), X8 + PINSRQ $0x01, 104(SI), X8 + MOVQ 8(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 80(SI), X8 + PINSRQ $0x01, 64(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 120(SI), X8 + PINSRQ $0x01, 72(SI), X8 + MOVQ 24(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, (SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 16(R10), X0 + PADDQ 32(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 48(R10), X0 + PADDQ 64(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 80(R10), X0 + PADDQ 96(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 112(R10), X0 + PADDQ 128(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 144(R10), X0 + PADDQ 160(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 176(R10), X0 + PADDQ 192(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 208(R10), X0 + PADDQ 224(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 240(R10), X0 + PADDQ 256(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU 32(AX), X10 + MOVOU 48(AX), X11 + PXOR X0, X12 + PXOR X1, X15 + PXOR X2, X10 + PXOR X3, X11 + PXOR X4, X12 + PXOR X5, X15 + PXOR X6, X10 + PXOR X7, X11 + MOVOU X10, 32(AX) + MOVOU X11, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVOU X12, (AX) + MOVOU X15, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + RET - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) +DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·iv3<>(SB), RODATA|NOPTR, $16 - MOVOU 32(AX), X10 - MOVOU 48(AX), X11 - PXOR X0, X12 - PXOR X1, X15 - PXOR X2, X10 - PXOR X3, X11 - PXOR X4, X12 - PXOR X5, X15 - PXOR X6, X10 - PXOR X7, X11 - MOVOU X10, 32(AX) - MOVOU X11, 48(AX) +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 - MOVOU X12, 0(AX) - MOVOU X15, 16(AX) +DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·iv0<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) +DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·iv1<>(SB), RODATA|NOPTR, $16 - RET +DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·iv2<>(SB), RODATA|NOPTR, $16 From 620dfbc770bb652335dab79ae80f6c9bdb1a7321 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Fri, 26 Jul 2024 02:08:45 -0400 Subject: [PATCH 74/95] salsa20/salsa: Port salsa20_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE":salsa20/salsa/salsa20_amd64.s) \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ salsa20/salsa/salsa20_amd64.s \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: Ica0bb06f8b074ad566a979d33ddc81d8a38491b1 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/601217 LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov --- salsa20/salsa/_asm/go.mod | 14 + salsa20/salsa/_asm/go.sum | 10 + salsa20/salsa/_asm/salsa20_amd64_asm.go | 932 ++++++++++++ salsa20/salsa/salsa20_amd64.s | 1742 +++++++++++------------ 4 files changed, 1827 insertions(+), 871 deletions(-) create mode 100644 salsa20/salsa/_asm/go.mod create mode 100644 salsa20/salsa/_asm/go.sum create mode 100644 salsa20/salsa/_asm/salsa20_amd64_asm.go diff --git a/salsa20/salsa/_asm/go.mod b/salsa20/salsa/_asm/go.mod new file mode 100644 index 0000000000..0cf7f76881 --- /dev/null +++ b/salsa20/salsa/_asm/go.mod @@ -0,0 +1,14 @@ +module salsa20/salsa/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/salsa20/salsa/_asm/go.sum b/salsa20/salsa/_asm/go.sum new file mode 100644 index 0000000000..e5970800fb --- /dev/null +++ b/salsa20/salsa/_asm/go.sum @@ -0,0 +1,10 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/salsa20/salsa/_asm/salsa20_amd64_asm.go b/salsa20/salsa/_asm/salsa20_amd64_asm.go new file mode 100644 index 0000000000..6546791c4c --- /dev/null +++ b/salsa20/salsa/_asm/salsa20_amd64_asm.go @@ -0,0 +1,932 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This code was translated into a form compatible with 6a from the public +// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/salsa20/salsa" +) + +//go:generate go run . -out ../salsa20_amd64.s -pkg salsa + +func main() { + Package("golang.org/x/crypto/salsa20/salsa") + ConstraintExpr("amd64,!purego,gc") + salsa2020XORKeyStream() + Generate() +} + +func salsa2020XORKeyStream() { + Implement("salsa2020XORKeyStream") + Attributes(0) + AllocLocal(456) // frame = 424 + 32 byte alignment + Comment("This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.") + + Load(Param("out"), RDI) + Load(Param("in"), RSI) + Load(Param("n"), RDX) + Load(Param("nonce"), RCX) + Load(Param("key"), R8) + + MOVQ(RSP, R12) + ADDQ(Imm(31), R12) + ANDQ(I32(^31), R12) + + MOVQ(RDX, R9) + MOVQ(RCX, RDX) + MOVQ(R8, R10) + CMPQ(R9, Imm(0)) + JBE(LabelRef("DONE")) + + START() + BYTESATLEAST256() + MAINLOOP1() + BYTESBETWEEN1AND255() + NOCOPY() + MAINLOOP2() + + Label("BYTESATLEAST64") + Label("DONE") + RET() + Label("BYTESATLEAST65") + SUBQ(Imm(64), R9) + ADDQ(Imm(64), RDI) + ADDQ(Imm(64), RSI) + JMP(LabelRef("BYTESBETWEEN1AND255")) +} + +func START() { + Label("START") + MOVL(Mem{Base: R10}.Offset(20), ECX) + MOVL(Mem{Base: R10}.Offset(0), R8L) + MOVL(Mem{Base: EDX}.Offset(0), EAX) + MOVL(Mem{Base: R10}.Offset(16), R11L) + MOVL(ECX, Mem{Base: R12}.Offset(0)) + MOVL(R8L, Mem{Base: R12}.Offset(4)) + MOVL(EAX, Mem{Base: R12}.Offset(8)) + MOVL(R11L, Mem{Base: R12}.Offset(12)) + MOVL(Mem{Base: EDX}.Offset(8), ECX) + MOVL(Mem{Base: R10}.Offset(24), R8L) + MOVL(Mem{Base: R10}.Offset(4), EAX) + MOVL(Mem{Base: EDX}.Offset(4), R11L) + MOVL(ECX, Mem{Base: R12}.Offset(16)) + MOVL(R8L, Mem{Base: R12}.Offset(20)) + MOVL(EAX, Mem{Base: R12}.Offset(24)) + MOVL(R11L, Mem{Base: R12}.Offset(28)) + MOVL(Mem{Base: EDX}.Offset(12), ECX) + MOVL(Mem{Base: R10}.Offset(12), EDX) + MOVL(Mem{Base: R10}.Offset(28), R8L) + MOVL(Mem{Base: R10}.Offset(8), EAX) + MOVL(EDX, Mem{Base: R12}.Offset(32)) + MOVL(ECX, Mem{Base: R12}.Offset(36)) + MOVL(R8L, Mem{Base: R12}.Offset(40)) + MOVL(EAX, Mem{Base: R12}.Offset(44)) + MOVQ(Imm(1634760805), RDX) + MOVQ(Imm(857760878), RCX) + MOVQ(Imm(2036477234), R8) + MOVQ(Imm(1797285236), RAX) + MOVL(EDX, Mem{Base: R12}.Offset(48)) + MOVL(ECX, Mem{Base: R12}.Offset(52)) + MOVL(R8L, Mem{Base: R12}.Offset(56)) + MOVL(EAX, Mem{Base: R12}.Offset(60)) + CMPQ(R9, U32(256)) + JB(LabelRef("BYTESBETWEEN1AND255")) + MOVOA(Mem{Base: R12}.Offset(48), X0) + PSHUFL(Imm(0x55), X0, X1) + PSHUFL(Imm(0xAA), X0, X2) + PSHUFL(Imm(0xFF), X0, X3) + PSHUFL(Imm(0x00), X0, X0) + MOVOA(X1, Mem{Base: R12}.Offset(64)) + MOVOA(X2, Mem{Base: R12}.Offset(80)) + MOVOA(X3, Mem{Base: R12}.Offset(96)) + MOVOA(X0, Mem{Base: R12}.Offset(112)) + MOVOA(Mem{Base: R12}.Offset(0), X0) + PSHUFL(Imm(0xAA), X0, X1) + PSHUFL(Imm(0xFF), X0, X2) + PSHUFL(Imm(0x00), X0, X3) + PSHUFL(Imm(0x55), X0, X0) + MOVOA(X1, Mem{Base: R12}.Offset(128)) + MOVOA(X2, Mem{Base: R12}.Offset(144)) + MOVOA(X3, Mem{Base: R12}.Offset(160)) + MOVOA(X0, Mem{Base: R12}.Offset(176)) + MOVOA(Mem{Base: R12}.Offset(16), X0) + PSHUFL(Imm(0xFF), X0, X1) + PSHUFL(Imm(0x55), X0, X2) + PSHUFL(Imm(0xAA), X0, X0) + MOVOA(X1, Mem{Base: R12}.Offset(192)) + MOVOA(X2, Mem{Base: R12}.Offset(208)) + MOVOA(X0, Mem{Base: R12}.Offset(224)) + MOVOA(Mem{Base: R12}.Offset(32), X0) + PSHUFL(Imm(0x00), X0, X1) + PSHUFL(Imm(0xAA), X0, X2) + PSHUFL(Imm(0xFF), X0, X0) + MOVOA(X1, Mem{Base: R12}.Offset(240)) + MOVOA(X2, Mem{Base: R12}.Offset(256)) + MOVOA(X0, Mem{Base: R12}.Offset(272)) + +} + +func BYTESATLEAST256() { + Label("BYTESATLEAST256") + MOVL(Mem{Base: R12}.Offset(16), EDX) + MOVL(Mem{Base: R12}.Offset(36), ECX) + MOVL(EDX, Mem{Base: R12}.Offset(288)) + MOVL(ECX, Mem{Base: R12}.Offset(304)) + SHLQ(Imm(32), RCX) + ADDQ(RCX, RDX) + ADDQ(Imm(1), RDX) + MOVQ(RDX, RCX) + SHRQ(Imm(32), RCX) + MOVL(EDX, Mem{Base: R12}.Offset(292)) + MOVL(ECX, Mem{Base: R12}.Offset(308)) + ADDQ(Imm(1), RDX) + MOVQ(RDX, RCX) + SHRQ(Imm(32), RCX) + MOVL(EDX, Mem{Base: R12}.Offset(296)) + MOVL(ECX, Mem{Base: R12}.Offset(312)) + ADDQ(Imm(1), RDX) + MOVQ(RDX, RCX) + SHRQ(Imm(32), RCX) + MOVL(EDX, Mem{Base: R12}.Offset(300)) + MOVL(ECX, Mem{Base: R12}.Offset(316)) + ADDQ(Imm(1), RDX) + MOVQ(RDX, RCX) + SHRQ(Imm(32), RCX) + MOVL(EDX, Mem{Base: R12}.Offset(16)) + MOVL(ECX, Mem{Base: R12}.Offset(36)) + MOVQ(R9, Mem{Base: R12}.Offset(352)) + MOVQ(U32(20), RDX) + MOVOA(Mem{Base: R12}.Offset(64), X0) + MOVOA(Mem{Base: R12}.Offset(80), X1) + MOVOA(Mem{Base: R12}.Offset(96), X2) + MOVOA(Mem{Base: R12}.Offset(256), X3) + MOVOA(Mem{Base: R12}.Offset(272), X4) + MOVOA(Mem{Base: R12}.Offset(128), X5) + MOVOA(Mem{Base: R12}.Offset(144), X6) + MOVOA(Mem{Base: R12}.Offset(176), X7) + MOVOA(Mem{Base: R12}.Offset(192), X8) + MOVOA(Mem{Base: R12}.Offset(208), X9) + MOVOA(Mem{Base: R12}.Offset(224), X10) + MOVOA(Mem{Base: R12}.Offset(304), X11) + MOVOA(Mem{Base: R12}.Offset(112), X12) + MOVOA(Mem{Base: R12}.Offset(160), X13) + MOVOA(Mem{Base: R12}.Offset(240), X14) + MOVOA(Mem{Base: R12}.Offset(288), X15) +} + +func MAINLOOP1() { + Label("MAINLOOP1") + MOVOA(X1, Mem{Base: R12}.Offset(320)) + MOVOA(X2, Mem{Base: R12}.Offset(336)) + MOVOA(X13, X1) + PADDL(X12, X1) + MOVOA(X1, X2) + PSLLL(Imm(7), X1) + PXOR(X1, X14) + PSRLL(Imm(25), X2) + PXOR(X2, X14) + MOVOA(X7, X1) + PADDL(X0, X1) + MOVOA(X1, X2) + PSLLL(Imm(7), X1) + PXOR(X1, X11) + PSRLL(Imm(25), X2) + PXOR(X2, X11) + MOVOA(X12, X1) + PADDL(X14, X1) + MOVOA(X1, X2) + PSLLL(Imm(9), X1) + PXOR(X1, X15) + PSRLL(Imm(23), X2) + PXOR(X2, X15) + MOVOA(X0, X1) + PADDL(X11, X1) + MOVOA(X1, X2) + PSLLL(Imm(9), X1) + PXOR(X1, X9) + PSRLL(Imm(23), X2) + PXOR(X2, X9) + MOVOA(X14, X1) + PADDL(X15, X1) + MOVOA(X1, X2) + PSLLL(Imm(13), X1) + PXOR(X1, X13) + PSRLL(Imm(19), X2) + PXOR(X2, X13) + MOVOA(X11, X1) + PADDL(X9, X1) + MOVOA(X1, X2) + PSLLL(Imm(13), X1) + PXOR(X1, X7) + PSRLL(Imm(19), X2) + PXOR(X2, X7) + MOVOA(X15, X1) + PADDL(X13, X1) + MOVOA(X1, X2) + PSLLL(Imm(18), X1) + PXOR(X1, X12) + PSRLL(Imm(14), X2) + PXOR(X2, X12) + MOVOA(Mem{Base: R12}.Offset(320), X1) + MOVOA(X12, Mem{Base: R12}.Offset(320)) + MOVOA(X9, X2) + PADDL(X7, X2) + MOVOA(X2, X12) + PSLLL(Imm(18), X2) + PXOR(X2, X0) + PSRLL(Imm(14), X12) + PXOR(X12, X0) + MOVOA(X5, X2) + PADDL(X1, X2) + MOVOA(X2, X12) + PSLLL(Imm(7), X2) + PXOR(X2, X3) + PSRLL(Imm(25), X12) + PXOR(X12, X3) + MOVOA(Mem{Base: R12}.Offset(336), X2) + MOVOA(X0, Mem{Base: R12}.Offset(336)) + MOVOA(X6, X0) + PADDL(X2, X0) + MOVOA(X0, X12) + PSLLL(Imm(7), X0) + PXOR(X0, X4) + PSRLL(Imm(25), X12) + PXOR(X12, X4) + MOVOA(X1, X0) + PADDL(X3, X0) + MOVOA(X0, X12) + PSLLL(Imm(9), X0) + PXOR(X0, X10) + PSRLL(Imm(23), X12) + PXOR(X12, X10) + MOVOA(X2, X0) + PADDL(X4, X0) + MOVOA(X0, X12) + PSLLL(Imm(9), X0) + PXOR(X0, X8) + PSRLL(Imm(23), X12) + PXOR(X12, X8) + MOVOA(X3, X0) + PADDL(X10, X0) + MOVOA(X0, X12) + PSLLL(Imm(13), X0) + PXOR(X0, X5) + PSRLL(Imm(19), X12) + PXOR(X12, X5) + MOVOA(X4, X0) + PADDL(X8, X0) + MOVOA(X0, X12) + PSLLL(Imm(13), X0) + PXOR(X0, X6) + PSRLL(Imm(19), X12) + PXOR(X12, X6) + MOVOA(X10, X0) + PADDL(X5, X0) + MOVOA(X0, X12) + PSLLL(Imm(18), X0) + PXOR(X0, X1) + PSRLL(Imm(14), X12) + PXOR(X12, X1) + MOVOA(Mem{Base: R12}.Offset(320), X0) + MOVOA(X1, Mem{Base: R12}.Offset(320)) + MOVOA(X4, X1) + PADDL(X0, X1) + MOVOA(X1, X12) + PSLLL(Imm(7), X1) + PXOR(X1, X7) + PSRLL(Imm(25), X12) + PXOR(X12, X7) + MOVOA(X8, X1) + PADDL(X6, X1) + MOVOA(X1, X12) + PSLLL(Imm(18), X1) + PXOR(X1, X2) + PSRLL(Imm(14), X12) + PXOR(X12, X2) + MOVOA(Mem{Base: R12}.Offset(336), X12) + MOVOA(X2, Mem{Base: R12}.Offset(336)) + MOVOA(X14, X1) + PADDL(X12, X1) + MOVOA(X1, X2) + PSLLL(Imm(7), X1) + PXOR(X1, X5) + PSRLL(Imm(25), X2) + PXOR(X2, X5) + MOVOA(X0, X1) + PADDL(X7, X1) + MOVOA(X1, X2) + PSLLL(Imm(9), X1) + PXOR(X1, X10) + PSRLL(Imm(23), X2) + PXOR(X2, X10) + MOVOA(X12, X1) + PADDL(X5, X1) + MOVOA(X1, X2) + PSLLL(Imm(9), X1) + PXOR(X1, X8) + PSRLL(Imm(23), X2) + PXOR(X2, X8) + MOVOA(X7, X1) + PADDL(X10, X1) + MOVOA(X1, X2) + PSLLL(Imm(13), X1) + PXOR(X1, X4) + PSRLL(Imm(19), X2) + PXOR(X2, X4) + MOVOA(X5, X1) + PADDL(X8, X1) + MOVOA(X1, X2) + PSLLL(Imm(13), X1) + PXOR(X1, X14) + PSRLL(Imm(19), X2) + PXOR(X2, X14) + MOVOA(X10, X1) + PADDL(X4, X1) + MOVOA(X1, X2) + PSLLL(Imm(18), X1) + PXOR(X1, X0) + PSRLL(Imm(14), X2) + PXOR(X2, X0) + MOVOA(Mem{Base: R12}.Offset(320), X1) + MOVOA(X0, Mem{Base: R12}.Offset(320)) + MOVOA(X8, X0) + PADDL(X14, X0) + MOVOA(X0, X2) + PSLLL(Imm(18), X0) + PXOR(X0, X12) + PSRLL(Imm(14), X2) + PXOR(X2, X12) + MOVOA(X11, X0) + PADDL(X1, X0) + MOVOA(X0, X2) + PSLLL(Imm(7), X0) + PXOR(X0, X6) + PSRLL(Imm(25), X2) + PXOR(X2, X6) + MOVOA(Mem{Base: R12}.Offset(336), X2) + MOVOA(X12, Mem{Base: R12}.Offset(336)) + MOVOA(X3, X0) + PADDL(X2, X0) + MOVOA(X0, X12) + PSLLL(Imm(7), X0) + PXOR(X0, X13) + PSRLL(Imm(25), X12) + PXOR(X12, X13) + MOVOA(X1, X0) + PADDL(X6, X0) + MOVOA(X0, X12) + PSLLL(Imm(9), X0) + PXOR(X0, X15) + PSRLL(Imm(23), X12) + PXOR(X12, X15) + MOVOA(X2, X0) + PADDL(X13, X0) + MOVOA(X0, X12) + PSLLL(Imm(9), X0) + PXOR(X0, X9) + PSRLL(Imm(23), X12) + PXOR(X12, X9) + MOVOA(X6, X0) + PADDL(X15, X0) + MOVOA(X0, X12) + PSLLL(Imm(13), X0) + PXOR(X0, X11) + PSRLL(Imm(19), X12) + PXOR(X12, X11) + MOVOA(X13, X0) + PADDL(X9, X0) + MOVOA(X0, X12) + PSLLL(Imm(13), X0) + PXOR(X0, X3) + PSRLL(Imm(19), X12) + PXOR(X12, X3) + MOVOA(X15, X0) + PADDL(X11, X0) + MOVOA(X0, X12) + PSLLL(Imm(18), X0) + PXOR(X0, X1) + PSRLL(Imm(14), X12) + PXOR(X12, X1) + MOVOA(X9, X0) + PADDL(X3, X0) + MOVOA(X0, X12) + PSLLL(Imm(18), X0) + PXOR(X0, X2) + PSRLL(Imm(14), X12) + PXOR(X12, X2) + MOVOA(Mem{Base: R12}.Offset(320), X12) + MOVOA(Mem{Base: R12}.Offset(336), X0) + SUBQ(Imm(2), RDX) + JA(LabelRef("MAINLOOP1")) + PADDL(Mem{Base: R12}.Offset(112), X12) + PADDL(Mem{Base: R12}.Offset(176), X7) + PADDL(Mem{Base: R12}.Offset(224), X10) + PADDL(Mem{Base: R12}.Offset(272), X4) + MOVD(X12, EDX) + MOVD(X7, ECX) + MOVD(X10, R8) + MOVD(X4, R9) + PSHUFL(Imm(0x39), X12, X12) + PSHUFL(Imm(0x39), X7, X7) + PSHUFL(Imm(0x39), X10, X10) + PSHUFL(Imm(0x39), X4, X4) + XORL(Mem{Base: SI}.Offset(0), EDX) + XORL(Mem{Base: SI}.Offset(4), ECX) + XORL(Mem{Base: SI}.Offset(8), R8L) + XORL(Mem{Base: SI}.Offset(12), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(0)) + MOVL(ECX, Mem{Base: DI}.Offset(4)) + MOVL(R8L, Mem{Base: DI}.Offset(8)) + MOVL(R9L, Mem{Base: DI}.Offset(12)) + MOVD(X12, EDX) + MOVD(X7, ECX) + MOVD(X10, R8) + MOVD(X4, R9) + PSHUFL(Imm(0x39), X12, X12) + PSHUFL(Imm(0x39), X7, X7) + PSHUFL(Imm(0x39), X10, X10) + PSHUFL(Imm(0x39), X4, X4) + XORL(Mem{Base: SI}.Offset(64), EDX) + XORL(Mem{Base: SI}.Offset(68), ECX) + XORL(Mem{Base: SI}.Offset(72), R8L) + XORL(Mem{Base: SI}.Offset(76), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(64)) + MOVL(ECX, Mem{Base: DI}.Offset(68)) + MOVL(R8L, Mem{Base: DI}.Offset(72)) + MOVL(R9L, Mem{Base: DI}.Offset(76)) + MOVD(X12, EDX) + MOVD(X7, ECX) + MOVD(X10, R8) + MOVD(X4, R9) + PSHUFL(Imm(0x39), X12, X12) + PSHUFL(Imm(0x39), X7, X7) + PSHUFL(Imm(0x39), X10, X10) + PSHUFL(Imm(0x39), X4, X4) + XORL(Mem{Base: SI}.Offset(128), EDX) + XORL(Mem{Base: SI}.Offset(132), ECX) + XORL(Mem{Base: SI}.Offset(136), R8L) + XORL(Mem{Base: SI}.Offset(140), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(128)) + MOVL(ECX, Mem{Base: DI}.Offset(132)) + MOVL(R8L, Mem{Base: DI}.Offset(136)) + MOVL(R9L, Mem{Base: DI}.Offset(140)) + MOVD(X12, EDX) + MOVD(X7, ECX) + MOVD(X10, R8) + MOVD(X4, R9) + XORL(Mem{Base: SI}.Offset(192), EDX) + XORL(Mem{Base: SI}.Offset(196), ECX) + XORL(Mem{Base: SI}.Offset(200), R8L) + XORL(Mem{Base: SI}.Offset(204), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(192)) + MOVL(ECX, Mem{Base: DI}.Offset(196)) + MOVL(R8L, Mem{Base: DI}.Offset(200)) + MOVL(R9L, Mem{Base: DI}.Offset(204)) + PADDL(Mem{Base: R12}.Offset(240), X14) + PADDL(Mem{Base: R12}.Offset(64), X0) + PADDL(Mem{Base: R12}.Offset(128), X5) + PADDL(Mem{Base: R12}.Offset(192), X8) + MOVD(X14, EDX) + MOVD(X0, ECX) + MOVD(X5, R8) + MOVD(X8, R9) + PSHUFL(Imm(0x39), X14, X14) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X5, X5) + PSHUFL(Imm(0x39), X8, X8) + XORL(Mem{Base: SI}.Offset(16), EDX) + XORL(Mem{Base: SI}.Offset(20), ECX) + XORL(Mem{Base: SI}.Offset(24), R8L) + XORL(Mem{Base: SI}.Offset(28), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(16)) + MOVL(ECX, Mem{Base: DI}.Offset(20)) + MOVL(R8L, Mem{Base: DI}.Offset(24)) + MOVL(R9L, Mem{Base: DI}.Offset(28)) + MOVD(X14, EDX) + MOVD(X0, ECX) + MOVD(X5, R8) + MOVD(X8, R9) + PSHUFL(Imm(0x39), X14, X14) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X5, X5) + PSHUFL(Imm(0x39), X8, X8) + XORL(Mem{Base: SI}.Offset(80), EDX) + XORL(Mem{Base: SI}.Offset(84), ECX) + XORL(Mem{Base: SI}.Offset(88), R8L) + XORL(Mem{Base: SI}.Offset(92), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(80)) + MOVL(ECX, Mem{Base: DI}.Offset(84)) + MOVL(R8L, Mem{Base: DI}.Offset(88)) + MOVL(R9L, Mem{Base: DI}.Offset(92)) + MOVD(X14, EDX) + MOVD(X0, ECX) + MOVD(X5, R8) + MOVD(X8, R9) + PSHUFL(Imm(0x39), X14, X14) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X5, X5) + PSHUFL(Imm(0x39), X8, X8) + XORL(Mem{Base: SI}.Offset(144), EDX) + XORL(Mem{Base: SI}.Offset(148), ECX) + XORL(Mem{Base: SI}.Offset(152), R8L) + XORL(Mem{Base: SI}.Offset(156), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(144)) + MOVL(ECX, Mem{Base: DI}.Offset(148)) + MOVL(R8L, Mem{Base: DI}.Offset(152)) + MOVL(R9L, Mem{Base: DI}.Offset(156)) + MOVD(X14, EDX) + MOVD(X0, ECX) + MOVD(X5, R8) + MOVD(X8, R9) + XORL(Mem{Base: SI}.Offset(208), EDX) + XORL(Mem{Base: SI}.Offset(212), ECX) + XORL(Mem{Base: SI}.Offset(216), R8L) + XORL(Mem{Base: SI}.Offset(220), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(208)) + MOVL(ECX, Mem{Base: DI}.Offset(212)) + MOVL(R8L, Mem{Base: DI}.Offset(216)) + MOVL(R9L, Mem{Base: DI}.Offset(220)) + PADDL(Mem{Base: R12}.Offset(288), X15) + PADDL(Mem{Base: R12}.Offset(304), X11) + PADDL(Mem{Base: R12}.Offset(80), X1) + PADDL(Mem{Base: R12}.Offset(144), X6) + MOVD(X15, EDX) + MOVD(X11, ECX) + MOVD(X1, R8) + MOVD(X6, R9) + PSHUFL(Imm(0x39), X15, X15) + PSHUFL(Imm(0x39), X11, X11) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X6, X6) + XORL(Mem{Base: SI}.Offset(32), EDX) + XORL(Mem{Base: SI}.Offset(36), ECX) + XORL(Mem{Base: SI}.Offset(40), R8L) + XORL(Mem{Base: SI}.Offset(44), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(32)) + MOVL(ECX, Mem{Base: DI}.Offset(36)) + MOVL(R8L, Mem{Base: DI}.Offset(40)) + MOVL(R9L, Mem{Base: DI}.Offset(44)) + MOVD(X15, EDX) + MOVD(X11, ECX) + MOVD(X1, R8) + MOVD(X6, R9) + PSHUFL(Imm(0x39), X15, X15) + PSHUFL(Imm(0x39), X11, X11) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X6, X6) + XORL(Mem{Base: SI}.Offset(96), EDX) + XORL(Mem{Base: SI}.Offset(100), ECX) + XORL(Mem{Base: SI}.Offset(104), R8L) + XORL(Mem{Base: SI}.Offset(108), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(96)) + MOVL(ECX, Mem{Base: DI}.Offset(100)) + MOVL(R8L, Mem{Base: DI}.Offset(104)) + MOVL(R9L, Mem{Base: DI}.Offset(108)) + MOVD(X15, EDX) + MOVD(X11, ECX) + MOVD(X1, R8) + MOVD(X6, R9) + PSHUFL(Imm(0x39), X15, X15) + PSHUFL(Imm(0x39), X11, X11) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X6, X6) + XORL(Mem{Base: SI}.Offset(160), EDX) + XORL(Mem{Base: SI}.Offset(164), ECX) + XORL(Mem{Base: SI}.Offset(168), R8L) + XORL(Mem{Base: SI}.Offset(172), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(160)) + MOVL(ECX, Mem{Base: DI}.Offset(164)) + MOVL(R8L, Mem{Base: DI}.Offset(168)) + MOVL(R9L, Mem{Base: DI}.Offset(172)) + MOVD(X15, EDX) + MOVD(X11, ECX) + MOVD(X1, R8) + MOVD(X6, R9) + XORL(Mem{Base: SI}.Offset(224), EDX) + XORL(Mem{Base: SI}.Offset(228), ECX) + XORL(Mem{Base: SI}.Offset(232), R8L) + XORL(Mem{Base: SI}.Offset(236), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(224)) + MOVL(ECX, Mem{Base: DI}.Offset(228)) + MOVL(R8L, Mem{Base: DI}.Offset(232)) + MOVL(R9L, Mem{Base: DI}.Offset(236)) + PADDL(Mem{Base: R12}.Offset(160), X13) + PADDL(Mem{Base: R12}.Offset(208), X9) + PADDL(Mem{Base: R12}.Offset(256), X3) + PADDL(Mem{Base: R12}.Offset(96), X2) + MOVD(X13, EDX) + MOVD(X9, ECX) + MOVD(X3, R8) + MOVD(X2, R9) + PSHUFL(Imm(0x39), X13, X13) + PSHUFL(Imm(0x39), X9, X9) + PSHUFL(Imm(0x39), X3, X3) + PSHUFL(Imm(0x39), X2, X2) + XORL(Mem{Base: SI}.Offset(48), EDX) + XORL(Mem{Base: SI}.Offset(52), ECX) + XORL(Mem{Base: SI}.Offset(56), R8L) + XORL(Mem{Base: SI}.Offset(60), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(48)) + MOVL(ECX, Mem{Base: DI}.Offset(52)) + MOVL(R8L, Mem{Base: DI}.Offset(56)) + MOVL(R9L, Mem{Base: DI}.Offset(60)) + MOVD(X13, EDX) + MOVD(X9, ECX) + MOVD(X3, R8) + MOVD(X2, R9) + PSHUFL(Imm(0x39), X13, X13) + PSHUFL(Imm(0x39), X9, X9) + PSHUFL(Imm(0x39), X3, X3) + PSHUFL(Imm(0x39), X2, X2) + XORL(Mem{Base: SI}.Offset(112), EDX) + XORL(Mem{Base: SI}.Offset(116), ECX) + XORL(Mem{Base: SI}.Offset(120), R8L) + XORL(Mem{Base: SI}.Offset(124), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(112)) + MOVL(ECX, Mem{Base: DI}.Offset(116)) + MOVL(R8L, Mem{Base: DI}.Offset(120)) + MOVL(R9L, Mem{Base: DI}.Offset(124)) + MOVD(X13, EDX) + MOVD(X9, ECX) + MOVD(X3, R8) + MOVD(X2, R9) + PSHUFL(Imm(0x39), X13, X13) + PSHUFL(Imm(0x39), X9, X9) + PSHUFL(Imm(0x39), X3, X3) + PSHUFL(Imm(0x39), X2, X2) + XORL(Mem{Base: SI}.Offset(176), EDX) + XORL(Mem{Base: SI}.Offset(180), ECX) + XORL(Mem{Base: SI}.Offset(184), R8L) + XORL(Mem{Base: SI}.Offset(188), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(176)) + MOVL(ECX, Mem{Base: DI}.Offset(180)) + MOVL(R8L, Mem{Base: DI}.Offset(184)) + MOVL(R9L, Mem{Base: DI}.Offset(188)) + MOVD(X13, EDX) + MOVD(X9, ECX) + MOVD(X3, R8) + MOVD(X2, R9) + XORL(Mem{Base: SI}.Offset(240), EDX) + XORL(Mem{Base: SI}.Offset(244), ECX) + XORL(Mem{Base: SI}.Offset(248), R8L) + XORL(Mem{Base: SI}.Offset(252), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(240)) + MOVL(ECX, Mem{Base: DI}.Offset(244)) + MOVL(R8L, Mem{Base: DI}.Offset(248)) + MOVL(R9L, Mem{Base: DI}.Offset(252)) + MOVQ(Mem{Base: R12}.Offset(352), R9) + SUBQ(U32(256), R9) + ADDQ(U32(256), RSI) + ADDQ(U32(256), RDI) + CMPQ(R9, U32(256)) + JAE(LabelRef("BYTESATLEAST256")) + CMPQ(R9, Imm(0)) + JBE(LabelRef("DONE")) +} + +func BYTESBETWEEN1AND255() { + Label("BYTESBETWEEN1AND255") + CMPQ(R9, Imm(64)) + JAE(LabelRef("NOCOPY")) + MOVQ(RDI, RDX) + LEAQ(Mem{Base: R12}.Offset(360), RDI) + MOVQ(R9, RCX) + // Hack to get Avo to emit: + // REP; MOVSB + Instruction(&ir.Instruction{Opcode: "REP; MOVSB"}) + LEAQ(Mem{Base: R12}.Offset(360), RDI) + LEAQ(Mem{Base: R12}.Offset(360), RSI) +} + +func NOCOPY() { + Label("NOCOPY") + MOVQ(R9, Mem{Base: R12}.Offset(352)) + MOVOA(Mem{Base: R12}.Offset(48), X0) + MOVOA(Mem{Base: R12}.Offset(0), X1) + MOVOA(Mem{Base: R12}.Offset(16), X2) + MOVOA(Mem{Base: R12}.Offset(32), X3) + MOVOA(X1, X4) + MOVQ(U32(20), RCX) +} + +func MAINLOOP2() { + Label("MAINLOOP2") + PADDL(X0, X4) + MOVOA(X0, X5) + MOVOA(X4, X6) + PSLLL(Imm(7), X4) + PSRLL(Imm(25), X6) + PXOR(X4, X3) + PXOR(X6, X3) + PADDL(X3, X5) + MOVOA(X3, X4) + MOVOA(X5, X6) + PSLLL(Imm(9), X5) + PSRLL(Imm(23), X6) + PXOR(X5, X2) + PSHUFL(Imm(0x93), X3, X3) + PXOR(X6, X2) + PADDL(X2, X4) + MOVOA(X2, X5) + MOVOA(X4, X6) + PSLLL(Imm(13), X4) + PSRLL(Imm(19), X6) + PXOR(X4, X1) + PSHUFL(Imm(0x4E), X2, X2) + PXOR(X6, X1) + PADDL(X1, X5) + MOVOA(X3, X4) + MOVOA(X5, X6) + PSLLL(Imm(18), X5) + PSRLL(Imm(14), X6) + PXOR(X5, X0) + PSHUFL(Imm(0x39), X1, X1) + PXOR(X6, X0) + PADDL(X0, X4) + MOVOA(X0, X5) + MOVOA(X4, X6) + PSLLL(Imm(7), X4) + PSRLL(Imm(25), X6) + PXOR(X4, X1) + PXOR(X6, X1) + PADDL(X1, X5) + MOVOA(X1, X4) + MOVOA(X5, X6) + PSLLL(Imm(9), X5) + PSRLL(Imm(23), X6) + PXOR(X5, X2) + PSHUFL(Imm(0x93), X1, X1) + PXOR(X6, X2) + PADDL(X2, X4) + MOVOA(X2, X5) + MOVOA(X4, X6) + PSLLL(Imm(13), X4) + PSRLL(Imm(19), X6) + PXOR(X4, X3) + PSHUFL(Imm(0x4E), X2, X2) + PXOR(X6, X3) + PADDL(X3, X5) + MOVOA(X1, X4) + MOVOA(X5, X6) + PSLLL(Imm(18), X5) + PSRLL(Imm(14), X6) + PXOR(X5, X0) + PSHUFL(Imm(0x39), X3, X3) + PXOR(X6, X0) + PADDL(X0, X4) + MOVOA(X0, X5) + MOVOA(X4, X6) + PSLLL(Imm(7), X4) + PSRLL(Imm(25), X6) + PXOR(X4, X3) + PXOR(X6, X3) + PADDL(X3, X5) + MOVOA(X3, X4) + MOVOA(X5, X6) + PSLLL(Imm(9), X5) + PSRLL(Imm(23), X6) + PXOR(X5, X2) + PSHUFL(Imm(0x93), X3, X3) + PXOR(X6, X2) + PADDL(X2, X4) + MOVOA(X2, X5) + MOVOA(X4, X6) + PSLLL(Imm(13), X4) + PSRLL(Imm(19), X6) + PXOR(X4, X1) + PSHUFL(Imm(0x4E), X2, X2) + PXOR(X6, X1) + PADDL(X1, X5) + MOVOA(X3, X4) + MOVOA(X5, X6) + PSLLL(Imm(18), X5) + PSRLL(Imm(14), X6) + PXOR(X5, X0) + PSHUFL(Imm(0x39), X1, X1) + PXOR(X6, X0) + PADDL(X0, X4) + MOVOA(X0, X5) + MOVOA(X4, X6) + PSLLL(Imm(7), X4) + PSRLL(Imm(25), X6) + PXOR(X4, X1) + PXOR(X6, X1) + PADDL(X1, X5) + MOVOA(X1, X4) + MOVOA(X5, X6) + PSLLL(Imm(9), X5) + PSRLL(Imm(23), X6) + PXOR(X5, X2) + PSHUFL(Imm(0x93), X1, X1) + PXOR(X6, X2) + PADDL(X2, X4) + MOVOA(X2, X5) + MOVOA(X4, X6) + PSLLL(Imm(13), X4) + PSRLL(Imm(19), X6) + PXOR(X4, X3) + PSHUFL(Imm(0x4E), X2, X2) + PXOR(X6, X3) + SUBQ(Imm(4), RCX) + PADDL(X3, X5) + MOVOA(X1, X4) + MOVOA(X5, X6) + PSLLL(Imm(18), X5) + PXOR(X7, X7) + PSRLL(Imm(14), X6) + PXOR(X5, X0) + PSHUFL(Imm(0x39), X3, X3) + PXOR(X6, X0) + JA(LabelRef("MAINLOOP2")) + PADDL(Mem{Base: R12}.Offset(48), X0) + PADDL(Mem{Base: R12}.Offset(0), X1) + PADDL(Mem{Base: R12}.Offset(16), X2) + PADDL(Mem{Base: R12}.Offset(32), X3) + MOVD(X0, ECX) + MOVD(X1, R8) + MOVD(X2, R9) + MOVD(X3, EAX) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X2, X2) + PSHUFL(Imm(0x39), X3, X3) + XORL(Mem{Base: SI}.Offset(0), ECX) + XORL(Mem{Base: SI}.Offset(48), R8L) + XORL(Mem{Base: SI}.Offset(32), R9L) + XORL(Mem{Base: SI}.Offset(16), EAX) + MOVL(ECX, Mem{Base: DI}.Offset(0)) + MOVL(R8L, Mem{Base: DI}.Offset(48)) + MOVL(R9L, Mem{Base: DI}.Offset(32)) + MOVL(EAX, Mem{Base: DI}.Offset(16)) + MOVD(X0, ECX) + MOVD(X1, R8) + MOVD(X2, R9) + MOVD(X3, EAX) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X2, X2) + PSHUFL(Imm(0x39), X3, X3) + XORL(Mem{Base: SI}.Offset(20), ECX) + XORL(Mem{Base: SI}.Offset(4), R8L) + XORL(Mem{Base: SI}.Offset(52), R9L) + XORL(Mem{Base: SI}.Offset(36), EAX) + MOVL(ECX, Mem{Base: DI}.Offset(20)) + MOVL(R8L, Mem{Base: DI}.Offset(4)) + MOVL(R9L, Mem{Base: DI}.Offset(52)) + MOVL(EAX, Mem{Base: DI}.Offset(36)) + MOVD(X0, ECX) + MOVD(X1, R8) + MOVD(X2, R9) + MOVD(X3, EAX) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X2, X2) + PSHUFL(Imm(0x39), X3, X3) + XORL(Mem{Base: SI}.Offset(40), ECX) + XORL(Mem{Base: SI}.Offset(24), R8L) + XORL(Mem{Base: SI}.Offset(8), R9L) + XORL(Mem{Base: SI}.Offset(56), EAX) + MOVL(ECX, Mem{Base: DI}.Offset(40)) + MOVL(R8L, Mem{Base: DI}.Offset(24)) + MOVL(R9L, Mem{Base: DI}.Offset(8)) + MOVL(EAX, Mem{Base: DI}.Offset(56)) + MOVD(X0, ECX) + MOVD(X1, R8) + MOVD(X2, R9) + MOVD(X3, EAX) + XORL(Mem{Base: SI}.Offset(60), ECX) + XORL(Mem{Base: SI}.Offset(44), R8L) + XORL(Mem{Base: SI}.Offset(28), R9L) + XORL(Mem{Base: SI}.Offset(12), EAX) + MOVL(ECX, Mem{Base: DI}.Offset(60)) + MOVL(R8L, Mem{Base: DI}.Offset(44)) + MOVL(R9L, Mem{Base: DI}.Offset(28)) + MOVL(EAX, Mem{Base: DI}.Offset(12)) + MOVQ(Mem{Base: R12}.Offset(352), R9) + MOVL(Mem{Base: R12}.Offset(16), ECX) + MOVL(Mem{Base: R12}.Offset(36), R8L) + ADDQ(Imm(1), RCX) + SHLQ(Imm(32), R8) + ADDQ(R8, RCX) + MOVQ(RCX, R8) + SHRQ(Imm(32), R8) + MOVL(ECX, Mem{Base: R12}.Offset(16)) + MOVL(R8L, Mem{Base: R12}.Offset(36)) + CMPQ(R9, Imm(64)) + JA(LabelRef("BYTESATLEAST65")) + JAE(LabelRef("BYTESATLEAST64")) + MOVQ(RDI, RSI) + MOVQ(RDX, RDI) + MOVQ(R9, RCX) + // Hack to get Avo to emit: + // REP; MOVSB + Instruction(&ir.Instruction{Opcode: "REP; MOVSB"}) +} diff --git a/salsa20/salsa/salsa20_amd64.s b/salsa20/salsa/salsa20_amd64.s index fcce0234b6..3883e0ec22 100644 --- a/salsa20/salsa/salsa20_amd64.s +++ b/salsa20/salsa/salsa20_amd64.s @@ -1,880 +1,880 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run salsa20_amd64_asm.go -out ../salsa20_amd64.s -pkg salsa. DO NOT EDIT. //go:build amd64 && !purego && gc -// This code was translated into a form compatible with 6a from the public -// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html +// func salsa2020XORKeyStream(out *byte, in *byte, n uint64, nonce *byte, key *byte) +// Requires: SSE2 +TEXT ·salsa2020XORKeyStream(SB), $456-40 + // This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. + MOVQ out+0(FP), DI + MOVQ in+8(FP), SI + MOVQ n+16(FP), DX + MOVQ nonce+24(FP), CX + MOVQ key+32(FP), R8 + MOVQ SP, R12 + ADDQ $0x1f, R12 + ANDQ $-32, R12 + MOVQ DX, R9 + MOVQ CX, DX + MOVQ R8, R10 + CMPQ R9, $0x00 + JBE DONE + MOVL 20(R10), CX + MOVL (R10), R8 + MOVL (DX), AX + MOVL 16(R10), R11 + MOVL CX, (R12) + MOVL R8, 4(R12) + MOVL AX, 8(R12) + MOVL R11, 12(R12) + MOVL 8(DX), CX + MOVL 24(R10), R8 + MOVL 4(R10), AX + MOVL 4(DX), R11 + MOVL CX, 16(R12) + MOVL R8, 20(R12) + MOVL AX, 24(R12) + MOVL R11, 28(R12) + MOVL 12(DX), CX + MOVL 12(R10), DX + MOVL 28(R10), R8 + MOVL 8(R10), AX + MOVL DX, 32(R12) + MOVL CX, 36(R12) + MOVL R8, 40(R12) + MOVL AX, 44(R12) + MOVQ $0x61707865, DX + MOVQ $0x3320646e, CX + MOVQ $0x79622d32, R8 + MOVQ $0x6b206574, AX + MOVL DX, 48(R12) + MOVL CX, 52(R12) + MOVL R8, 56(R12) + MOVL AX, 60(R12) + CMPQ R9, $0x00000100 + JB BYTESBETWEEN1AND255 + MOVOA 48(R12), X0 + PSHUFL $0x55, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X3 + PSHUFL $0x00, X0, X0 + MOVOA X1, 64(R12) + MOVOA X2, 80(R12) + MOVOA X3, 96(R12) + MOVOA X0, 112(R12) + MOVOA (R12), X0 + PSHUFL $0xaa, X0, X1 + PSHUFL $0xff, X0, X2 + PSHUFL $0x00, X0, X3 + PSHUFL $0x55, X0, X0 + MOVOA X1, 128(R12) + MOVOA X2, 144(R12) + MOVOA X3, 160(R12) + MOVOA X0, 176(R12) + MOVOA 16(R12), X0 + PSHUFL $0xff, X0, X1 + PSHUFL $0x55, X0, X2 + PSHUFL $0xaa, X0, X0 + MOVOA X1, 192(R12) + MOVOA X2, 208(R12) + MOVOA X0, 224(R12) + MOVOA 32(R12), X0 + PSHUFL $0x00, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X0 + MOVOA X1, 240(R12) + MOVOA X2, 256(R12) + MOVOA X0, 272(R12) -// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte) -// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. -TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment - MOVQ out+0(FP),DI - MOVQ in+8(FP),SI - MOVQ n+16(FP),DX - MOVQ nonce+24(FP),CX - MOVQ key+32(FP),R8 +BYTESATLEAST256: + MOVL 16(R12), DX + MOVL 36(R12), CX + MOVL DX, 288(R12) + MOVL CX, 304(R12) + SHLQ $0x20, CX + ADDQ CX, DX + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 292(R12) + MOVL CX, 308(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 296(R12) + MOVL CX, 312(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 300(R12) + MOVL CX, 316(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 16(R12) + MOVL CX, 36(R12) + MOVQ R9, 352(R12) + MOVQ $0x00000014, DX + MOVOA 64(R12), X0 + MOVOA 80(R12), X1 + MOVOA 96(R12), X2 + MOVOA 256(R12), X3 + MOVOA 272(R12), X4 + MOVOA 128(R12), X5 + MOVOA 144(R12), X6 + MOVOA 176(R12), X7 + MOVOA 192(R12), X8 + MOVOA 208(R12), X9 + MOVOA 224(R12), X10 + MOVOA 304(R12), X11 + MOVOA 112(R12), X12 + MOVOA 160(R12), X13 + MOVOA 240(R12), X14 + MOVOA 288(R12), X15 - MOVQ SP,R12 - ADDQ $31, R12 - ANDQ $~31, R12 +MAINLOOP1: + MOVOA X1, 320(R12) + MOVOA X2, 336(R12) + MOVOA X13, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X14 + PSRLL $0x19, X2 + PXOR X2, X14 + MOVOA X7, X1 + PADDL X0, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X11 + PSRLL $0x19, X2 + PXOR X2, X11 + MOVOA X12, X1 + PADDL X14, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X15 + PSRLL $0x17, X2 + PXOR X2, X15 + MOVOA X0, X1 + PADDL X11, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X9 + PSRLL $0x17, X2 + PXOR X2, X9 + MOVOA X14, X1 + PADDL X15, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X13 + PSRLL $0x13, X2 + PXOR X2, X13 + MOVOA X11, X1 + PADDL X9, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X7 + PSRLL $0x13, X2 + PXOR X2, X7 + MOVOA X15, X1 + PADDL X13, X1 + MOVOA X1, X2 + PSLLL $0x12, X1 + PXOR X1, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA 320(R12), X1 + MOVOA X12, 320(R12) + MOVOA X9, X2 + PADDL X7, X2 + MOVOA X2, X12 + PSLLL $0x12, X2 + PXOR X2, X0 + PSRLL $0x0e, X12 + PXOR X12, X0 + MOVOA X5, X2 + PADDL X1, X2 + MOVOA X2, X12 + PSLLL $0x07, X2 + PXOR X2, X3 + PSRLL $0x19, X12 + PXOR X12, X3 + MOVOA 336(R12), X2 + MOVOA X0, 336(R12) + MOVOA X6, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X4 + PSRLL $0x19, X12 + PXOR X12, X4 + MOVOA X1, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X10 + PSRLL $0x17, X12 + PXOR X12, X10 + MOVOA X2, X0 + PADDL X4, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X8 + PSRLL $0x17, X12 + PXOR X12, X8 + MOVOA X3, X0 + PADDL X10, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X5 + PSRLL $0x13, X12 + PXOR X12, X5 + MOVOA X4, X0 + PADDL X8, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X6 + PSRLL $0x13, X12 + PXOR X12, X6 + MOVOA X10, X0 + PADDL X5, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA 320(R12), X0 + MOVOA X1, 320(R12) + MOVOA X4, X1 + PADDL X0, X1 + MOVOA X1, X12 + PSLLL $0x07, X1 + PXOR X1, X7 + PSRLL $0x19, X12 + PXOR X12, X7 + MOVOA X8, X1 + PADDL X6, X1 + MOVOA X1, X12 + PSLLL $0x12, X1 + PXOR X1, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 336(R12), X12 + MOVOA X2, 336(R12) + MOVOA X14, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X5 + PSRLL $0x19, X2 + PXOR X2, X5 + MOVOA X0, X1 + PADDL X7, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X10 + PSRLL $0x17, X2 + PXOR X2, X10 + MOVOA X12, X1 + PADDL X5, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X8 + PSRLL $0x17, X2 + PXOR X2, X8 + MOVOA X7, X1 + PADDL X10, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X4 + PSRLL $0x13, X2 + PXOR X2, X4 + MOVOA X5, X1 + PADDL X8, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X14 + PSRLL $0x13, X2 + PXOR X2, X14 + MOVOA X10, X1 + PADDL X4, X1 + MOVOA X1, X2 + PSLLL $0x12, X1 + PXOR X1, X0 + PSRLL $0x0e, X2 + PXOR X2, X0 + MOVOA 320(R12), X1 + MOVOA X0, 320(R12) + MOVOA X8, X0 + PADDL X14, X0 + MOVOA X0, X2 + PSLLL $0x12, X0 + PXOR X0, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA X11, X0 + PADDL X1, X0 + MOVOA X0, X2 + PSLLL $0x07, X0 + PXOR X0, X6 + PSRLL $0x19, X2 + PXOR X2, X6 + MOVOA 336(R12), X2 + MOVOA X12, 336(R12) + MOVOA X3, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X13 + PSRLL $0x19, X12 + PXOR X12, X13 + MOVOA X1, X0 + PADDL X6, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X15 + PSRLL $0x17, X12 + PXOR X12, X15 + MOVOA X2, X0 + PADDL X13, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X9 + PSRLL $0x17, X12 + PXOR X12, X9 + MOVOA X6, X0 + PADDL X15, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X11 + PSRLL $0x13, X12 + PXOR X12, X11 + MOVOA X13, X0 + PADDL X9, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X3 + PSRLL $0x13, X12 + PXOR X12, X3 + MOVOA X15, X0 + PADDL X11, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA X9, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 320(R12), X12 + MOVOA 336(R12), X0 + SUBQ $0x02, DX + JA MAINLOOP1 + PADDL 112(R12), X12 + PADDL 176(R12), X7 + PADDL 224(R12), X10 + PADDL 272(R12), X4 + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL (SI), DX + XORL 4(SI), CX + XORL 8(SI), R8 + XORL 12(SI), R9 + MOVL DX, (DI) + MOVL CX, 4(DI) + MOVL R8, 8(DI) + MOVL R9, 12(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL 64(SI), DX + XORL 68(SI), CX + XORL 72(SI), R8 + XORL 76(SI), R9 + MOVL DX, 64(DI) + MOVL CX, 68(DI) + MOVL R8, 72(DI) + MOVL R9, 76(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL 128(SI), DX + XORL 132(SI), CX + XORL 136(SI), R8 + XORL 140(SI), R9 + MOVL DX, 128(DI) + MOVL CX, 132(DI) + MOVL R8, 136(DI) + MOVL R9, 140(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + XORL 192(SI), DX + XORL 196(SI), CX + XORL 200(SI), R8 + XORL 204(SI), R9 + MOVL DX, 192(DI) + MOVL CX, 196(DI) + MOVL R8, 200(DI) + MOVL R9, 204(DI) + PADDL 240(R12), X14 + PADDL 64(R12), X0 + PADDL 128(R12), X5 + PADDL 192(R12), X8 + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 16(SI), DX + XORL 20(SI), CX + XORL 24(SI), R8 + XORL 28(SI), R9 + MOVL DX, 16(DI) + MOVL CX, 20(DI) + MOVL R8, 24(DI) + MOVL R9, 28(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 80(SI), DX + XORL 84(SI), CX + XORL 88(SI), R8 + XORL 92(SI), R9 + MOVL DX, 80(DI) + MOVL CX, 84(DI) + MOVL R8, 88(DI) + MOVL R9, 92(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 144(SI), DX + XORL 148(SI), CX + XORL 152(SI), R8 + XORL 156(SI), R9 + MOVL DX, 144(DI) + MOVL CX, 148(DI) + MOVL R8, 152(DI) + MOVL R9, 156(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + XORL 208(SI), DX + XORL 212(SI), CX + XORL 216(SI), R8 + XORL 220(SI), R9 + MOVL DX, 208(DI) + MOVL CX, 212(DI) + MOVL R8, 216(DI) + MOVL R9, 220(DI) + PADDL 288(R12), X15 + PADDL 304(R12), X11 + PADDL 80(R12), X1 + PADDL 144(R12), X6 + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 32(SI), DX + XORL 36(SI), CX + XORL 40(SI), R8 + XORL 44(SI), R9 + MOVL DX, 32(DI) + MOVL CX, 36(DI) + MOVL R8, 40(DI) + MOVL R9, 44(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 96(SI), DX + XORL 100(SI), CX + XORL 104(SI), R8 + XORL 108(SI), R9 + MOVL DX, 96(DI) + MOVL CX, 100(DI) + MOVL R8, 104(DI) + MOVL R9, 108(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 160(SI), DX + XORL 164(SI), CX + XORL 168(SI), R8 + XORL 172(SI), R9 + MOVL DX, 160(DI) + MOVL CX, 164(DI) + MOVL R8, 168(DI) + MOVL R9, 172(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + XORL 224(SI), DX + XORL 228(SI), CX + XORL 232(SI), R8 + XORL 236(SI), R9 + MOVL DX, 224(DI) + MOVL CX, 228(DI) + MOVL R8, 232(DI) + MOVL R9, 236(DI) + PADDL 160(R12), X13 + PADDL 208(R12), X9 + PADDL 256(R12), X3 + PADDL 96(R12), X2 + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 48(SI), DX + XORL 52(SI), CX + XORL 56(SI), R8 + XORL 60(SI), R9 + MOVL DX, 48(DI) + MOVL CX, 52(DI) + MOVL R8, 56(DI) + MOVL R9, 60(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 112(SI), DX + XORL 116(SI), CX + XORL 120(SI), R8 + XORL 124(SI), R9 + MOVL DX, 112(DI) + MOVL CX, 116(DI) + MOVL R8, 120(DI) + MOVL R9, 124(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 176(SI), DX + XORL 180(SI), CX + XORL 184(SI), R8 + XORL 188(SI), R9 + MOVL DX, 176(DI) + MOVL CX, 180(DI) + MOVL R8, 184(DI) + MOVL R9, 188(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + XORL 240(SI), DX + XORL 244(SI), CX + XORL 248(SI), R8 + XORL 252(SI), R9 + MOVL DX, 240(DI) + MOVL CX, 244(DI) + MOVL R8, 248(DI) + MOVL R9, 252(DI) + MOVQ 352(R12), R9 + SUBQ $0x00000100, R9 + ADDQ $0x00000100, SI + ADDQ $0x00000100, DI + CMPQ R9, $0x00000100 + JAE BYTESATLEAST256 + CMPQ R9, $0x00 + JBE DONE - MOVQ DX,R9 - MOVQ CX,DX - MOVQ R8,R10 - CMPQ R9,$0 - JBE DONE - START: - MOVL 20(R10),CX - MOVL 0(R10),R8 - MOVL 0(DX),AX - MOVL 16(R10),R11 - MOVL CX,0(R12) - MOVL R8, 4 (R12) - MOVL AX, 8 (R12) - MOVL R11, 12 (R12) - MOVL 8(DX),CX - MOVL 24(R10),R8 - MOVL 4(R10),AX - MOVL 4(DX),R11 - MOVL CX,16(R12) - MOVL R8, 20 (R12) - MOVL AX, 24 (R12) - MOVL R11, 28 (R12) - MOVL 12(DX),CX - MOVL 12(R10),DX - MOVL 28(R10),R8 - MOVL 8(R10),AX - MOVL DX,32(R12) - MOVL CX, 36 (R12) - MOVL R8, 40 (R12) - MOVL AX, 44 (R12) - MOVQ $1634760805,DX - MOVQ $857760878,CX - MOVQ $2036477234,R8 - MOVQ $1797285236,AX - MOVL DX,48(R12) - MOVL CX, 52 (R12) - MOVL R8, 56 (R12) - MOVL AX, 60 (R12) - CMPQ R9,$256 - JB BYTESBETWEEN1AND255 - MOVOA 48(R12),X0 - PSHUFL $0X55,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X3 - PSHUFL $0X00,X0,X0 - MOVOA X1,64(R12) - MOVOA X2,80(R12) - MOVOA X3,96(R12) - MOVOA X0,112(R12) - MOVOA 0(R12),X0 - PSHUFL $0XAA,X0,X1 - PSHUFL $0XFF,X0,X2 - PSHUFL $0X00,X0,X3 - PSHUFL $0X55,X0,X0 - MOVOA X1,128(R12) - MOVOA X2,144(R12) - MOVOA X3,160(R12) - MOVOA X0,176(R12) - MOVOA 16(R12),X0 - PSHUFL $0XFF,X0,X1 - PSHUFL $0X55,X0,X2 - PSHUFL $0XAA,X0,X0 - MOVOA X1,192(R12) - MOVOA X2,208(R12) - MOVOA X0,224(R12) - MOVOA 32(R12),X0 - PSHUFL $0X00,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X0 - MOVOA X1,240(R12) - MOVOA X2,256(R12) - MOVOA X0,272(R12) - BYTESATLEAST256: - MOVL 16(R12),DX - MOVL 36 (R12),CX - MOVL DX,288(R12) - MOVL CX,304(R12) - SHLQ $32,CX - ADDQ CX,DX - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 292 (R12) - MOVL CX, 308 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 296 (R12) - MOVL CX, 312 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 300 (R12) - MOVL CX, 316 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX,16(R12) - MOVL CX, 36 (R12) - MOVQ R9,352(R12) - MOVQ $20,DX - MOVOA 64(R12),X0 - MOVOA 80(R12),X1 - MOVOA 96(R12),X2 - MOVOA 256(R12),X3 - MOVOA 272(R12),X4 - MOVOA 128(R12),X5 - MOVOA 144(R12),X6 - MOVOA 176(R12),X7 - MOVOA 192(R12),X8 - MOVOA 208(R12),X9 - MOVOA 224(R12),X10 - MOVOA 304(R12),X11 - MOVOA 112(R12),X12 - MOVOA 160(R12),X13 - MOVOA 240(R12),X14 - MOVOA 288(R12),X15 - MAINLOOP1: - MOVOA X1,320(R12) - MOVOA X2,336(R12) - MOVOA X13,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X14 - PSRLL $25,X2 - PXOR X2,X14 - MOVOA X7,X1 - PADDL X0,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X11 - PSRLL $25,X2 - PXOR X2,X11 - MOVOA X12,X1 - PADDL X14,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X15 - PSRLL $23,X2 - PXOR X2,X15 - MOVOA X0,X1 - PADDL X11,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X9 - PSRLL $23,X2 - PXOR X2,X9 - MOVOA X14,X1 - PADDL X15,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X13 - PSRLL $19,X2 - PXOR X2,X13 - MOVOA X11,X1 - PADDL X9,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X7 - PSRLL $19,X2 - PXOR X2,X7 - MOVOA X15,X1 - PADDL X13,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA 320(R12),X1 - MOVOA X12,320(R12) - MOVOA X9,X2 - PADDL X7,X2 - MOVOA X2,X12 - PSLLL $18,X2 - PXOR X2,X0 - PSRLL $14,X12 - PXOR X12,X0 - MOVOA X5,X2 - PADDL X1,X2 - MOVOA X2,X12 - PSLLL $7,X2 - PXOR X2,X3 - PSRLL $25,X12 - PXOR X12,X3 - MOVOA 336(R12),X2 - MOVOA X0,336(R12) - MOVOA X6,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X4 - PSRLL $25,X12 - PXOR X12,X4 - MOVOA X1,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X10 - PSRLL $23,X12 - PXOR X12,X10 - MOVOA X2,X0 - PADDL X4,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X8 - PSRLL $23,X12 - PXOR X12,X8 - MOVOA X3,X0 - PADDL X10,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X5 - PSRLL $19,X12 - PXOR X12,X5 - MOVOA X4,X0 - PADDL X8,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X6 - PSRLL $19,X12 - PXOR X12,X6 - MOVOA X10,X0 - PADDL X5,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA 320(R12),X0 - MOVOA X1,320(R12) - MOVOA X4,X1 - PADDL X0,X1 - MOVOA X1,X12 - PSLLL $7,X1 - PXOR X1,X7 - PSRLL $25,X12 - PXOR X12,X7 - MOVOA X8,X1 - PADDL X6,X1 - MOVOA X1,X12 - PSLLL $18,X1 - PXOR X1,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 336(R12),X12 - MOVOA X2,336(R12) - MOVOA X14,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X5 - PSRLL $25,X2 - PXOR X2,X5 - MOVOA X0,X1 - PADDL X7,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X10 - PSRLL $23,X2 - PXOR X2,X10 - MOVOA X12,X1 - PADDL X5,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X8 - PSRLL $23,X2 - PXOR X2,X8 - MOVOA X7,X1 - PADDL X10,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X4 - PSRLL $19,X2 - PXOR X2,X4 - MOVOA X5,X1 - PADDL X8,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X14 - PSRLL $19,X2 - PXOR X2,X14 - MOVOA X10,X1 - PADDL X4,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X0 - PSRLL $14,X2 - PXOR X2,X0 - MOVOA 320(R12),X1 - MOVOA X0,320(R12) - MOVOA X8,X0 - PADDL X14,X0 - MOVOA X0,X2 - PSLLL $18,X0 - PXOR X0,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA X11,X0 - PADDL X1,X0 - MOVOA X0,X2 - PSLLL $7,X0 - PXOR X0,X6 - PSRLL $25,X2 - PXOR X2,X6 - MOVOA 336(R12),X2 - MOVOA X12,336(R12) - MOVOA X3,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X13 - PSRLL $25,X12 - PXOR X12,X13 - MOVOA X1,X0 - PADDL X6,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X15 - PSRLL $23,X12 - PXOR X12,X15 - MOVOA X2,X0 - PADDL X13,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X9 - PSRLL $23,X12 - PXOR X12,X9 - MOVOA X6,X0 - PADDL X15,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X11 - PSRLL $19,X12 - PXOR X12,X11 - MOVOA X13,X0 - PADDL X9,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X3 - PSRLL $19,X12 - PXOR X12,X3 - MOVOA X15,X0 - PADDL X11,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA X9,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 320(R12),X12 - MOVOA 336(R12),X0 - SUBQ $2,DX - JA MAINLOOP1 - PADDL 112(R12),X12 - PADDL 176(R12),X7 - PADDL 224(R12),X10 - PADDL 272(R12),X4 - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 0(SI),DX - XORL 4(SI),CX - XORL 8(SI),R8 - XORL 12(SI),R9 - MOVL DX,0(DI) - MOVL CX,4(DI) - MOVL R8,8(DI) - MOVL R9,12(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 64(SI),DX - XORL 68(SI),CX - XORL 72(SI),R8 - XORL 76(SI),R9 - MOVL DX,64(DI) - MOVL CX,68(DI) - MOVL R8,72(DI) - MOVL R9,76(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 128(SI),DX - XORL 132(SI),CX - XORL 136(SI),R8 - XORL 140(SI),R9 - MOVL DX,128(DI) - MOVL CX,132(DI) - MOVL R8,136(DI) - MOVL R9,140(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - XORL 192(SI),DX - XORL 196(SI),CX - XORL 200(SI),R8 - XORL 204(SI),R9 - MOVL DX,192(DI) - MOVL CX,196(DI) - MOVL R8,200(DI) - MOVL R9,204(DI) - PADDL 240(R12),X14 - PADDL 64(R12),X0 - PADDL 128(R12),X5 - PADDL 192(R12),X8 - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 16(SI),DX - XORL 20(SI),CX - XORL 24(SI),R8 - XORL 28(SI),R9 - MOVL DX,16(DI) - MOVL CX,20(DI) - MOVL R8,24(DI) - MOVL R9,28(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 80(SI),DX - XORL 84(SI),CX - XORL 88(SI),R8 - XORL 92(SI),R9 - MOVL DX,80(DI) - MOVL CX,84(DI) - MOVL R8,88(DI) - MOVL R9,92(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 144(SI),DX - XORL 148(SI),CX - XORL 152(SI),R8 - XORL 156(SI),R9 - MOVL DX,144(DI) - MOVL CX,148(DI) - MOVL R8,152(DI) - MOVL R9,156(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - XORL 208(SI),DX - XORL 212(SI),CX - XORL 216(SI),R8 - XORL 220(SI),R9 - MOVL DX,208(DI) - MOVL CX,212(DI) - MOVL R8,216(DI) - MOVL R9,220(DI) - PADDL 288(R12),X15 - PADDL 304(R12),X11 - PADDL 80(R12),X1 - PADDL 144(R12),X6 - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 32(SI),DX - XORL 36(SI),CX - XORL 40(SI),R8 - XORL 44(SI),R9 - MOVL DX,32(DI) - MOVL CX,36(DI) - MOVL R8,40(DI) - MOVL R9,44(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 96(SI),DX - XORL 100(SI),CX - XORL 104(SI),R8 - XORL 108(SI),R9 - MOVL DX,96(DI) - MOVL CX,100(DI) - MOVL R8,104(DI) - MOVL R9,108(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 160(SI),DX - XORL 164(SI),CX - XORL 168(SI),R8 - XORL 172(SI),R9 - MOVL DX,160(DI) - MOVL CX,164(DI) - MOVL R8,168(DI) - MOVL R9,172(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - XORL 224(SI),DX - XORL 228(SI),CX - XORL 232(SI),R8 - XORL 236(SI),R9 - MOVL DX,224(DI) - MOVL CX,228(DI) - MOVL R8,232(DI) - MOVL R9,236(DI) - PADDL 160(R12),X13 - PADDL 208(R12),X9 - PADDL 256(R12),X3 - PADDL 96(R12),X2 - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 48(SI),DX - XORL 52(SI),CX - XORL 56(SI),R8 - XORL 60(SI),R9 - MOVL DX,48(DI) - MOVL CX,52(DI) - MOVL R8,56(DI) - MOVL R9,60(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 112(SI),DX - XORL 116(SI),CX - XORL 120(SI),R8 - XORL 124(SI),R9 - MOVL DX,112(DI) - MOVL CX,116(DI) - MOVL R8,120(DI) - MOVL R9,124(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 176(SI),DX - XORL 180(SI),CX - XORL 184(SI),R8 - XORL 188(SI),R9 - MOVL DX,176(DI) - MOVL CX,180(DI) - MOVL R8,184(DI) - MOVL R9,188(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - XORL 240(SI),DX - XORL 244(SI),CX - XORL 248(SI),R8 - XORL 252(SI),R9 - MOVL DX,240(DI) - MOVL CX,244(DI) - MOVL R8,248(DI) - MOVL R9,252(DI) - MOVQ 352(R12),R9 - SUBQ $256,R9 - ADDQ $256,SI - ADDQ $256,DI - CMPQ R9,$256 - JAE BYTESATLEAST256 - CMPQ R9,$0 - JBE DONE - BYTESBETWEEN1AND255: - CMPQ R9,$64 - JAE NOCOPY - MOVQ DI,DX - LEAQ 360(R12),DI - MOVQ R9,CX +BYTESBETWEEN1AND255: + CMPQ R9, $0x40 + JAE NOCOPY + MOVQ DI, DX + LEAQ 360(R12), DI + MOVQ R9, CX REP; MOVSB - LEAQ 360(R12),DI - LEAQ 360(R12),SI - NOCOPY: - MOVQ R9,352(R12) - MOVOA 48(R12),X0 - MOVOA 0(R12),X1 - MOVOA 16(R12),X2 - MOVOA 32(R12),X3 - MOVOA X1,X4 - MOVQ $20,CX - MAINLOOP2: - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - SUBQ $4,CX - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PXOR X7,X7 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - JA MAINLOOP2 - PADDL 48(R12),X0 - PADDL 0(R12),X1 - PADDL 16(R12),X2 - PADDL 32(R12),X3 - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 0(SI),CX - XORL 48(SI),R8 - XORL 32(SI),R9 - XORL 16(SI),AX - MOVL CX,0(DI) - MOVL R8,48(DI) - MOVL R9,32(DI) - MOVL AX,16(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 20(SI),CX - XORL 4(SI),R8 - XORL 52(SI),R9 - XORL 36(SI),AX - MOVL CX,20(DI) - MOVL R8,4(DI) - MOVL R9,52(DI) - MOVL AX,36(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 40(SI),CX - XORL 24(SI),R8 - XORL 8(SI),R9 - XORL 56(SI),AX - MOVL CX,40(DI) - MOVL R8,24(DI) - MOVL R9,8(DI) - MOVL AX,56(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - XORL 60(SI),CX - XORL 44(SI),R8 - XORL 28(SI),R9 - XORL 12(SI),AX - MOVL CX,60(DI) - MOVL R8,44(DI) - MOVL R9,28(DI) - MOVL AX,12(DI) - MOVQ 352(R12),R9 - MOVL 16(R12),CX - MOVL 36 (R12),R8 - ADDQ $1,CX - SHLQ $32,R8 - ADDQ R8,CX - MOVQ CX,R8 - SHRQ $32,R8 - MOVL CX,16(R12) - MOVL R8, 36 (R12) - CMPQ R9,$64 - JA BYTESATLEAST65 - JAE BYTESATLEAST64 - MOVQ DI,SI - MOVQ DX,DI - MOVQ R9,CX + LEAQ 360(R12), DI + LEAQ 360(R12), SI + +NOCOPY: + MOVQ R9, 352(R12) + MOVOA 48(R12), X0 + MOVOA (R12), X1 + MOVOA 16(R12), X2 + MOVOA 32(R12), X3 + MOVOA X1, X4 + MOVQ $0x00000014, CX + +MAINLOOP2: + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + SUBQ $0x04, CX + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PXOR X7, X7 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + JA MAINLOOP2 + PADDL 48(R12), X0 + PADDL (R12), X1 + PADDL 16(R12), X2 + PADDL 32(R12), X3 + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL (SI), CX + XORL 48(SI), R8 + XORL 32(SI), R9 + XORL 16(SI), AX + MOVL CX, (DI) + MOVL R8, 48(DI) + MOVL R9, 32(DI) + MOVL AX, 16(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 20(SI), CX + XORL 4(SI), R8 + XORL 52(SI), R9 + XORL 36(SI), AX + MOVL CX, 20(DI) + MOVL R8, 4(DI) + MOVL R9, 52(DI) + MOVL AX, 36(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 40(SI), CX + XORL 24(SI), R8 + XORL 8(SI), R9 + XORL 56(SI), AX + MOVL CX, 40(DI) + MOVL R8, 24(DI) + MOVL R9, 8(DI) + MOVL AX, 56(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + XORL 60(SI), CX + XORL 44(SI), R8 + XORL 28(SI), R9 + XORL 12(SI), AX + MOVL CX, 60(DI) + MOVL R8, 44(DI) + MOVL R9, 28(DI) + MOVL AX, 12(DI) + MOVQ 352(R12), R9 + MOVL 16(R12), CX + MOVL 36(R12), R8 + ADDQ $0x01, CX + SHLQ $0x20, R8 + ADDQ R8, CX + MOVQ CX, R8 + SHRQ $0x20, R8 + MOVL CX, 16(R12) + MOVL R8, 36(R12) + CMPQ R9, $0x40 + JA BYTESATLEAST65 + JAE BYTESATLEAST64 + MOVQ DI, SI + MOVQ DX, DI + MOVQ R9, CX REP; MOVSB - BYTESATLEAST64: - DONE: + +BYTESATLEAST64: +DONE: RET - BYTESATLEAST65: - SUBQ $64,R9 - ADDQ $64,DI - ADDQ $64,SI - JMP BYTESBETWEEN1AND255 + +BYTESATLEAST65: + SUBQ $0x40, R9 + ADDQ $0x40, DI + ADDQ $0x40, SI + JMP BYTESBETWEEN1AND255 From 7eace71069e621a910a5158a1b46314d38f724ae Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Mon, 29 Jul 2024 17:49:34 -0400 Subject: [PATCH 75/95] chacha20poly1305: Avo port of chacha20poly1305_amd64.s This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Parameter metadata not found in the reference assembly file has been added, leading to a diff on the lines where those symbols are referenced. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="chacha20poly1305/chacha20poly1305_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) 155,157c155,157 < MOVQ dst(FP), DI < MOVQ key+24(FP), R8 < MOVQ src+48(FP), SI --- > MOVQ dst_base(FP), DI > MOVQ key_base+24(FP), R8 > MOVQ src_base+48(FP), SI 159c159 < MOVQ ad+72(FP), CX --- > MOVQ ad_base+72(FP), CX 4684,4686c4684,4686 < MOVQ dst(FP), DI < MOVQ key+24(FP), R8 < MOVQ src+48(FP), SI --- > MOVQ dst_base(FP), DI > MOVQ key_base+24(FP), R8 > MOVQ src_base+48(FP), SI 4688c4688 < MOVQ ad+72(FP), CX --- > MOVQ ad_base+72(FP), CX Change-Id: Ia3a8e70b7440944ee739499c41ddceb70e054ef9 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/601442 Reviewed-by: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov --- .../_asm/chacha20poly1305_amd64_asm.go | 5516 ++++++++ chacha20poly1305/_asm/go.mod | 15 + chacha20poly1305/_asm/go.sum | 12 + chacha20poly1305/chacha20poly1305_amd64.s | 11503 +++++++++++++--- 4 files changed, 14818 insertions(+), 2228 deletions(-) create mode 100644 chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go create mode 100644 chacha20poly1305/_asm/go.mod create mode 100644 chacha20poly1305/_asm/go.sum diff --git a/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go b/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go new file mode 100644 index 0000000000..e9ba153b4c --- /dev/null +++ b/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go @@ -0,0 +1,5516 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This assembly implementation was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. + +package main + +import ( + "fmt" + "os" + "strings" + + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/chacha20poly1305" +) + +//go:generate go run . -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305 + +var ( + // General register allocation + oup GPPhysical = RDI + inp = RSI + inl = RBX + adp = RCX // free to reuse, after we hash the additional data + keyp = R8 // free to reuse, when we copy the key to stack + itr2 = R9 // general iterator + itr1 = RCX // general iterator + acc0 = R10 + acc1 = R11 + acc2 = R12 + t0 = R13 + t1 = R14 + t2 = R15 + t3 = R8 + + // Register and stack allocation for the SSE code + rStore Mem = Mem{Base: BP}.Offset(0 * 16) + sStore = Mem{Base: BP}.Offset(1 * 16) + state1Store = Mem{Base: BP}.Offset(2 * 16) + state2Store = Mem{Base: BP}.Offset(3 * 16) + tmpStore = Mem{Base: BP}.Offset(4 * 16) + ctr0Store = Mem{Base: BP}.Offset(5 * 16) + ctr1Store = Mem{Base: BP}.Offset(6 * 16) + ctr2Store = Mem{Base: BP}.Offset(7 * 16) + ctr3Store = Mem{Base: BP}.Offset(8 * 16) + A0 VecPhysical = X0 + A1 = X1 + A2 = X2 + B0 = X3 + B1 = X4 + B2 = X5 + C0 = X6 + C1 = X7 + C2 = X8 + D0 = X9 + D1 = X10 + D2 = X11 + T0 = X12 + T1 = X13 + T2 = X14 + T3 = X15 + A3 = T0 + B3 = T1 + C3 = T2 + D3 = T3 + + // Register and stack allocation for the AVX2 code + rsStoreAVX2 Mem = Mem{Base: BP}.Offset(0 * 32) + state1StoreAVX2 = Mem{Base: BP}.Offset(1 * 32) + state2StoreAVX2 = Mem{Base: BP}.Offset(2 * 32) + ctr0StoreAVX2 = Mem{Base: BP}.Offset(3 * 32) + ctr1StoreAVX2 = Mem{Base: BP}.Offset(4 * 32) + ctr2StoreAVX2 = Mem{Base: BP}.Offset(5 * 32) + ctr3StoreAVX2 = Mem{Base: BP}.Offset(6 * 32) + tmpStoreAVX2 = Mem{Base: BP}.Offset(7 * 32) // 256 bytes on stack + AA0 VecPhysical = Y0 + AA1 = Y5 + AA2 = Y6 + AA3 = Y7 + BB0 = Y14 + BB1 = Y9 + BB2 = Y10 + BB3 = Y11 + CC0 = Y12 + CC1 = Y13 + CC2 = Y8 + CC3 = Y15 + DD0 = Y4 + DD1 = Y1 + DD2 = Y2 + DD3 = Y3 + TT0 = DD3 + TT1 = AA3 + TT2 = BB3 + TT3 = CC3 +) + +const ThatPeskyUnicodeDot = "\u00b7" + +func main() { + Package("golang.org/x/crypto/chacha20poly1305") + ConstraintExpr("gc,!purego") + polyHashADInternal() + chacha20Poly1305Open() + chacha20Poly1305Seal() + Generate() + + var internalFunctions []string = []string{"·polyHashADInternal"} + removePeskyUnicodeDot(internalFunctions, "../chacha20poly1305_amd64.s") +} + +// Utility function to emit BYTE instruction +func BYTE(u8 U8) { + Instruction(&ir.Instruction{Opcode: "BYTE", Operands: []Op{u8}}) +} + +// PALIGNR $4, X3, X3 +func shiftB0Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X4, X4 +func shiftB1Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xe4)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X5, X5 +func shiftB2Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X13, X13 +func shiftB3Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x04)) +} + +// PALIGNR $8, X6, X6 +func shiftC0Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xf6)) + BYTE(U8(0x08)) +} + +// PALIGNR $8, X7, X7 +func shiftC1Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xff)) + BYTE(U8(0x08)) +} + +// PALIGNR $8, X8, X8 +func shiftC2Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xc0)) + BYTE(U8(0x08)) +} + +// PALIGNR $8, X14, X14 +func shiftC3Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xf6)) + BYTE(U8(0x08)) +} + +// PALIGNR $12, X9, X9 +func shiftD0Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xc9)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X10, X10 +func shiftD1Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xd2)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X11, X11 +func shiftD2Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X15, X15 +func shiftD3Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xff)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X3, X3 +func shiftB0Right() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X4, X4 +func shiftB1Right() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xe4)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X5, X5 +func shiftB2Right() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X13, X13 +func shiftB3Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x0c)) +} + +func shiftC0Right() { + shiftC0Left() +} + +func shiftC1Right() { + shiftC1Left() +} + +func shiftC2Right() { + shiftC2Left() +} + +func shiftC3Right() { + shiftC3Left() +} + +// PALIGNR $4, X9, X9 +func shiftD0Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xc9)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X10, X10 +func shiftD1Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xd2)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X11, X11 +func shiftD2Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X15, X15 +func shiftD3Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xff)) + BYTE(U8(0x04)) +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~SOME MACROS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +// Hack: ROL must be a #define macro as it is referenced by other macros +func defineROL() { + definition := + `#define ROL(N, R, T) \ + MOVO R, T; \ + PSLLL $(N), T; \ + PSRLL $(32-(N)), R; \ + PXOR T, R` + Comment("ROL rotates the uint32s in register R left by N bits, using temporary T.") + Instruction(&ir.Instruction{Opcode: definition}) +} + +// ROL rotates the uint32s in register R left by N bits, using temporary T. +func ROL(N uint64, R, T VecPhysical) { + // Hack: ROL must be a #define macro as it is referenced by other macros + Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL(%s, %s, %s)", I8(N).Asm(), R.Asm(), T.Asm())}) +} + +// Hack to get Avo to generate an #ifdef +// +// ROL16(R, T) definition depends on a compiler flag that specifies amd64 architectural level. +func defineROL16() { + definition := + `#ifdef GOAMD64_v2 + #define ROL16(R, T) PSHUFB ·rol16<>(SB), R + #else + #define ROL16(R, T) ROL(16, R, T) + #endif` + + Comment("ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.") + Instruction(&ir.Instruction{Opcode: definition}) +} + +// Hack to emit macro call +// +// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. +func ROL16(R, T VecPhysical) { + Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL16(%s, %s)", R.Asm(), T.Asm())}) +} + +// Hack to get Avo to generate an #ifdef +// +// ROL8(R, T) definition depends on a compiler flag that specifies amd64 architectural level. +func defineROL8() { + definition := + `#ifdef GOAMD64_v2 + #define ROL8(R, T) PSHUFB ·rol8<>(SB), R + #else + #define ROL8(R, T) ROL(8, R, T) + #endif` + + Comment("ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.") + Instruction(&ir.Instruction{Opcode: definition}) +} + +// Hack to emit macro call +// +// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. +func ROL8(R, T VecPhysical) { + Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL8(%s, %s)", R.Asm(), T.Asm())}) +} + +func chachaQR(A, B, C, D, T VecPhysical) { + PADDD(B, A) + PXOR(A, D) + ROL16(D, T) + PADDD(D, C) + PXOR(C, B) + MOVO(B, T) + PSLLL(Imm(12), T) + PSRLL(Imm(20), B) + PXOR(T, B) + PADDD(B, A) + PXOR(A, D) + ROL8(D, T) + PADDD(D, C) + PXOR(C, B) + MOVO(B, T) + PSLLL(Imm(7), T) + PSRLL(Imm(25), B) + PXOR(T, B) +} + +func chachaQR_AVX2(A, B, C, D, T VecPhysical) { + VPADDD(B, A, A) + VPXOR(A, D, D) + rol16 := rol16_DATA() + VPSHUFB(rol16, D, D) + VPADDD(D, C, C) + VPXOR(C, B, B) + VPSLLD(Imm(12), B, T) + VPSRLD(Imm(20), B, B) + VPXOR(T, B, B) + VPADDD(B, A, A) + VPXOR(A, D, D) + rol8 := rol8_DATA() + VPSHUFB(rol8, D, D) + VPADDD(D, C, C) + VPXOR(C, B, B) + VPSLLD(Imm(7), B, T) + VPSRLD(Imm(25), B, B) + VPXOR(T, B, B) +} + +func polyAdd(S Mem) { + ADDQ(S, acc0) + ADCQ(S.Offset(8), acc1) + ADCQ(Imm(1), acc2) +} + +func polyMulStage1() { + MOVQ(Mem{Base: BP}.Offset(0*8), RAX) + MOVQ(RAX, t2) + MULQ(acc0) + MOVQ(RAX, t0) + MOVQ(RDX, t1) + MOVQ(Mem{Base: BP}.Offset(0*8), RAX) + MULQ(acc1) + IMULQ(acc2, t2) + ADDQ(RAX, t1) + ADCQ(RDX, t2) +} + +func polyMulStage2() { + MOVQ(Mem{Base: BP}.Offset(1*8), RAX) + MOVQ(RAX, t3) + MULQ(acc0) + ADDQ(RAX, t1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc0) + MOVQ(Mem{Base: BP}.Offset(1*8), RAX) + MULQ(acc1) + ADDQ(RAX, t2) + ADCQ(Imm(0), RDX) +} + +func polyMulStage3() { + IMULQ(acc2, t3) + ADDQ(acc0, t2) + ADCQ(RDX, t3) +} + +func polyMulReduceStage() { + MOVQ(t0, acc0) + MOVQ(t1, acc1) + MOVQ(t2, acc2) + ANDQ(Imm(3), acc2) + MOVQ(t2, t0) + ANDQ(I8(-4), t0) + MOVQ(t3, t1) + SHRQ(Imm(2), t3, t2) + SHRQ(Imm(2), t3) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(0), acc2) + ADDQ(t2, acc0) + ADCQ(t3, acc1) + ADCQ(Imm(0), acc2) +} + +func polyMulStage1_AVX2() { + MOVQ(Mem{Base: BP}.Offset(0*8), RDX) + MOVQ(RDX, t2) + MULXQ(acc0, t0, t1) + IMULQ(acc2, t2) + MULXQ(acc1, RAX, RDX) + ADDQ(RAX, t1) + ADCQ(RDX, t2) +} + +func polyMulStage2_AVX2() { + MOVQ(Mem{Base: BP}.Offset(1*8), RDX) + MULXQ(acc0, acc0, RAX) + ADDQ(acc0, t1) + MULXQ(acc1, acc1, t3) + ADCQ(acc1, t2) + ADCQ(Imm(0), t3) +} + +func polyMulStage3_AVX2() { + IMULQ(acc2, RDX) + ADDQ(RAX, t2) + ADCQ(RDX, t3) +} + +func polyMul() { + polyMulStage1() + polyMulStage2() + polyMulStage3() + polyMulReduceStage() +} + +func polyMulAVX2() { + polyMulStage1_AVX2() + polyMulStage2_AVX2() + polyMulStage3_AVX2() + polyMulReduceStage() +} + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- + +func polyHashADInternal() { + Function("polyHashADInternal<>") + Attributes(NOSPLIT) + AllocLocal(0) + + Comment("Hack: Must declare #define macros inside of a function due to Avo constraints") + defineROL() + defineROL8() + defineROL16() + + // adp points to beginning of additional data + // itr2 holds ad length + XORQ(acc0, acc0) + XORQ(acc1, acc1) + XORQ(acc2, acc2) + CMPQ(itr2, Imm(13)) + JNE(LabelRef("hashADLoop")) + + openFastTLSAD() + hashADLoop() + hashADTail() + hashADTailLoop() + hashADTailFinish() + hashADDone() +} + +// Special treatment for the TLS case of 13 bytes +func openFastTLSAD() { + Label("openFastTLSAD") + MOVQ(Mem{Base: adp}, acc0) + MOVQ(Mem{Base: adp}.Offset(5), acc1) + SHRQ(Imm(24), acc1) + MOVQ(U32(1), acc2) + polyMul() + RET() +} + +// Hash in 16 byte chunks +func hashADLoop() { + Label("hashADLoop") + Comment("Hash in 16 byte chunks") + CMPQ(itr2, Imm(16)) + JB(LabelRef("hashADTail")) + polyAdd(Mem{Base: adp}.Offset(0)) + LEAQ(Mem{Base: adp}.Offset(1*16), adp) + SUBQ(Imm(16), itr2) + polyMul() + JMP(LabelRef("hashADLoop")) +} + +func hashADTail() { + Label("hashADTail") + CMPQ(itr2, Imm(0)) + JE(LabelRef("hashADDone")) + + Comment("Hash last < 16 byte tail") + XORQ(t0, t0) + XORQ(t1, t1) + XORQ(t2, t2) + ADDQ(itr2, adp) +} + +func hashADTailLoop() { + Label("hashADTailLoop") + SHLQ(Imm(8), t0, t1) + SHLQ(Imm(8), t0) + // Hack to get Avo to emit: + // MOVB -1(adp), t2 + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{Mem{Base: adp}.Offset(-1), t2}}) + XORQ(t2, t0) + DECQ(adp) + DECQ(itr2) + JNE(LabelRef("hashADTailLoop")) +} + +func hashADTailFinish() { + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() +} + +// Finished AD +func hashADDone() { + Label("hashADDone") + RET() +} + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- + +// Implements the following function fignature: +// +// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool +func chacha20Poly1305Open() { + Implement("chacha20Poly1305Open") + Attributes(0) + AllocLocal(288) + + Comment("For aligned stack access") + MOVQ(RSP, RBP) + ADDQ(Imm(32), RBP) + ANDQ(I8(-32), RBP) + + Load(Param("dst").Base(), oup) + Load(Param("key").Base(), keyp) + Load(Param("src").Base(), inp) + Load(Param("src").Len(), inl) + Load(Param("ad").Base(), adp) + + Comment("Check for AVX2 support") + CMPB(Mem{Symbol: Symbol{Name: ThatPeskyUnicodeDot + "useAVX2"}, Base: StaticBase}, Imm(1)) + JE(LabelRef("chacha20Poly1305Open_AVX2")) + + Comment("Special optimization, for very short buffers") + CMPQ(inl, Imm(128)) + JBE(LabelRef("openSSE128")) // About 16% faster + + Comment("For long buffers, prepare the poly key first") + chacha20Constants := chacha20Constants_DATA() + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + MOVO(D0, T1) + + Comment("Store state on stack for future use") + MOVO(B0, state1Store) + MOVO(C0, state2Store) + MOVO(D0, ctr3Store) + MOVQ(U32(10), itr2) + + openSSEPreparePolyKey() + openSSEMainLoop() + openSSEInternalLoop() + openSSEMainLoopDone() + openSSEFinalize() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 129 bytes + openSSE128() + openSSE128InnerCipherLoop() + openSSE128Open() + openSSETail16() + openSSETail16Store() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 64 bytes of ciphertext + openSSETail64() + openSSETail64LoopA() + openSSETail64LoopB() + openSSETail64DecLoop() + openSSETail64DecLoopDone() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of ciphertext + openSSETail128() + openSSETail128LoopA() + openSSETail128LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 192 bytes of ciphertext + openSSETail192() + openSSLTail192LoopA() + openSSLTail192LoopB() + openSSLTail192Store() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 256 bytes of ciphertext + openSSETail256() + openSSETail256Loop() + openSSETail256HashLoop() + + // ---------------------------------------------------------------------------- + // ------------------------- AVX2 Code ---------------------------------------- + chacha20Poly1305Open_AVX2() + openAVX2PreparePolyKey() + openAVX2InitialHash64() + openAVX2MainLoop() + openAVX2InternalLoop() + openAVX2MainLoopDone() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 193 bytes + openAVX2192() + openAVX2192InnerCipherLoop() + openAVX2ShortOpen() + openAVX2ShortOpenLoop() + openAVX2ShortTail32() + openAVX2ShortDone() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 321 bytes + openAVX2320() + openAVX2320InnerCipherLoop() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of ciphertext + openAVX2Tail128() + openAVX2Tail128LoopA() + openAVX2Tail128LoopB() + openAVX2TailLoop() + openAVX2Tail() + openAVX2TailDone() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 256 bytes of ciphertext + openAVX2Tail256() + openAVX2Tail256LoopA() + openAVX2Tail256LoopB() + openAVX2Tail256Hash() + openAVX2Tail256HashEnd() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 384 bytes of ciphertext + openAVX2Tail384() + openAVX2Tail384LoopB() + openAVX2Tail384LoopA() + openAVX2Tail384Hash() + openAVX2Tail384HashEnd() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 512 bytes of ciphertext + openAVX2Tail512() + openAVX2Tail512LoopB() + openAVX2Tail512LoopA() + openAVX2Tail512HashLoop() + openAVX2Tail512HashEnd() +} + +func openSSEPreparePolyKey() { + Label("openSSEPreparePolyKey") + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + DECQ(itr2) + JNE(LabelRef("openSSEPreparePolyKey")) + + Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded") + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(state1Store, B0) + + Comment("Clamp and store the key") + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVO(A0, rStore) + MOVO(B0, sStore) + + Comment("Hash AAD") + Load(Param("ad").Len(), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) +} + +func openSSEMainLoop() { + Label("openSSEMainLoop") + CMPQ(inl, U32(256)) + JB(LabelRef("openSSEMainLoopDone")) + + chacha20Constants := chacha20Constants_DATA() + sseIncMask := sseIncMask_DATA() + + Comment("Load state, increment counter blocks") + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + PADDL(sseIncMask, D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + + Comment("There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash") + Comment("2 blocks, and for the remaining 4 only 1 block - for a total of 16") + MOVQ(U32(4), itr1) + MOVQ(inp, itr2) +} + +func openSSEInternalLoop() { + Label("openSSEInternalLoop") + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyAdd(Mem{Base: itr2}.Offset(0)) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + polyMulStage1() + polyMulStage2() + LEAQ(Mem{Base: itr2}.Offset(2*8), itr2) + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + polyMulStage3() + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyMulReduceStage() + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + DECQ(itr1) + JGE(LabelRef("openSSEInternalLoop")) + + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMul() + LEAQ(Mem{Base: itr2}.Offset(2*8), itr2) + + CMPQ(itr1, I8(-6)) + JG(LabelRef("openSSEInternalLoop")) + + chacha20Constants := chacha20Constants_DATA() + Comment("Add in the state") + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C0) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr0Store, D0) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + + Comment("Load - xor - store") + MOVO(D3, tmpStore) + MOVOU(Mem{Base: inp}.Offset(0*16), D3) + PXOR(D3, A0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(Mem{Base: inp}.Offset(1*16), D3) + PXOR(D3, B0) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(Mem{Base: inp}.Offset(2*16), D3) + PXOR(D3, C0) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(Mem{Base: inp}.Offset(3*16), D3) + PXOR(D3, D0) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), D0) + PXOR(D0, A1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(Mem{Base: inp}.Offset(5*16), D0) + PXOR(D0, B1) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(Mem{Base: inp}.Offset(6*16), D0) + PXOR(D0, C1) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(D0, D1) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + MOVOU(Mem{Base: inp}.Offset(8*16), D0) + PXOR(D0, A2) + MOVOU(A2, Mem{Base: oup}.Offset(8*16)) + MOVOU(Mem{Base: inp}.Offset(9*16), D0) + PXOR(D0, B2) + MOVOU(B2, Mem{Base: oup}.Offset(9*16)) + MOVOU(Mem{Base: inp}.Offset(10*16), D0) + PXOR(D0, C2) + MOVOU(C2, Mem{Base: oup}.Offset(10*16)) + MOVOU(Mem{Base: inp}.Offset(11*16), D0) + PXOR(D0, D2) + MOVOU(D2, Mem{Base: oup}.Offset(11*16)) + MOVOU(Mem{Base: inp}.Offset(12*16), D0) + PXOR(D0, A3) + MOVOU(A3, Mem{Base: oup}.Offset(12*16)) + MOVOU(Mem{Base: inp}.Offset(13*16), D0) + PXOR(D0, B3) + MOVOU(B3, Mem{Base: oup}.Offset(13*16)) + MOVOU(Mem{Base: inp}.Offset(14*16), D0) + PXOR(D0, C3) + MOVOU(C3, Mem{Base: oup}.Offset(14*16)) + MOVOU(Mem{Base: inp}.Offset(15*16), D0) + PXOR(tmpStore, D0) + MOVOU(D0, Mem{Base: oup}.Offset(15*16)) + LEAQ(Mem{Base: inp}.Offset(256), inp) + LEAQ(Mem{Base: oup}.Offset(256), oup) + SUBQ(U32(256), inl) + JMP(LabelRef("openSSEMainLoop")) +} + +func openSSEMainLoopDone() { + Label("openSSEMainLoopDone") + Comment("Handle the various tail sizes efficiently") + TESTQ(inl, inl) + JE(LabelRef("openSSEFinalize")) + CMPQ(inl, Imm(64)) + JBE(LabelRef("openSSETail64")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("openSSETail128")) + CMPQ(inl, Imm(192)) + JBE(LabelRef("openSSETail192")) + JMP(LabelRef("openSSETail256")) +} + +func openSSEFinalize() { + Label("openSSEFinalize") + Comment("Hash in the PT, AAD lengths") + ADDQ(NewParamAddr("ad_len", 80), acc0) + ADCQ(NewParamAddr("src_len", 56), acc1) + ADCQ(Imm(1), acc2) + polyMul() + + Comment("Final reduce") + MOVQ(acc0, t0) + MOVQ(acc1, t1) + MOVQ(acc2, t2) + SUBQ(I8(-5), acc0) + SBBQ(I8(-1), acc1) + SBBQ(Imm(3), acc2) + CMOVQCS(t0, acc0) + CMOVQCS(t1, acc1) + CMOVQCS(t2, acc2) + + Comment("Add in the \"s\" part of the key") + ADDQ(sStore.Offset(0), acc0) + ADCQ(sStore.Offset(8), acc1) + + Comment("Finally, constant time compare to the tag at the end of the message") + XORQ(RAX, RAX) + MOVQ(U32(1), RDX) + XORQ(Mem{Base: inp}.Offset(0*8), acc0) + XORQ(Mem{Base: inp}.Offset(1*8), acc1) + ORQ(acc1, acc0) + CMOVQEQ(RDX, RAX) + + Comment("Return true iff tags are equal") + // Hack to get Avo to emit: + // MOVB AX, ret+96(FP) + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{AX, NewParamAddr("ret", 96)}}) + RET() +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 129 bytes + +// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks +func openSSE128() { + Label("openSSE128") + + chacha20Constants := chacha20Constants_DATA() + sseIncMask := sseIncMask_DATA() + + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(B0, T1) + MOVO(C0, T2) + MOVO(D1, T3) + MOVQ(U32(10), itr2) +} + +func openSSE128InnerCipherLoop() { + Label("openSSE128InnerCipherLoop") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + DECQ(itr2) + JNE(LabelRef("openSSE128InnerCipherLoop")) + + Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded") + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(T1, B0) + PADDL(T1, B1) + PADDL(T1, B2) + PADDL(T2, C1) + PADDL(T2, C2) + PADDL(T3, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, T3) + PADDL(T3, D2) + + Comment("Clamp and store the key") + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVOU(A0, rStore) + MOVOU(B0, sStore) + + Comment("Hash") + Load(Param("ad").Len(), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) +} + +func openSSE128Open() { + Label("openSSE128Open") + CMPQ(inl, Imm(16)) + JB(LabelRef("openSSETail16")) + SUBQ(Imm(16), inl) + + Comment("Load for hashing") + polyAdd(Mem{Base: inp}.Offset(0)) + + Comment("Load for decryption") + MOVOU(Mem{Base: inp}, T0) + PXOR(T0, A1) + MOVOU(A1, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + polyMul() + + Comment("Shift the stream \"left\"") + MOVO(B1, A1) + MOVO(C1, B1) + MOVO(D1, C1) + MOVO(A2, D1) + MOVO(B2, A2) + MOVO(C2, B2) + MOVO(D2, C2) + JMP(LabelRef("openSSE128Open")) +} + +func openSSETail16() { + Label("openSSETail16") + TESTQ(inl, inl) + JE(LabelRef("openSSEFinalize")) + + Comment("We can safely load the CT from the end, because it is padded with the MAC") + MOVQ(inl, itr2) + SHLQ(Imm(4), itr2) + andMask := andMask_DATA() + LEAQ(andMask, t0) + MOVOU(Mem{Base: inp}, T0) + ADDQ(inl, inp) + PAND(Mem{Base: t0, Index: itr2, Scale: 1}.Offset(-16), T0) + MOVO(T0, tmpStore.Offset(0)) + MOVQ(T0, t0) + MOVQ(tmpStore.Offset(8), t1) + PXOR(A1, T0) +} + +func openSSETail16Store() { + Comment("We can only store one byte at a time, since plaintext can be shorter than 16 bytes") + Label("openSSETail16Store") + MOVQ(T0, t3) + // Hack to get Avo to emit: + // MOVB t3, (oup) + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{t3, Mem{Base: oup}}}) + PSRLDQ(Imm(1), T0) + INCQ(oup) + DECQ(inl) + JNE(LabelRef("openSSETail16Store")) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() + JMP(LabelRef("openSSEFinalize")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of ciphertext + +// Need to decrypt up to 64 bytes - prepare single block +func openSSETail64() { + Label("openSSETail64") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(D0, ctr0Store) + XORQ(itr2, itr2) + MOVQ(inl, itr1) + CMPQ(itr1, Imm(16)) + JB(LabelRef("openSSETail64LoopB")) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openSSETail64LoopA() { + Label("openSSETail64LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() + SUBQ(Imm(16), itr1) +} + +func openSSETail64LoopB() { + Label("openSSETail64LoopB") + ADDQ(Imm(16), itr2) + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + + CMPQ(itr1, Imm(16)) + JAE(LabelRef("openSSETail64LoopA")) + + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openSSETail64LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(state1Store, B0) + PADDL(state2Store, C0) + PADDL(ctr0Store, D0) +} + +func openSSETail64DecLoop() { + Label("openSSETail64DecLoop") + CMPQ(inl, Imm(16)) + JB(LabelRef("openSSETail64DecLoopDone")) + SUBQ(Imm(16), inl) + MOVOU(Mem{Base: inp}, T0) + PXOR(T0, A0) + MOVOU(A0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(16), inp) + LEAQ(Mem{Base: oup}.Offset(16), oup) + MOVO(B0, A0) + MOVO(C0, B0) + MOVO(D0, C0) + JMP(LabelRef("openSSETail64DecLoop")) +} + +func openSSETail64DecLoopDone() { + Label("openSSETail64DecLoopDone") + MOVO(A0, A1) + JMP(LabelRef("openSSETail16")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext + +// Need to decrypt up to 128 bytes - prepare two blocks +func openSSETail128() { + Label("openSSETail128") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A1) + MOVO(state1Store, B1) + MOVO(state2Store, C1) + MOVO(ctr3Store, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(D1, ctr0Store) + MOVO(A1, A0) + MOVO(B1, B0) + MOVO(C1, C0) + MOVO(D1, D0) + PADDL(sseIncMask, D0) + MOVO(D0, ctr1Store) + XORQ(itr2, itr2) + MOVQ(inl, itr1) + ANDQ(I8(-16), itr1) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openSSETail128LoopA() { + Label("openSSETail128LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() +} + +func openSSETail128LoopB() { + Label("openSSETail128LoopB") + ADDQ(Imm(16), itr2) + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + + CMPQ(itr2, itr1) + JB(LabelRef("openSSETail128LoopA")) + + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openSSETail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(ctr1Store, D0) + PADDL(ctr0Store, D1) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A1) + PXOR(T1, B1) + PXOR(T2, C1) + PXOR(T3, D1) + MOVOU(A1, Mem{Base: oup}.Offset(0*16)) + MOVOU(B1, Mem{Base: oup}.Offset(1*16)) + MOVOU(C1, Mem{Base: oup}.Offset(2*16)) + MOVOU(D1, Mem{Base: oup}.Offset(3*16)) + + SUBQ(Imm(64), inl) + LEAQ(Mem{Base: inp}.Offset(64), inp) + LEAQ(Mem{Base: oup}.Offset(64), oup) + JMP(LabelRef("openSSETail64DecLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of ciphertext + +// Need to decrypt up to 192 bytes - prepare three blocks +func openSSETail192() { + Label("openSSETail192") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A2) + MOVO(state1Store, B2) + MOVO(state2Store, C2) + MOVO(ctr3Store, D2) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D2) + MOVO(D2, ctr0Store) + MOVO(A2, A1) + MOVO(B2, B1) + MOVO(C2, C1) + MOVO(D2, D1) + PADDL(sseIncMask, D1) + MOVO(D1, ctr1Store) + MOVO(A1, A0) + MOVO(B1, B0) + MOVO(C1, C0) + MOVO(D1, D0) + PADDL(sseIncMask, D0) + MOVO(D0, ctr2Store) + + MOVQ(inl, itr1) + MOVQ(U32(160), itr2) + CMPQ(itr1, Imm(160)) + CMOVQGT(itr2, itr1) + ANDQ(I8(-16), itr1) + XORQ(itr2, itr2) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openSSLTail192LoopA() { + Label("openSSLTail192LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() +} + +func openSSLTail192LoopB() { + Label("openSSLTail192LoopB") + ADDQ(Imm(16), itr2) + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + shiftB2Left() + shiftC2Left() + shiftD2Left() + + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + shiftB2Right() + shiftC2Right() + shiftD2Right() + + CMPQ(itr2, itr1) + JB(LabelRef("openSSLTail192LoopA")) + + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openSSLTail192LoopB")) + + CMPQ(inl, Imm(176)) + JB(LabelRef("openSSLTail192Store")) + + polyAdd(Mem{Base: inp}.Offset(160)) + polyMul() + + CMPQ(inl, Imm(192)) + JB(LabelRef("openSSLTail192Store")) + + polyAdd(Mem{Base: inp}.Offset(176)) + polyMul() +} + +func openSSLTail192Store() { + Label("openSSLTail192Store") + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state1Store, B2) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(state2Store, C2) + PADDL(ctr2Store, D0) + PADDL(ctr1Store, D1) + PADDL(ctr0Store, D2) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A2) + PXOR(T1, B2) + PXOR(T2, C2) + PXOR(T3, D2) + MOVOU(A2, Mem{Base: oup}.Offset(0*16)) + MOVOU(B2, Mem{Base: oup}.Offset(1*16)) + MOVOU(C2, Mem{Base: oup}.Offset(2*16)) + MOVOU(D2, Mem{Base: oup}.Offset(3*16)) + + MOVOU(Mem{Base: inp}.Offset(4*16), T0) + MOVOU(Mem{Base: inp}.Offset(5*16), T1) + MOVOU(Mem{Base: inp}.Offset(6*16), T2) + MOVOU(Mem{Base: inp}.Offset(7*16), T3) + PXOR(T0, A1) + PXOR(T1, B1) + PXOR(T2, C1) + PXOR(T3, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + + SUBQ(Imm(128), inl) + LEAQ(Mem{Base: inp}.Offset(128), inp) + LEAQ(Mem{Base: oup}.Offset(128), oup) + JMP(LabelRef("openSSETail64DecLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext + +// Need to decrypt up to 256 bytes - prepare four blocks +func openSSETail256() { + Label("openSSETail256") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + XORQ(itr2, itr2) +} + +// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication +func openSSETail256Loop() { + Label("openSSETail256Loop") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + polyMulStage1() + polyMulStage2() + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyMulStage3() + polyMulReduceStage() + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + ADDQ(Imm(2*8), itr2) + CMPQ(itr2, Imm(160)) + JB(LabelRef("openSSETail256Loop")) + MOVQ(inl, itr1) + ANDQ(I8(-16), itr1) +} + +func openSSETail256HashLoop() { + Label("openSSETail256HashLoop") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() + ADDQ(Imm(2*8), itr2) + CMPQ(itr2, itr1) + JB(LabelRef("openSSETail256HashLoop")) + + Comment("Add in the state") + chacha20Constants := chacha20Constants_DATA() + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C0) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr0Store, D0) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + MOVO(D3, tmpStore) + + Comment("Load - xor - store") + MOVOU(Mem{Base: inp}.Offset(0*16), D3) + PXOR(D3, A0) + MOVOU(Mem{Base: inp}.Offset(1*16), D3) + PXOR(D3, B0) + MOVOU(Mem{Base: inp}.Offset(2*16), D3) + PXOR(D3, C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D3) + PXOR(D3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), A0) + MOVOU(Mem{Base: inp}.Offset(5*16), B0) + MOVOU(Mem{Base: inp}.Offset(6*16), C0) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(A0, A1) + PXOR(B0, B1) + PXOR(C0, C1) + PXOR(D0, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + MOVOU(Mem{Base: inp}.Offset(8*16), A0) + MOVOU(Mem{Base: inp}.Offset(9*16), B0) + MOVOU(Mem{Base: inp}.Offset(10*16), C0) + MOVOU(Mem{Base: inp}.Offset(11*16), D0) + PXOR(A0, A2) + PXOR(B0, B2) + PXOR(C0, C2) + PXOR(D0, D2) + MOVOU(A2, Mem{Base: oup}.Offset(8*16)) + MOVOU(B2, Mem{Base: oup}.Offset(9*16)) + MOVOU(C2, Mem{Base: oup}.Offset(10*16)) + MOVOU(D2, Mem{Base: oup}.Offset(11*16)) + LEAQ(Mem{Base: inp}.Offset(192), inp) + LEAQ(Mem{Base: oup}.Offset(192), oup) + SUBQ(Imm(192), inl) + MOVO(A3, A0) + MOVO(B3, B0) + MOVO(C3, C0) + MOVO(tmpStore, D0) + + JMP(LabelRef("openSSETail64DecLoop")) +} + +// Functions to emit AVX instructions via BYTE directive + +// broadcasti128 16(r8), ymm14 +func VBROADCASTI128_16_R8_YMM14() { + BYTE(U8(0xc4)) + BYTE(U8(0x42)) + BYTE(U8(0x7d)) + BYTE(U8(0x5a)) + BYTE(U8(0x70)) + BYTE(U8(0x10)) +} + +// broadcasti128 32(r8), ymm12 +func VBROADCASTI128_32_R8_YMM12() { + BYTE(U8(0xc4)) + BYTE(U8(0x42)) + BYTE(U8(0x7d)) + BYTE(U8(0x5a)) + BYTE(U8(0x60)) + BYTE(U8(0x20)) +} + +// broadcasti128 48(r8), ymm4 +func VBROADCASTI128_48_R8_YMM4() { + BYTE(U8(0xc4)) + BYTE(U8(0xc2)) + BYTE(U8(0x7d)) + BYTE(U8(0x5a)) + BYTE(U8(0x60)) + BYTE(U8(0x30)) +} + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- + +func chacha20Poly1305Open_AVX2() { + Label("chacha20Poly1305Open_AVX2") + VZEROUPPER() + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VBROADCASTI128_16_R8_YMM14() + VBROADCASTI128_32_R8_YMM12() + VBROADCASTI128_48_R8_YMM4() + avx2InitMask := avx2InitMask_DATA() + VPADDD(avx2InitMask, DD0, DD0) + + Comment("Special optimization, for very short buffers") + CMPQ(inl, Imm(192)) + JBE(LabelRef("openAVX2192")) + CMPQ(inl, U32(320)) + JBE(LabelRef("openAVX2320")) + + Comment("For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream") + VMOVDQA(BB0, state1StoreAVX2) + VMOVDQA(CC0, state2StoreAVX2) + VMOVDQA(DD0, ctr3StoreAVX2) + MOVQ(U32(10), itr2) +} + +func openAVX2PreparePolyKey() { + Label("openAVX2PreparePolyKey") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + DECQ(itr2) + JNE(LabelRef("openAVX2PreparePolyKey")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(ctr3StoreAVX2, DD0, DD0) + + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for the first 64 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + + Comment("Hash AD + first 64 bytes") + // MOVQ ad_len+80(FP), itr2 + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + XORQ(itr1, itr1) +} + +func openAVX2InitialHash64() { + Label("openAVX2InitialHash64") + // polyAdd(0(inp)(itr1*1)) + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(0)) + polyMulAVX2() + ADDQ(Imm(16), itr1) + CMPQ(itr1, Imm(64)) + JNE(LabelRef("openAVX2InitialHash64")) + + Comment("Decrypt the first 64 bytes") + VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(1*32), BB0, BB0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(1*32)) + LEAQ(Mem{Base: inp}.Offset(2*32), inp) + LEAQ(Mem{Base: oup}.Offset(2*32), oup) + SUBQ(Imm(64), inl) +} + +func openAVX2MainLoop() { + Label("openAVX2MainLoop") + CMPQ(inl, U32(512)) + JB(LabelRef("openAVX2MainLoopDone")) + + Comment("Load state, increment counter blocks, store the incremented counters") + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + XORQ(itr1, itr1) +} + +// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications +// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext +func openAVX2InternalLoop() { + Label("openAVX2InternalLoop") + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(0 * 8)) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage1_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulStage2_AVX2() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyMulStage3_AVX2() + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(2 * 8)) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage1_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage2_AVX2() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage3_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulReduceStage() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(4 * 8)) + LEAQ(Mem{Base: itr1}.Offset(6*8), itr1) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage1_AVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + polyMulStage2_AVX2() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage3_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + CMPQ(itr1, U32(480)) + JNE(LabelRef("openAVX2InternalLoop")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + + Comment("We only hashed 480 of the 512 bytes available - hash the remaining 32 here") + polyAdd(Mem{Base: inp}.Offset(480)) + polyMulAVX2() + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPERM2I128(Imm(0x13), AA0, BB0, BB0) + VPERM2I128(Imm(0x02), CC0, DD0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + + Comment("and here") + polyAdd(Mem{Base: inp}.Offset(496)) + polyMulAVX2() + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + VPXOR(Mem{Base: inp}.Offset(12*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(13*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(14*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(15*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(12*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(13*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(14*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(15*32)) + LEAQ(Mem{Base: inp}.Offset(32*16), inp) + LEAQ(Mem{Base: oup}.Offset(32*16), oup) + SUBQ(U32(32*16), inl) + JMP(LabelRef("openAVX2MainLoop")) +} + +// Handle the various tail sizes efficiently +func openAVX2MainLoopDone() { + Label("openAVX2MainLoopDone") + Comment("Handle the various tail sizes efficiently") + TESTQ(inl, inl) + JE(LabelRef("openSSEFinalize")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("openAVX2Tail128")) + CMPQ(inl, U32(256)) + JBE(LabelRef("openAVX2Tail256")) + CMPQ(inl, U32(384)) + JBE(LabelRef("openAVX2Tail384")) + JMP(LabelRef("openAVX2Tail512")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes + +// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks +func openAVX2192() { + Label("openAVX2192") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VMOVDQA(DD0, DD2) + VMOVDQA(DD1, TT3) + MOVQ(U32(10), itr2) +} + +func openAVX2192InnerCipherLoop() { + Label("openAVX2192InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + DECQ(itr2) + JNE(LabelRef("openAVX2192InnerCipherLoop")) + VPADDD(AA2, AA0, AA0) + VPADDD(AA2, AA1, AA1) + VPADDD(BB2, BB0, BB0) + VPADDD(BB2, BB1, BB1) + VPADDD(CC2, CC0, CC0) + VPADDD(CC2, CC1, CC1) + VPADDD(DD2, DD0, DD0) + VPADDD(TT3, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 192 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) +} + +func openAVX2ShortOpen() { + Label("openAVX2ShortOpen") + Comment("Hash") + Load(Param("ad").Len(), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) +} + +func openAVX2ShortOpenLoop() { + Label("openAVX2ShortOpenLoop") + CMPQ(inl, Imm(32)) + JB(LabelRef("openAVX2ShortTail32")) + SUBQ(Imm(32), inl) + + Comment("Load for hashing") + polyAdd(Mem{Base: inp}.Offset(0 * 8)) + polyMulAVX2() + polyAdd(Mem{Base: inp}.Offset(2 * 8)) + polyMulAVX2() + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, AA0, AA0) + VMOVDQU(AA0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*32), inp) + LEAQ(Mem{Base: oup}.Offset(1*32), oup) + + Comment("Shift stream left") + VMOVDQA(BB0, AA0) + VMOVDQA(CC0, BB0) + VMOVDQA(DD0, CC0) + VMOVDQA(AA1, DD0) + VMOVDQA(BB1, AA1) + VMOVDQA(CC1, BB1) + VMOVDQA(DD1, CC1) + VMOVDQA(AA2, DD1) + VMOVDQA(BB2, AA2) + JMP(LabelRef("openAVX2ShortOpenLoop")) +} + +func openAVX2ShortTail32() { + Label("openAVX2ShortTail32") + CMPQ(inl, Imm(16)) + VMOVDQA(A0, A1) + JB(LabelRef("openAVX2ShortDone")) + + SUBQ(Imm(16), inl) + + Comment("Load for hashing") + polyAdd(Mem{Base: inp}.Offset(0 * 8)) + polyMulAVX2() + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, A0, T0) + VMOVDQU(T0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + VPERM2I128(Imm(0x11), AA0, AA0, AA0) + VMOVDQA(A0, A1) +} + +func openAVX2ShortDone() { + Label("openAVX2ShortDone") + VZEROUPPER() + JMP(LabelRef("openSSETail16")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes + +// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks +func openAVX2320() { + Label("openAVX2320") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(BB0, TT1) + VMOVDQA(CC0, TT2) + VMOVDQA(DD0, TT3) + MOVQ(U32(10), itr2) +} + +func openAVX2320InnerCipherLoop() { + Label("openAVX2320InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + DECQ(itr2) + JNE(LabelRef("openAVX2320InnerCipherLoop")) + + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, TT0) + VPADDD(TT0, AA0, AA0) + VPADDD(TT0, AA1, AA1) + VPADDD(TT0, AA2, AA2) + VPADDD(TT1, BB0, BB0) + VPADDD(TT1, BB1, BB1) + VPADDD(TT1, BB2, BB2) + VPADDD(TT2, CC0, CC0) + VPADDD(TT2, CC1, CC1) + VPADDD(TT2, CC2, CC2) + avx2IncMask := avx2IncMask_DATA() + VMOVDQA(avx2IncMask, TT0) + VPADDD(TT3, DD0, DD0) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD1, DD1) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD2, DD2) + + Comment("Clamp and store poly key") + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 320 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) + VPERM2I128(Imm(0x02), AA2, BB2, CC1) + VPERM2I128(Imm(0x02), CC2, DD2, DD1) + VPERM2I128(Imm(0x13), AA2, BB2, AA2) + VPERM2I128(Imm(0x13), CC2, DD2, BB2) + JMP(LabelRef("openAVX2ShortOpen")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext + +// Need to decrypt up to 128 bytes - prepare two blocks +func openAVX2Tail128() { + Label("openAVX2Tail128") + Comment("Need to decrypt up to 128 bytes - prepare two blocks") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA1) + VMOVDQA(state1StoreAVX2, BB1) + VMOVDQA(state2StoreAVX2, CC1) + VMOVDQA(ctr3StoreAVX2, DD1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD1, DD1) + VMOVDQA(DD1, DD0) + + XORQ(itr2, itr2) + MOVQ(inl, itr1) + ANDQ(I8(-16), itr1) + TESTQ(itr1, itr1) + JE(LabelRef("openAVX2Tail128LoopB")) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openAVX2Tail128LoopA() { + Label("openAVX2Tail128LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMulAVX2() +} + +func openAVX2Tail128LoopB() { + Label("openAVX2Tail128LoopB") + ADDQ(Imm(16), itr2) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD1, DD1, DD1) + CMPQ(itr2, itr1) + JB(LabelRef("openAVX2Tail128LoopA")) + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openAVX2Tail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(DD0, DD1, DD1) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) +} + +func openAVX2TailLoop() { + Label("openAVX2TailLoop") + CMPQ(inl, Imm(32)) + JB(LabelRef("openAVX2Tail")) + SUBQ(Imm(32), inl) + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, AA0, AA0) + VMOVDQU(AA0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*32), inp) + LEAQ(Mem{Base: oup}.Offset(1*32), oup) + VMOVDQA(BB0, AA0) + VMOVDQA(CC0, BB0) + VMOVDQA(DD0, CC0) + JMP(LabelRef("openAVX2TailLoop")) +} + +func openAVX2Tail() { + Label("openAVX2Tail") + CMPQ(inl, Imm(16)) + VMOVDQA(A0, A1) + JB(LabelRef("openAVX2TailDone")) + SUBQ(Imm(16), inl) + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, A0, T0) + VMOVDQU(T0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + VPERM2I128(Imm(0x11), AA0, AA0, AA0) + VMOVDQA(A0, A1) +} + +func openAVX2TailDone() { + Label("openAVX2TailDone") + VZEROUPPER() + JMP(LabelRef("openSSETail16")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext + +// Need to decrypt up to 256 bytes - prepare four blocks +func openAVX2Tail256() { + Label("openAVX2Tail256") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(DD0, TT1) + VMOVDQA(DD1, TT2) + + Comment("Compute the number of iterations that will hash data") + MOVQ(inl, tmpStoreAVX2) + MOVQ(inl, itr1) + SUBQ(Imm(128), itr1) + SHRQ(Imm(4), itr1) + MOVQ(U32(10), itr2) + CMPQ(itr1, Imm(10)) + CMOVQGT(itr2, itr1) + MOVQ(inp, inl) + XORQ(itr2, itr2) +} + +func openAVX2Tail256LoopA() { + Label("openAVX2Tail256LoopA") + polyAdd(Mem{Base: inl}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: inl}.Offset(16), inl) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openAVX2Tail256LoopB() { + Label("openAVX2Tail256LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + INCQ(itr2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + CMPQ(itr2, itr1) + JB(LabelRef("openAVX2Tail256LoopA")) + + CMPQ(itr2, Imm(10)) + JNE(LabelRef("openAVX2Tail256LoopB")) + + MOVQ(inl, itr2) + SUBQ(inp, inl) + MOVQ(inl, itr1) + MOVQ(tmpStoreAVX2, inl) +} + +// Hash the remainder of data (if any) +func openAVX2Tail256Hash() { + Label("openAVX2Tail256Hash") + ADDQ(Imm(16), itr1) + CMPQ(itr1, inl) + JGT(LabelRef("openAVX2Tail256HashEnd")) + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(16), itr2) + JMP(LabelRef("openAVX2Tail256Hash")) +} + +// Store 128 bytes safely, then go to store loop +func openAVX2Tail256HashEnd() { + Label("openAVX2Tail256HashEnd") + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(TT1, DD0, DD0) + VPADDD(TT2, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, AA2) + VPERM2I128(Imm(0x02), CC0, DD0, BB2) + VPERM2I128(Imm(0x13), AA0, BB0, CC2) + VPERM2I128(Imm(0x13), CC0, DD0, DD2) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + + VPXOR(Mem{Base: inp}.Offset(0*32), AA2, AA2) + VPXOR(Mem{Base: inp}.Offset(1*32), BB2, BB2) + VPXOR(Mem{Base: inp}.Offset(2*32), CC2, CC2) + VPXOR(Mem{Base: inp}.Offset(3*32), DD2, DD2) + VMOVDQU(AA2, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(BB2, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(CC2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(DD2, Mem{Base: oup}.Offset(3*32)) + LEAQ(Mem{Base: inp}.Offset(4*32), inp) + LEAQ(Mem{Base: oup}.Offset(4*32), oup) + SUBQ(Imm(4*32), inl) + + JMP(LabelRef("openAVX2TailLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext + +// Need to decrypt up to 384 bytes - prepare six blocks +func openAVX2Tail384() { + Label("openAVX2Tail384") + Comment("Need to decrypt up to 384 bytes - prepare six blocks") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + + Comment("Compute the number of iterations that will hash two blocks of data") + MOVQ(inl, tmpStoreAVX2) + MOVQ(inl, itr1) + SUBQ(U32(256), itr1) + SHRQ(Imm(4), itr1) + ADDQ(Imm(6), itr1) + MOVQ(U32(10), itr2) + CMPQ(itr1, Imm(10)) + CMOVQGT(itr2, itr1) + MOVQ(inp, inl) + XORQ(itr2, itr2) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openAVX2Tail384LoopB() { + Label("openAVX2Tail384LoopB") + polyAdd(Mem{Base: inl}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: inl}.Offset(16), inl) +} + +func openAVX2Tail384LoopA() { + Label("openAVX2Tail384LoopA") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + polyAdd(Mem{Base: inl}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: inl}.Offset(16), inl) + INCQ(itr2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + + CMPQ(itr2, itr1) + JB(LabelRef("openAVX2Tail384LoopB")) + + CMPQ(itr2, Imm(10)) + JNE(LabelRef("openAVX2Tail384LoopA")) + + MOVQ(inl, itr2) + SUBQ(inp, inl) + MOVQ(inl, itr1) + MOVQ(tmpStoreAVX2, inl) +} + +func openAVX2Tail384Hash() { + Label("openAVX2Tail384Hash") + ADDQ(Imm(16), itr1) + CMPQ(itr1, inl) + JGT(LabelRef("openAVX2Tail384HashEnd")) + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(16), itr2) + JMP(LabelRef("openAVX2Tail384Hash")) +} + +// Store 256 bytes safely, then go to store loop +func openAVX2Tail384HashEnd() { + Label("openAVX2Tail384HashEnd") + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + VPERM2I128(Imm(0x02), CC0, DD0, TT1) + VPERM2I128(Imm(0x13), AA0, BB0, TT2) + VPERM2I128(Imm(0x13), CC0, DD0, TT3) + VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, TT0) + VPERM2I128(Imm(0x02), CC1, DD1, TT1) + VPERM2I128(Imm(0x13), AA1, BB1, TT2) + VPERM2I128(Imm(0x13), CC1, DD1, TT3) + VPXOR(Mem{Base: inp}.Offset(4*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(5*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(6*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(7*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(7*32)) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + LEAQ(Mem{Base: inp}.Offset(8*32), inp) + LEAQ(Mem{Base: oup}.Offset(8*32), oup) + SUBQ(U32(8*32), inl) + JMP(LabelRef("openAVX2TailLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext + +func openAVX2Tail512() { + Label("openAVX2Tail512") + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + XORQ(itr1, itr1) + MOVQ(inp, itr2) +} + +func openAVX2Tail512LoopB() { + Label("openAVX2Tail512LoopB") + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(2*8), itr2) +} + +func openAVX2Tail512LoopA() { + Label("openAVX2Tail512LoopA") + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyAdd(Mem{Base: itr2}.Offset(0 * 8)) + polyMulAVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: itr2}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(4*8), itr2) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + INCQ(itr1) + CMPQ(itr1, Imm(4)) + JLT(LabelRef("openAVX2Tail512LoopB")) + + CMPQ(itr1, Imm(10)) + JNE(LabelRef("openAVX2Tail512LoopA")) + + MOVQ(inl, itr1) + SUBQ(U32(384), itr1) + ANDQ(I8(-16), itr1) +} + +func openAVX2Tail512HashLoop() { + Label("openAVX2Tail512HashLoop") + TESTQ(itr1, itr1) + JE(LabelRef("openAVX2Tail512HashEnd")) + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(16), itr2) + SUBQ(Imm(16), itr1) + JMP(LabelRef("openAVX2Tail512HashLoop")) +} + +func openAVX2Tail512HashEnd() { + Label("openAVX2Tail512HashEnd") + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPERM2I128(Imm(0x13), AA0, BB0, BB0) + VPERM2I128(Imm(0x02), CC0, DD0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + + LEAQ(Mem{Base: inp}.Offset(12*32), inp) + LEAQ(Mem{Base: oup}.Offset(12*32), oup) + SUBQ(U32(12*32), inl) + + JMP(LabelRef("openAVX2TailLoop")) +} + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- + +// Implements the following function fignature: +// +// func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) +func chacha20Poly1305Seal() { + Implement("chacha20Poly1305Seal") + Attributes(0) + AllocLocal(288) + + MOVQ(RSP, RBP) + ADDQ(Imm(32), RBP) + ANDQ(I32(-32), RBP) + Load(Param("dst").Base(), oup) + Load(Param("key").Base(), keyp) + Load(Param("src").Base(), inp) + Load(Param("src").Len(), inl) + Load(Param("ad").Base(), adp) + + CMPB(Mem{Symbol: Symbol{Name: ThatPeskyUnicodeDot + "useAVX2"}, Base: StaticBase}, Imm(1)) + JE(LabelRef("chacha20Poly1305Seal_AVX2")) + + Comment("Special optimization, for very short buffers") + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealSSE128")) + + Comment("In the seal case - prepare the poly key + 3 blocks of stream in the first iteration") + chacha20Constants := chacha20Constants_DATA() + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + + Comment("Store state on stack for future use") + MOVO(B0, state1Store) + MOVO(C0, state2Store) + + Comment("Load state, increment counter blocks") + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + MOVQ(U32(10), itr2) + + sealSSEIntroLoop() + sealSSEMainLoop() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 64 bytes of plaintext + sealSSETail64() + sealSSETail64LoopA() + sealSSETail64LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of plaintext + sealSSETail128() + sealSSETail128LoopA() + sealSSETail128LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 192 bytes of plaintext + sealSSETail192() + sealSSETail192LoopA() + sealSSETail192LoopB() + + // ---------------------------------------------------------------------------- + // Special seal optimization for buffers smaller than 129 bytes + sealSSE128() + sealSSE128SealHash() + sealSSE128Seal() + sealSSETail() + sealSSETailLoadLoop() + sealSSEFinalize() + + // ---------------------------------------------------------------------------- + // ------------------------- AVX2 Code ---------------------------------------- + chacha20Poly1305Seal_AVX2() + sealAVX2IntroLoop() + sealAVX2MainLoop() + sealAVX2InternalLoop() + sealAVX2InternalLoopStart() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 193 bytes + seal192AVX2() + sealAVX2192InnerCipherLoop() + sealAVX2ShortSeal() + sealAVX2SealHash() + sealAVX2ShortSealLoop() + sealAVX2ShortTail32() + sealAVX2ShortDone() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 321 bytes + seal320AVX2() + sealAVX2320InnerCipherLoop() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of ciphertext + sealAVX2Tail128() + sealAVX2Tail128LoopA() + sealAVX2Tail128LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 256 bytes of ciphertext + sealAVX2Tail256() + sealAVX2Tail256LoopA() + sealAVX2Tail256LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 384 bytes of ciphertext + sealAVX2Tail384() + sealAVX2Tail384LoopA() + sealAVX2Tail384LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 512 bytes of ciphertext + sealAVX2Tail512() + sealAVX2Tail512LoopA() + sealAVX2Tail512LoopB() +} + +func sealSSEIntroLoop() { + Label("sealSSEIntroLoop") + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + DECQ(itr2) + JNE(LabelRef("sealSSEIntroLoop")) + + Comment("Add in the state") + chacha20Constants := chacha20Constants_DATA() + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + + Comment("Clamp and store the key") + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVO(A0, rStore) + MOVO(B0, sStore) + + Comment("Hash AAD") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + + MOVOU(Mem{Base: inp}.Offset(0*16), A0) + MOVOU(Mem{Base: inp}.Offset(1*16), B0) + MOVOU(Mem{Base: inp}.Offset(2*16), C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D0) + PXOR(A0, A1) + PXOR(B0, B1) + PXOR(C0, C1) + PXOR(D0, D1) + MOVOU(A1, Mem{Base: oup}.Offset(0*16)) + MOVOU(B1, Mem{Base: oup}.Offset(1*16)) + MOVOU(C1, Mem{Base: oup}.Offset(2*16)) + MOVOU(D1, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), A0) + MOVOU(Mem{Base: inp}.Offset(5*16), B0) + MOVOU(Mem{Base: inp}.Offset(6*16), C0) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(A0, A2) + PXOR(B0, B2) + PXOR(C0, C2) + PXOR(D0, D2) + MOVOU(A2, Mem{Base: oup}.Offset(4*16)) + MOVOU(B2, Mem{Base: oup}.Offset(5*16)) + MOVOU(C2, Mem{Base: oup}.Offset(6*16)) + MOVOU(D2, Mem{Base: oup}.Offset(7*16)) + + MOVQ(U32(128), itr1) + SUBQ(Imm(128), inl) + LEAQ(Mem{Base: inp}.Offset(128), inp) + + MOVO(A3, A1) + MOVO(B3, B1) + MOVO(C3, C1) + MOVO(D3, D1) + + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSE128SealHash")) + + MOVOU(Mem{Base: inp}.Offset(0*16), A0) + MOVOU(Mem{Base: inp}.Offset(1*16), B0) + MOVOU(Mem{Base: inp}.Offset(2*16), C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D0) + PXOR(A0, A3) + PXOR(B0, B3) + PXOR(C0, C3) + PXOR(D0, D3) + MOVOU(A3, Mem{Base: oup}.Offset(8*16)) + MOVOU(B3, Mem{Base: oup}.Offset(9*16)) + MOVOU(C3, Mem{Base: oup}.Offset(10*16)) + MOVOU(D3, Mem{Base: oup}.Offset(11*16)) + + ADDQ(Imm(64), itr1) + SUBQ(Imm(64), inl) + LEAQ(Mem{Base: inp}.Offset(64), inp) + + MOVQ(U32(2), itr1) + MOVQ(U32(8), itr2) + + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSETail64")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealSSETail128")) + CMPQ(inl, Imm(192)) + JBE(LabelRef("sealSSETail192")) +} + +func sealSSEMainLoop() { + Label("sealSSEMainLoop") + Comment("Load state, increment counter blocks") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + + Label("sealSSEInnerLoop") + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyAdd(Mem{Base: oup}.Offset(0)) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + polyMulStage1() + polyMulStage2() + LEAQ(Mem{Base: oup}.Offset(2*8), oup) + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + polyMulStage3() + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyMulReduceStage() + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + DECQ(itr2) + JGE(LabelRef("sealSSEInnerLoop")) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(2*8), oup) + DECQ(itr1) + JG(LabelRef("sealSSEInnerLoop")) + + Comment("Add in the state") + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C0) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr0Store, D0) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + MOVO(D3, tmpStore) + + Comment("Load - xor - store") + MOVOU(Mem{Base: inp}.Offset(0*16), D3) + PXOR(D3, A0) + MOVOU(Mem{Base: inp}.Offset(1*16), D3) + PXOR(D3, B0) + MOVOU(Mem{Base: inp}.Offset(2*16), D3) + PXOR(D3, C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D3) + PXOR(D3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVO(tmpStore, D3) + + MOVOU(Mem{Base: inp}.Offset(4*16), A0) + MOVOU(Mem{Base: inp}.Offset(5*16), B0) + MOVOU(Mem{Base: inp}.Offset(6*16), C0) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(A0, A1) + PXOR(B0, B1) + PXOR(C0, C1) + PXOR(D0, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + MOVOU(Mem{Base: inp}.Offset(8*16), A0) + MOVOU(Mem{Base: inp}.Offset(9*16), B0) + MOVOU(Mem{Base: inp}.Offset(10*16), C0) + MOVOU(Mem{Base: inp}.Offset(11*16), D0) + PXOR(A0, A2) + PXOR(B0, B2) + PXOR(C0, C2) + PXOR(D0, D2) + MOVOU(A2, Mem{Base: oup}.Offset(8*16)) + MOVOU(B2, Mem{Base: oup}.Offset(9*16)) + MOVOU(C2, Mem{Base: oup}.Offset(10*16)) + MOVOU(D2, Mem{Base: oup}.Offset(11*16)) + ADDQ(Imm(192), inp) + MOVQ(U32(192), itr1) + SUBQ(Imm(192), inl) + MOVO(A3, A1) + MOVO(B3, B1) + MOVO(C3, C1) + MOVO(D3, D1) + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSE128SealHash")) + MOVOU(Mem{Base: inp}.Offset(0*16), A0) + MOVOU(Mem{Base: inp}.Offset(1*16), B0) + MOVOU(Mem{Base: inp}.Offset(2*16), C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D0) + PXOR(A0, A3) + PXOR(B0, B3) + PXOR(C0, C3) + PXOR(D0, D3) + MOVOU(A3, Mem{Base: oup}.Offset(12*16)) + MOVOU(B3, Mem{Base: oup}.Offset(13*16)) + MOVOU(C3, Mem{Base: oup}.Offset(14*16)) + MOVOU(D3, Mem{Base: oup}.Offset(15*16)) + LEAQ(Mem{Base: inp}.Offset(64), inp) + SUBQ(Imm(64), inl) + MOVQ(U32(6), itr1) + MOVQ(U32(4), itr2) + CMPQ(inl, Imm(192)) + JG(LabelRef("sealSSEMainLoop")) + + MOVQ(inl, itr1) + TESTQ(inl, inl) + JE(LabelRef("sealSSE128SealHash")) + MOVQ(U32(6), itr1) + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSETail64")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealSSETail128")) + JMP(LabelRef("sealSSETail192")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of plaintext + +// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes +func sealSSETail64() { + Label("sealSSETail64") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A1) + MOVO(state1Store, B1) + MOVO(state2Store, C1) + MOVO(ctr3Store, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(D1, ctr0Store) +} + +// Perform ChaCha rounds, while hashing the previously encrypted ciphertext +func sealSSETail64LoopA() { + Label("sealSSETail64LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealSSETail64LoopB() { + Label("sealSSETail64LoopB") + chachaQR(A1, B1, C1, D1, T1) + shiftB1Left() + shiftC1Left() + shiftD1Left() + chachaQR(A1, B1, C1, D1, T1) + shiftB1Right() + shiftC1Right() + shiftD1Right() + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) + + DECQ(itr1) + JG(LabelRef("sealSSETail64LoopA")) + + DECQ(itr2) + JGE(LabelRef("sealSSETail64LoopB")) + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A1) + PADDL(state1Store, B1) + PADDL(state2Store, C1) + PADDL(ctr0Store, D1) + + JMP(LabelRef("sealSSE128Seal")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of plaintext + +// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes +func sealSSETail128() { + Label("sealSSETail128") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(D0, ctr0Store) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(D1, ctr1Store) +} + +// Perform ChaCha rounds, while hashing the previously encrypted ciphertext +func sealSSETail128LoopA() { + Label("sealSSETail128LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealSSETail128LoopB() { + Label("sealSSETail128LoopB") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + + DECQ(itr1) + JG(LabelRef("sealSSETail128LoopA")) + + DECQ(itr2) + JGE(LabelRef("sealSSETail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(ctr0Store, D0) + PADDL(ctr1Store, D1) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A0) + PXOR(T1, B0) + PXOR(T2, C0) + PXOR(T3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + + MOVQ(U32(64), itr1) + LEAQ(Mem{Base: inp}.Offset(64), inp) + SUBQ(Imm(64), inl) + + JMP(LabelRef("sealSSE128SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of plaintext + +// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes +func sealSSETail192() { + Label("sealSSETail192") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(D0, ctr0Store) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(D1, ctr1Store) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(D2, ctr2Store) +} + +// Perform ChaCha rounds, while hashing the previously encrypted ciphertext +func sealSSETail192LoopA() { + Label("sealSSETail192LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealSSETail192LoopB() { + Label("sealSSETail192LoopB") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + shiftB2Left() + shiftC2Left() + shiftD2Left() + + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) + + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + shiftB2Right() + shiftC2Right() + shiftD2Right() + + DECQ(itr1) + JG(LabelRef("sealSSETail192LoopA")) + + DECQ(itr2) + JGE(LabelRef("sealSSETail192LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state1Store, B2) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(state2Store, C2) + PADDL(ctr0Store, D0) + PADDL(ctr1Store, D1) + PADDL(ctr2Store, D2) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A0) + PXOR(T1, B0) + PXOR(T2, C0) + PXOR(T3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), T0) + MOVOU(Mem{Base: inp}.Offset(5*16), T1) + MOVOU(Mem{Base: inp}.Offset(6*16), T2) + MOVOU(Mem{Base: inp}.Offset(7*16), T3) + PXOR(T0, A1) + PXOR(T1, B1) + PXOR(T2, C1) + PXOR(T3, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + + MOVO(A2, A1) + MOVO(B2, B1) + MOVO(C2, C1) + MOVO(D2, D1) + MOVQ(U32(128), itr1) + LEAQ(Mem{Base: inp}.Offset(128), inp) + SUBQ(Imm(128), inl) + + JMP(LabelRef("sealSSE128SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special seal optimization for buffers smaller than 129 bytes + +// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks +func sealSSE128() { + Label("sealSSE128") + chacha20Constants := chacha20Constants_DATA() + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(B0, T1) + MOVO(C0, T2) + MOVO(D1, T3) + MOVQ(U32(10), itr2) + + Label("sealSSE128InnerCipherLoop") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + DECQ(itr2) + JNE(LabelRef("sealSSE128InnerCipherLoop")) + + Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded") + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(T1, B0) + PADDL(T1, B1) + PADDL(T1, B2) + PADDL(T2, C1) + PADDL(T2, C2) + PADDL(T3, D1) + PADDL(sseIncMask, T3) + PADDL(T3, D2) + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVOU(A0, rStore) + MOVOU(B0, sStore) + + Comment("Hash") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + XORQ(itr1, itr1) +} + +// itr1 holds the number of bytes encrypted but not yet hashed +func sealSSE128SealHash() { + Label("sealSSE128SealHash") + CMPQ(itr1, Imm(16)) + JB(LabelRef("sealSSE128Seal")) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + + SUBQ(Imm(16), itr1) + ADDQ(Imm(16), oup) + + JMP(LabelRef("sealSSE128SealHash")) +} + +func sealSSE128Seal() { + Label("sealSSE128Seal") + CMPQ(inl, Imm(16)) + JB(LabelRef("sealSSETail")) + SUBQ(Imm(16), inl) + + Comment("Load for decryption") + MOVOU(Mem{Base: inp}, T0) + PXOR(T0, A1) + MOVOU(A1, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + + Comment("Extract for hashing") + MOVQ(A1, t0) + PSRLDQ(Imm(8), A1) + MOVQ(A1, t1) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() + + Comment("Shift the stream \"left\"") + MOVO(B1, A1) + MOVO(C1, B1) + MOVO(D1, C1) + MOVO(A2, D1) + MOVO(B2, A2) + MOVO(C2, B2) + MOVO(D2, C2) + JMP(LabelRef("sealSSE128Seal")) +} + +func sealSSETail() { + Label("sealSSETail") + TESTQ(inl, inl) + JE(LabelRef("sealSSEFinalize")) + + Comment("We can only load the PT one byte at a time to avoid read after end of buffer") + MOVQ(inl, itr2) + SHLQ(Imm(4), itr2) + andMask := andMask_DATA() + LEAQ(andMask, t0) + MOVQ(inl, itr1) + LEAQ(Mem{Base: inp, Index: inl, Scale: 1}.Offset(-1), inp) + XORQ(t2, t2) + XORQ(t3, t3) + XORQ(RAX, RAX) +} + +func sealSSETailLoadLoop() { + Label("sealSSETailLoadLoop") + SHLQ(Imm(8), t2, t3) + SHLQ(Imm(8), t2) + // Hack to get Avo to emit: + // MOVB (inp), AX + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{Mem{Base: inp}, AX}}) + XORQ(RAX, t2) + LEAQ(Mem{Base: inp}.Offset(-1), inp) + DECQ(itr1) + JNE(LabelRef("sealSSETailLoadLoop")) + MOVQ(t2, tmpStore.Offset(0)) + MOVQ(t3, tmpStore.Offset(8)) + PXOR(tmpStore.Offset(0), A1) + MOVOU(A1, Mem{Base: oup}) + MOVOU(Mem{Base: t0, Index: itr2, Scale: 1}.Offset(-16), T0) + PAND(T0, A1) + MOVQ(A1, t0) + PSRLDQ(Imm(8), A1) + MOVQ(A1, t1) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() + + ADDQ(inl, oup) +} + +func sealSSEFinalize() { + Label("sealSSEFinalize") + Comment("Hash in the buffer lengths") + ADDQ(NewParamAddr("ad_len", 80), acc0) + ADCQ(NewParamAddr("src_len", 56), acc1) + ADCQ(Imm(1), acc2) + polyMul() + + Comment("Final reduce") + MOVQ(acc0, t0) + MOVQ(acc1, t1) + MOVQ(acc2, t2) + SUBQ(I8(-5), acc0) + SBBQ(I8(-1), acc1) + SBBQ(Imm(3), acc2) + CMOVQCS(t0, acc0) + CMOVQCS(t1, acc1) + CMOVQCS(t2, acc2) + + Comment("Add in the \"s\" part of the key") + ADDQ(sStore.Offset(0), acc0) + ADCQ(sStore.Offset(8), acc1) + + Comment("Finally store the tag at the end of the message") + MOVQ(acc0, Mem{Base: oup}.Offset(0*8)) + MOVQ(acc1, Mem{Base: oup}.Offset(1*8)) + RET() +} + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- + +func chacha20Poly1305Seal_AVX2() { + Label("chacha20Poly1305Seal_AVX2") + VZEROUPPER() + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VBROADCASTI128_16_R8_YMM14() + VBROADCASTI128_32_R8_YMM12() + VBROADCASTI128_48_R8_YMM4() + avx2InitMask := avx2InitMask_DATA() + VPADDD(avx2InitMask, DD0, DD0) + + Comment("Special optimizations, for very short buffers") + CMPQ(inl, U32(192)) + JBE(LabelRef("seal192AVX2")) + CMPQ(inl, U32(320)) + JBE(LabelRef("seal320AVX2")) + + Comment("For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream") + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(BB0, state1StoreAVX2) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(CC0, state2StoreAVX2) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(DD0, ctr0StoreAVX2) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(DD1, ctr1StoreAVX2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + MOVQ(U32(10), itr2) +} + +func sealAVX2IntroLoop() { + Label("sealAVX2IntroLoop") + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD3, DD3, DD3) + + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD3, DD3, DD3) + DECQ(itr2) + JNE(LabelRef("sealAVX2IntroLoop")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPERM2I128(Imm(0x02), AA0, BB0, DD0) + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, DD0, DD0) + VMOVDQA(DD0, rsStoreAVX2) + + Comment("Hash AD") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + + Comment("Can store at least 320 bytes") + VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(1*32), CC0, CC0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(1*32)) + + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(2*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(3*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(4*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(5*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(3*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(5*32)) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(6*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(7*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(8*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(9*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(7*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(9*32)) + + MOVQ(U32(320), itr1) + SUBQ(U32(320), inl) + LEAQ(Mem{Base: inp}.Offset(320), inp) + + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), CC3, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), CC3, DD3, DD0) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealAVX2SealHash")) + + VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(1*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(2*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(3*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(11*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(12*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(13*32)) + SUBQ(Imm(128), inl) + LEAQ(Mem{Base: inp}.Offset(128), inp) + + MOVQ(U32(8), itr1) + MOVQ(U32(2), itr2) + + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealAVX2Tail128")) + CMPQ(inl, U32(256)) + JBE(LabelRef("sealAVX2Tail256")) + CMPQ(inl, U32(384)) + JBE(LabelRef("sealAVX2Tail384")) + CMPQ(inl, U32(512)) + JBE(LabelRef("sealAVX2Tail512")) + + Comment("We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop") + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD3, DD3, DD3) + + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + + SUBQ(Imm(16), oup) // Adjust the pointer + MOVQ(U32(9), itr1) + JMP(LabelRef("sealAVX2InternalLoopStart")) +} + +// Load state, increment counter blocks, store the incremented counters +func sealAVX2MainLoop() { + Label("sealAVX2MainLoop") + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + MOVQ(U32(10), itr1) +} + +func sealAVX2InternalLoop() { + Label("sealAVX2InternalLoop") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage1_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulStage2_AVX2() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyMulStage3_AVX2() + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() +} + +func sealAVX2InternalLoopStart() { + Label("sealAVX2InternalLoopStart") + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage1_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage2_AVX2() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage3_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulReduceStage() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: oup}.Offset(4 * 8)) + LEAQ(Mem{Base: oup}.Offset(6*8), oup) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage1_AVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + polyMulStage2_AVX2() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage3_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + DECQ(itr1) + JNE(LabelRef("sealAVX2InternalLoop")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + + Comment("We only hashed 480 of the 512 bytes available - hash the remaining 32 here") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(4*8), oup) + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPERM2I128(Imm(0x13), AA0, BB0, BB0) + VPERM2I128(Imm(0x02), CC0, DD0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + + Comment("and here") + polyAdd(Mem{Base: oup}.Offset(-2 * 8)) + polyMulAVX2() + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + VPXOR(Mem{Base: inp}.Offset(12*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(13*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(14*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(15*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(12*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(13*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(14*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(15*32)) + LEAQ(Mem{Base: inp}.Offset(32*16), inp) + SUBQ(U32(32*16), inl) + CMPQ(inl, U32(512)) + JG(LabelRef("sealAVX2MainLoop")) + + Comment("Tail can only hash 480 bytes") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(32), oup) + + MOVQ(U32(10), itr1) + MOVQ(U32(0), itr2) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealAVX2Tail128")) + CMPQ(inl, U32(256)) + JBE(LabelRef("sealAVX2Tail256")) + CMPQ(inl, U32(384)) + JBE(LabelRef("sealAVX2Tail384")) + JMP(LabelRef("sealAVX2Tail512")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes + +// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks +func seal192AVX2() { + Label("seal192AVX2") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VMOVDQA(DD0, DD2) + VMOVDQA(DD1, TT3) + MOVQ(U32(10), itr2) +} + +func sealAVX2192InnerCipherLoop() { + Label("sealAVX2192InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + DECQ(itr2) + JNE(LabelRef("sealAVX2192InnerCipherLoop")) + VPADDD(AA2, AA0, AA0) + VPADDD(AA2, AA1, AA1) + VPADDD(BB2, BB0, BB0) + VPADDD(BB2, BB1, BB1) + VPADDD(CC2, CC0, CC0) + VPADDD(CC2, CC1, CC1) + VPADDD(DD2, DD0, DD0) + VPADDD(TT3, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 192 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) +} + +func sealAVX2ShortSeal() { + Label("sealAVX2ShortSeal") + Comment("Hash aad") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + XORQ(itr1, itr1) +} + +func sealAVX2SealHash() { + Label("sealAVX2SealHash") + Comment("itr1 holds the number of bytes encrypted but not yet hashed") + CMPQ(itr1, Imm(16)) + JB(LabelRef("sealAVX2ShortSealLoop")) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + SUBQ(Imm(16), itr1) + ADDQ(Imm(16), oup) + JMP(LabelRef("sealAVX2SealHash")) +} + +func sealAVX2ShortSealLoop() { + Label("sealAVX2ShortSealLoop") + CMPQ(inl, Imm(32)) + JB(LabelRef("sealAVX2ShortTail32")) + SUBQ(Imm(32), inl) + + Comment("Load for encryption") + VPXOR(Mem{Base: inp}, AA0, AA0) + VMOVDQU(AA0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*32), inp) + + Comment("Now can hash") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(1*32), oup) + + Comment("Shift stream left") + VMOVDQA(BB0, AA0) + VMOVDQA(CC0, BB0) + VMOVDQA(DD0, CC0) + VMOVDQA(AA1, DD0) + VMOVDQA(BB1, AA1) + VMOVDQA(CC1, BB1) + VMOVDQA(DD1, CC1) + VMOVDQA(AA2, DD1) + VMOVDQA(BB2, AA2) + JMP(LabelRef("sealAVX2ShortSealLoop")) +} + +func sealAVX2ShortTail32() { + Label("sealAVX2ShortTail32") + CMPQ(inl, Imm(16)) + VMOVDQA(A0, A1) + JB(LabelRef("sealAVX2ShortDone")) + + SUBQ(Imm(16), inl) + + Comment("Load for encryption") + VPXOR(Mem{Base: inp}, A0, T0) + VMOVDQU(T0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + + Comment("Hash") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + VPERM2I128(Imm(0x11), AA0, AA0, AA0) + VMOVDQA(A0, A1) +} + +func sealAVX2ShortDone() { + Label("sealAVX2ShortDone") + VZEROUPPER() + JMP(LabelRef("sealSSETail")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes + +// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks +func seal320AVX2() { + Label("seal320AVX2") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(BB0, TT1) + VMOVDQA(CC0, TT2) + VMOVDQA(DD0, TT3) + MOVQ(U32(10), itr2) +} + +func sealAVX2320InnerCipherLoop() { + Label("sealAVX2320InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + DECQ(itr2) + JNE(LabelRef("sealAVX2320InnerCipherLoop")) + + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, TT0) + VPADDD(TT0, AA0, AA0) + VPADDD(TT0, AA1, AA1) + VPADDD(TT0, AA2, AA2) + VPADDD(TT1, BB0, BB0) + VPADDD(TT1, BB1, BB1) + VPADDD(TT1, BB2, BB2) + VPADDD(TT2, CC0, CC0) + VPADDD(TT2, CC1, CC1) + VPADDD(TT2, CC2, CC2) + avx2IncMask := avx2IncMask_DATA() + VMOVDQA(avx2IncMask, TT0) + VPADDD(TT3, DD0, DD0) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD1, DD1) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD2, DD2) + + Comment("Clamp and store poly key") + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 320 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) + VPERM2I128(Imm(0x02), AA2, BB2, CC1) + VPERM2I128(Imm(0x02), CC2, DD2, DD1) + VPERM2I128(Imm(0x13), AA2, BB2, AA2) + VPERM2I128(Imm(0x13), CC2, DD2, BB2) + JMP(LabelRef("sealAVX2ShortSeal")) +} + +// Need to decrypt up to 128 bytes - prepare two blocks: +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed. +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed. +func sealAVX2Tail128() { + Label("sealAVX2Tail128") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VMOVDQA(DD0, DD1) +} + +func sealAVX2Tail128LoopA() { + Label("sealAVX2Tail128LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealAVX2Tail128LoopB() { + Label("sealAVX2Tail128LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(Mem{Base: oup}.Offset(16)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(32), oup) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + DECQ(itr1) + JG(LabelRef("sealAVX2Tail128LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA1) + VPADDD(state1StoreAVX2, BB0, BB1) + VPADDD(state2StoreAVX2, CC0, CC1) + VPADDD(DD1, DD0, DD1) + + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + JMP(LabelRef("sealAVX2ShortSealLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext + +// Need to decrypt up to 256 bytes - prepare two blocks +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed +func sealAVX2Tail256() { + Label("sealAVX2Tail256") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(chacha20Constants, AA1) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(state1StoreAVX2, BB1) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(state2StoreAVX2, CC1) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(DD0, TT1) + VMOVDQA(DD1, TT2) +} + +func sealAVX2Tail256LoopA() { + Label("sealAVX2Tail256LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +// LIne 2493 +func sealAVX2Tail256LoopB() { + Label("sealAVX2Tail256LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(Mem{Base: oup}.Offset(16)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(32), oup) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + DECQ(itr1) + JG(LabelRef("sealAVX2Tail256LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail256LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(TT1, DD0, DD0) + VPADDD(TT2, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + VPERM2I128(Imm(0x02), CC0, DD0, TT1) + VPERM2I128(Imm(0x13), AA0, BB0, TT2) + VPERM2I128(Imm(0x13), CC0, DD0, TT3) + VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32)) + MOVQ(U32(128), itr1) + LEAQ(Mem{Base: inp}.Offset(128), inp) + SUBQ(Imm(128), inl) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + + JMP(LabelRef("sealAVX2SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext + +// Need to decrypt up to 384 bytes - prepare two blocks +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed +func sealAVX2Tail384() { + Label("sealAVX2Tail384") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(DD0, TT1) + VMOVDQA(DD1, TT2) + VMOVDQA(DD2, TT3) +} + +func sealAVX2Tail384LoopA() { + Label("sealAVX2Tail384LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealAVX2Tail384LoopB() { + Label("sealAVX2Tail384LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(Mem{Base: oup}.Offset(16)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(32), oup) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + DECQ(itr1) + JG(LabelRef("sealAVX2Tail384LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail384LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(TT1, DD0, DD0) + VPADDD(TT2, DD1, DD1) + VPADDD(TT3, DD2, DD2) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + VPERM2I128(Imm(0x02), CC0, DD0, TT1) + VPERM2I128(Imm(0x13), AA0, BB0, TT2) + VPERM2I128(Imm(0x13), CC0, DD0, TT3) + VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, TT0) + VPERM2I128(Imm(0x02), CC1, DD1, TT1) + VPERM2I128(Imm(0x13), AA1, BB1, TT2) + VPERM2I128(Imm(0x13), CC1, DD1, TT3) + VPXOR(Mem{Base: inp}.Offset(4*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(5*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(6*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(7*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(7*32)) + MOVQ(U32(256), itr1) + LEAQ(Mem{Base: inp}.Offset(256), inp) + SUBQ(U32(256), inl) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + + JMP(LabelRef("sealAVX2SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext + +// Need to decrypt up to 512 bytes - prepare two blocks +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed +func sealAVX2Tail512() { + Label("sealAVX2Tail512") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) +} + +func sealAVX2Tail512LoopA() { + Label("sealAVX2Tail512LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealAVX2Tail512LoopB() { + Label("sealAVX2Tail512LoopB") + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(4*8), oup) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + + DECQ(itr1) + JG(LabelRef("sealAVX2Tail512LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail512LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VPERM2I128(Imm(0x02), CC0, DD0, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(1*32)) + VPERM2I128(Imm(0x13), AA0, BB0, CC3) + VPXOR(Mem{Base: inp}.Offset(2*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(2*32)) + VPERM2I128(Imm(0x13), CC0, DD0, CC3) + VPXOR(Mem{Base: inp}.Offset(3*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(3*32)) + + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + + MOVQ(U32(384), itr1) + LEAQ(Mem{Base: inp}.Offset(384), inp) + SUBQ(U32(384), inl) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + + JMP(LabelRef("sealAVX2SealHash")) +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +var ( + // Pointers for memoizing DATA section symbols + chacha20Constants_DATA_ptr, + rol16_DATA_ptr, + rol8_DATA_ptr, + sseIncMask_DATA_ptr, + avx2IncMask_DATA_ptr, + avx2InitMask_DATA_ptr, + polyClampMask_DATA_ptr, + andMask_DATA_ptr *Mem +) + +var nothingUpMySleeve = [8]uint32{ + 0x61707865, + 0x3320646e, + 0x79622d32, + 0x6b206574, + 0x61707865, + 0x3320646e, + 0x79622d32, + 0x6b206574, +} + +// ChaCha20 constants +func chacha20Constants_DATA() Mem { + if chacha20Constants_DATA_ptr != nil { + return *chacha20Constants_DATA_ptr + } + + chacha20Constants := GLOBL(ThatPeskyUnicodeDot+"chacha20Constants", NOPTR|RODATA) + chacha20Constants_DATA_ptr = &chacha20Constants + for i, v := range nothingUpMySleeve { + DATA(i*4, U32(v)) + } + return chacha20Constants +} + +var rol16Consts = [4]uint64{ + 0x0504070601000302, + 0x0D0C0F0E09080B0A, + 0x0504070601000302, + 0x0D0C0F0E09080B0A, +} + +// <<< 16 with PSHUFB +func rol16_DATA() Mem { + if rol16_DATA_ptr != nil { + return *rol16_DATA_ptr + } + + rol16 := GLOBL(ThatPeskyUnicodeDot+"rol16", NOPTR|RODATA) + rol16_DATA_ptr = &rol16 + for i, v := range rol16Consts { + DATA(i*8, U64(v)) + } + return rol16 +} + +var rol8Consts = [4]uint64{ + 0x0605040702010003, + 0x0E0D0C0F0A09080B, + 0x0605040702010003, + 0x0E0D0C0F0A09080B, +} + +// <<< 8 with PSHUFB +func rol8_DATA() Mem { + if rol8_DATA_ptr != nil { + return *rol8_DATA_ptr + } + + rol8 := GLOBL(ThatPeskyUnicodeDot+"rol8", NOPTR|RODATA) + rol8_DATA_ptr = &rol8 + for i, v := range rol8Consts { + DATA(i*8, U64(v)) + } + return rol8 +} + +var avx2InitMaskConsts = [4]uint64{ + 0x0, + 0x0, + 0x1, + 0x0, +} + +func avx2InitMask_DATA() Mem { + if avx2InitMask_DATA_ptr != nil { + return *avx2InitMask_DATA_ptr + } + + avx2InitMask := GLOBL(ThatPeskyUnicodeDot+"avx2InitMask", NOPTR|RODATA) + avx2InitMask_DATA_ptr = &avx2InitMask + for i, v := range avx2InitMaskConsts { + DATA(i*8, U64(v)) + } + return avx2InitMask +} + +var avx2IncMaskConsts = [4]uint64{ + 0x2, + 0x0, + 0x2, + 0x0, +} + +func avx2IncMask_DATA() Mem { + if avx2IncMask_DATA_ptr != nil { + return *avx2IncMask_DATA_ptr + } + + avx2IncMask := GLOBL(ThatPeskyUnicodeDot+"avx2IncMask", NOPTR|RODATA) + avx2IncMask_DATA_ptr = &avx2IncMask + for i, v := range avx2IncMaskConsts { + DATA(i*8, U64(v)) + } + return avx2IncMask +} + +var polyClampMaskConsts = [4]uint64{ + 0x0FFFFFFC0FFFFFFF, + 0x0FFFFFFC0FFFFFFC, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, +} + +// Poly1305 key clamp +func polyClampMask_DATA() Mem { + if polyClampMask_DATA_ptr != nil { + return *polyClampMask_DATA_ptr + } + + polyClampMask := GLOBL(ThatPeskyUnicodeDot+"polyClampMask", NOPTR|RODATA) + polyClampMask_DATA_ptr = &polyClampMask + for i, v := range polyClampMaskConsts { + DATA(i*8, U64(v)) + } + return polyClampMask +} + +var sseIncMaskConsts = [2]uint64{ + 0x1, + 0x0, +} + +func sseIncMask_DATA() Mem { + if sseIncMask_DATA_ptr != nil { + return *sseIncMask_DATA_ptr + } + + sseIncMask := GLOBL(ThatPeskyUnicodeDot+"sseIncMask", NOPTR|RODATA) + sseIncMask_DATA_ptr = &sseIncMask + for i, v := range sseIncMaskConsts { + DATA(i*8, U64(v)) + } + return sseIncMask +} + +var andMaskConsts = [30]uint64{ + 0x00000000000000ff, + 0x0000000000000000, + 0x000000000000ffff, + 0x0000000000000000, + 0x0000000000ffffff, + 0x0000000000000000, + 0x00000000ffffffff, + 0x0000000000000000, + 0x000000ffffffffff, + 0x0000000000000000, + 0x0000ffffffffffff, + 0x0000000000000000, + 0x00ffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x00000000000000ff, + 0xffffffffffffffff, + 0x000000000000ffff, + 0xffffffffffffffff, + 0x0000000000ffffff, + 0xffffffffffffffff, + 0x00000000ffffffff, + 0xffffffffffffffff, + 0x000000ffffffffff, + 0xffffffffffffffff, + 0x0000ffffffffffff, + 0xffffffffffffffff, + 0x00ffffffffffffff, +} + +func andMask_DATA() Mem { + if andMask_DATA_ptr != nil { + return *andMask_DATA_ptr + } + + andMask := GLOBL(ThatPeskyUnicodeDot+"andMask", NOPTR|RODATA) + andMask_DATA_ptr = &andMask + for i, v := range andMaskConsts { + DATA(i*8, U64(v)) + } + return andMask +} + +// removePeskyUnicodeDot strips the dot from the relevant TEXT directives such that they +// can exist as internal assembly functions +// +// Avo v0.6.0 does not support the generation of internal assembly functions. Go's unicode +// dot tells the compiler to link a TEXT symbol to a function in the current Go package +// (or another package if specified). Avo unconditionally prepends the unicode dot to all +// TEXT symbols, making it impossible to emit an internal function without this hack. +// +// There is a pending PR to add internal functions to Avo: +// https://github.com/mmcloughlin/avo/pull/443 +// +// If merged it should allow the usage of InternalFunction("NAME") for the specified functions +func removePeskyUnicodeDot(internalFunctions []string, target string) { + bytes, err := os.ReadFile(target) + if err != nil { + panic(err) + } + + content := string(bytes) + + for _, from := range internalFunctions { + to := strings.ReplaceAll(from, ThatPeskyUnicodeDot, "") + content = strings.ReplaceAll(content, from, to) + } + + err = os.WriteFile(target, []byte(content), 0644) + if err != nil { + panic(err) + } +} diff --git a/chacha20poly1305/_asm/go.mod b/chacha20poly1305/_asm/go.mod new file mode 100644 index 0000000000..957baf2a64 --- /dev/null +++ b/chacha20poly1305/_asm/go.mod @@ -0,0 +1,15 @@ +module chacha20poly1305/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/chacha20poly1305/_asm/go.sum b/chacha20poly1305/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/chacha20poly1305/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/chacha20poly1305/chacha20poly1305_amd64.s b/chacha20poly1305/chacha20poly1305_amd64.s index 731d2ac6db..fd5ee845f9 100644 --- a/chacha20poly1305/chacha20poly1305_amd64.s +++ b/chacha20poly1305/chacha20poly1305_amd64.s @@ -1,2715 +1,9762 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. +// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT. //go:build gc && !purego #include "textflag.h" -// General register allocation -#define oup DI -#define inp SI -#define inl BX -#define adp CX // free to reuse, after we hash the additional data -#define keyp R8 // free to reuse, when we copy the key to stack -#define itr2 R9 // general iterator -#define itr1 CX // general iterator -#define acc0 R10 -#define acc1 R11 -#define acc2 R12 -#define t0 R13 -#define t1 R14 -#define t2 R15 -#define t3 R8 -// Register and stack allocation for the SSE code -#define rStore (0*16)(BP) -#define sStore (1*16)(BP) -#define state1Store (2*16)(BP) -#define state2Store (3*16)(BP) -#define tmpStore (4*16)(BP) -#define ctr0Store (5*16)(BP) -#define ctr1Store (6*16)(BP) -#define ctr2Store (7*16)(BP) -#define ctr3Store (8*16)(BP) -#define A0 X0 -#define A1 X1 -#define A2 X2 -#define B0 X3 -#define B1 X4 -#define B2 X5 -#define C0 X6 -#define C1 X7 -#define C2 X8 -#define D0 X9 -#define D1 X10 -#define D2 X11 -#define T0 X12 -#define T1 X13 -#define T2 X14 -#define T3 X15 -#define A3 T0 -#define B3 T1 -#define C3 T2 -#define D3 T3 -// Register and stack allocation for the AVX2 code -#define rsStoreAVX2 (0*32)(BP) -#define state1StoreAVX2 (1*32)(BP) -#define state2StoreAVX2 (2*32)(BP) -#define ctr0StoreAVX2 (3*32)(BP) -#define ctr1StoreAVX2 (4*32)(BP) -#define ctr2StoreAVX2 (5*32)(BP) -#define ctr3StoreAVX2 (6*32)(BP) -#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack -#define AA0 Y0 -#define AA1 Y5 -#define AA2 Y6 -#define AA3 Y7 -#define BB0 Y14 -#define BB1 Y9 -#define BB2 Y10 -#define BB3 Y11 -#define CC0 Y12 -#define CC1 Y13 -#define CC2 Y8 -#define CC3 Y15 -#define DD0 Y4 -#define DD1 Y1 -#define DD2 Y2 -#define DD3 Y3 -#define TT0 DD3 -#define TT1 AA3 -#define TT2 BB3 -#define TT3 CC3 -// ChaCha20 constants -DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 -DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 -// <<< 16 with PSHUFB -DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A -// <<< 8 with PSHUFB -DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B - -DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 -DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 - -DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 -DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 -// Poly1305 key clamp -DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF - -DATA ·sseIncMask<>+0x00(SB)/8, $0x1 -DATA ·sseIncMask<>+0x08(SB)/8, $0x0 -// To load/store the last < 16 bytes in a buffer -DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff -DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff - -GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 -GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 -GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 -// No PALIGNR in Go ASM yet (but VPALIGNR is present). -#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 -#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 -#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 -#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 -#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 -#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 -#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 -#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 -#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 -#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 -#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 -#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 -#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 -#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 -#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 -#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 -#define shiftC0Right shiftC0Left -#define shiftC1Right shiftC1Left -#define shiftC2Right shiftC2Left -#define shiftC3Right shiftC3Left -#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 -#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 -#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 -#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 - -// Some macros - -// ROL rotates the uint32s in register R left by N bits, using temporary T. -#define ROL(N, R, T) \ - MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R - -// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL16(R, T) PSHUFB ·rol16<>(SB), R -#else -#define ROL16(R, T) ROL(16, R, T) -#endif - -// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL8(R, T) PSHUFB ·rol8<>(SB), R -#else -#define ROL8(R, T) ROL(8, R, T) -#endif - -#define chachaQR(A, B, C, D, T) \ - PADDD B, A; PXOR A, D; ROL16(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ - PADDD B, A; PXOR A, D; ROL8(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B - -#define chachaQR_AVX2(A, B, C, D, T) \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B - -#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 -#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX -#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 -#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 - -#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 -#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 - -#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage -#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage -// ---------------------------------------------------------------------------- + +// func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 - // adp points to beginning of additional data - // itr2 holds ad length - XORQ acc0, acc0 - XORQ acc1, acc1 - XORQ acc2, acc2 - CMPQ itr2, $13 - JNE hashADLoop - -openFastTLSAD: - // Special treatment for the TLS case of 13 bytes - MOVQ (adp), acc0 - MOVQ 5(adp), acc1 - SHRQ $24, acc1 - MOVQ $1, acc2 - polyMul + // Hack: Must declare #define macros inside of a function due to Avo constraints + // ROL rotates the uint32s in register R left by N bits, using temporary T. + #define ROL(N, R, T) \ + MOVO R, T; \ + PSLLL $(N), T; \ + PSRLL $(32-(N)), R; \ + PXOR T, R + + // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL8(R, T) PSHUFB ·rol8<>(SB), R + #else + #define ROL8(R, T) ROL(8, R, T) + #endif + + // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL16(R, T) PSHUFB ·rol16<>(SB), R + #else + #define ROL16(R, T) ROL(16, R, T) + #endif + XORQ R10, R10 + XORQ R11, R11 + XORQ R12, R12 + CMPQ R9, $0x0d + JNE hashADLoop + MOVQ (CX), R10 + MOVQ 5(CX), R11 + SHRQ $0x18, R11 + MOVQ $0x00000001, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 RET hashADLoop: // Hash in 16 byte chunks - CMPQ itr2, $16 - JB hashADTail - polyAdd(0(adp)) - LEAQ (1*16)(adp), adp - SUBQ $16, itr2 - polyMul - JMP hashADLoop + CMPQ R9, $0x10 + JB hashADTail + ADDQ (CX), R10 + ADCQ 8(CX), R11 + ADCQ $0x01, R12 + LEAQ 16(CX), CX + SUBQ $0x10, R9 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + JMP hashADLoop hashADTail: - CMPQ itr2, $0 + CMPQ R9, $0x00 JE hashADDone // Hash last < 16 byte tail - XORQ t0, t0 - XORQ t1, t1 - XORQ t2, t2 - ADDQ itr2, adp + XORQ R13, R13 + XORQ R14, R14 + XORQ R15, R15 + ADDQ R9, CX hashADTailLoop: - SHLQ $8, t0, t1 - SHLQ $8, t0 - MOVB -1(adp), t2 - XORQ t2, t0 - DECQ adp - DECQ itr2 - JNE hashADTailLoop - -hashADTailFinish: - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - // Finished AD + SHLQ $0x08, R13, R14 + SHLQ $0x08, R13 + MOVB -1(CX), R15 + XORQ R15, R13 + DECQ CX + DECQ R9 + JNE hashADTailLoop + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + hashADDone: RET -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Open(dst, key, src, ad []byte) bool -TEXT ·chacha20Poly1305Open(SB), 0, $288-97 +// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX // Check for AVX2 support - CMPB ·useAVX2(SB), $1 + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Open_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE openSSE128 // About 16% faster + CMPQ BX, $0x80 + JBE openSSE128 // For long buffers, prepare the poly key first - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 - MOVO D0, T1 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X9, X13 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store - MOVO D0, ctr3Store - MOVQ $10, itr2 + MOVO X3, 32(BP) + MOVO X6, 48(BP) + MOVO X9, 128(BP) + MOVQ $0x0000000a, R9 openSSEPreparePolyKey: - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - DECQ itr2 - JNE openSSEPreparePolyKey + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + DECQ R9 + JNE openSSEPreparePolyKey // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore; MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSEMainLoop: - CMPQ inl, $256 + CMPQ BX, $0x00000100 JB openSSEMainLoopDone // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) - // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 - MOVQ $4, itr1 - MOVQ inp, itr2 + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash + // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $0x00000004, CX + MOVQ SI, R9 openSSEInternalLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(itr2)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(itr2), itr2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr1 - JGE openSSEInternalLoop - - polyAdd(0(itr2)) - polyMul - LEAQ (2*8)(itr2), itr2 - - CMPQ itr1, $-6 - JG openSSEInternalLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(R9), R9 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ CX + JGE openSSEInternalLoop + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + CMPQ CX, $-6 + JG openSSEInternalLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Load - xor - store - MOVO D3, tmpStore - MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) - MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) - MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) - MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) - MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) - MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) - MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) - MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) - MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) - MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) - MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) - MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) - MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) - MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) - LEAQ 256(inp), inp - LEAQ 256(oup), oup - SUBQ $256, inl + MOVO X15, 64(BP) + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU X0, (DI) + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU X3, 16(DI) + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU X6, 32(DI) + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X9, 48(DI) + MOVOU 64(SI), X9 + PXOR X9, X1 + MOVOU X1, 64(DI) + MOVOU 80(SI), X9 + PXOR X9, X4 + MOVOU X4, 80(DI) + MOVOU 96(SI), X9 + PXOR X9, X7 + MOVOU X7, 96(DI) + MOVOU 112(SI), X9 + PXOR X9, X10 + MOVOU X10, 112(DI) + MOVOU 128(SI), X9 + PXOR X9, X2 + MOVOU X2, 128(DI) + MOVOU 144(SI), X9 + PXOR X9, X5 + MOVOU X5, 144(DI) + MOVOU 160(SI), X9 + PXOR X9, X8 + MOVOU X8, 160(DI) + MOVOU 176(SI), X9 + PXOR X9, X11 + MOVOU X11, 176(DI) + MOVOU 192(SI), X9 + PXOR X9, X12 + MOVOU X12, 192(DI) + MOVOU 208(SI), X9 + PXOR X9, X13 + MOVOU X13, 208(DI) + MOVOU 224(SI), X9 + PXOR X9, X14 + MOVOU X14, 224(DI) + MOVOU 240(SI), X9 + PXOR 64(BP), X9 + MOVOU X9, 240(DI) + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openSSEMainLoop openSSEMainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $64 + CMPQ BX, $0x40 JBE openSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openSSETail128 - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openSSETail192 JMP openSSETail256 openSSEFinalize: // Hash in the PT, AAD lengths - ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally, constant time compare to the tag at the end of the message XORQ AX, AX - MOVQ $1, DX - XORQ (0*8)(inp), acc0 - XORQ (1*8)(inp), acc1 - ORQ acc1, acc0 + MOVQ $0x00000001, DX + XORQ (SI), R10 + XORQ 8(SI), R11 + ORQ R11, R10 CMOVQEQ DX, AX // Return true iff tags are equal MOVB AX, ret+96(FP) RET -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 129 bytes openSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 openSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE openSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE openSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore; MOVOU B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSE128Open: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail16 - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0(inp)) + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 // Load for decryption - MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - polyMul + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP openSSE128Open openSSETail16: - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize // We can safely load the CT from the end, because it is padded with the MAC - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVOU (inp), T0 - ADDQ inl, inp - PAND -16(t0)(itr2*1), T0 - MOVO T0, 0+tmpStore - MOVQ T0, t0 - MOVQ 8+tmpStore, t1 - PXOR A1, T0 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVOU (SI), X12 + ADDQ BX, SI + PAND -16(R13)(R9*1), X12 + MOVO X12, 64(BP) + MOVQ X12, R13 + MOVQ 72(BP), R14 + PXOR X1, X12 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes openSSETail16Store: - MOVQ T0, t3 - MOVB t3, (oup) - PSRLDQ $1, T0 - INCQ oup - DECQ inl + MOVQ X12, R8 + MOVB R8, (DI) + PSRLDQ $0x01, X12 + INCQ DI + DECQ BX JNE openSSETail16Store - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 JMP openSSEFinalize -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of ciphertext openSSETail64: - // Need to decrypt up to 64 bytes - prepare single block - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - XORQ itr2, itr2 - MOVQ inl, itr1 - CMPQ itr1, $16 - JB openSSETail64LoopB + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + XORQ R9, R9 + MOVQ BX, CX + CMPQ CX, $0x10 + JB openSSETail64LoopB openSSETail64LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul - SUBQ $16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX openSSETail64LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - - CMPQ itr1, $16 - JAE openSSETail64LoopA - - CMPQ itr2, $160 - JNE openSSETail64LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + CMPQ CX, $0x10 + JAE openSSETail64LoopA + CMPQ R9, $0xa0 + JNE openSSETail64LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 + PADDL 48(BP), X6 + PADDL 80(BP), X9 openSSETail64DecLoop: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail64DecLoopDone - SUBQ $16, inl - MOVOU (inp), T0 - PXOR T0, A0 - MOVOU A0, (oup) - LEAQ 16(inp), inp - LEAQ 16(oup), oup - MOVO B0, A0 - MOVO C0, B0 - MOVO D0, C0 + SUBQ $0x10, BX + MOVOU (SI), X12 + PXOR X12, X0 + MOVOU X0, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVO X3, X0 + MOVO X6, X3 + MOVO X9, X6 JMP openSSETail64DecLoop openSSETail64DecLoopDone: - MOVO A0, A1 + MOVO X0, X1 JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openSSETail128: - // Need to decrypt up to 128 bytes - prepare two blocks - MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 96(BP) + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX openSSETail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSETail128LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - CMPQ itr2, itr1 - JB openSSETail128LoopA - - CMPQ itr2, $160 - JNE openSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr1Store, D0; PADDL ctr0Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - - SUBQ $64, inl - LEAQ 64(inp), inp - LEAQ 64(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of ciphertext + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + CMPQ R9, CX + JB openSSETail128LoopA + CMPQ R9, $0xa0 + JNE openSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 96(BP), X9 + PADDL 80(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + SUBQ $0x40, BX + LEAQ 64(SI), SI + LEAQ 64(DI), DI + JMP openSSETail64DecLoop + openSSETail192: - // Need to decrypt up to 192 bytes - prepare three blocks - MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store - MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store - - MOVQ inl, itr1 - MOVQ $160, itr2 - CMPQ itr1, $160 - CMOVQGT itr2, itr1 - ANDQ $-16, itr1 - XORQ itr2, itr2 + MOVO ·chacha20Constants<>+0(SB), X2 + MOVO 32(BP), X5 + MOVO 48(BP), X8 + MOVO 128(BP), X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 80(BP) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 112(BP) + MOVQ BX, CX + MOVQ $0x000000a0, R9 + CMPQ CX, $0xa0 + CMOVQGT R9, CX + ANDQ $-16, CX + XORQ R9, R9 openSSLTail192LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - CMPQ itr2, itr1 - JB openSSLTail192LoopA - - CMPQ itr2, $160 - JNE openSSLTail192LoopB - - CMPQ inl, $176 - JB openSSLTail192Store - - polyAdd(160(inp)) - polyMul - - CMPQ inl, $192 - JB openSSLTail192Store - - polyAdd(176(inp)) - polyMul + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + CMPQ R9, CX + JB openSSLTail192LoopA + CMPQ R9, $0xa0 + JNE openSSLTail192LoopB + CMPQ BX, $0xb0 + JB openSSLTail192Store + ADDQ 160(SI), R10 + ADCQ 168(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + CMPQ BX, $0xc0 + JB openSSLTail192Store + ADDQ 176(SI), R10 + ADCQ 184(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192Store: - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 - MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) - - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - SUBQ $128, inl - LEAQ 128(inp), inp - LEAQ 128(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 112(BP), X9 + PADDL 96(BP), X10 + PADDL 80(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X2 + PXOR X13, X5 + PXOR X14, X8 + PXOR X15, X11 + MOVOU X2, (DI) + MOVOU X5, 16(DI) + MOVOU X8, 32(DI) + MOVOU X11, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + LEAQ 128(DI), DI + JMP openSSETail64DecLoop + openSSETail256: - // Need to decrypt up to 256 bytes - prepare four blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - XORQ itr2, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + XORQ R9, R9 openSSETail256Loop: - // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication - polyAdd(0(inp)(itr2*1)) - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulStage3 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - ADDQ $2*8, itr2 - CMPQ itr2, $160 - JB openSSETail256Loop - MOVQ inl, itr1 - ANDQ $-16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + ADDQ $0x10, R9 + CMPQ R9, $0xa0 + JB openSSETail256Loop + MOVQ BX, CX + ANDQ $-16, CX openSSETail256HashLoop: - polyAdd(0(inp)(itr2*1)) - polyMul - ADDQ $2*8, itr2 - CMPQ itr2, itr1 - JB openSSETail256HashLoop + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, R9 + CMPQ R9, CX + JB openSSETail256HashLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - LEAQ 192(inp), inp - LEAQ 192(oup), oup - SUBQ $192, inl - MOVO A3, A0 - MOVO B3, B0 - MOVO C3, C0 - MOVO tmpStore, D0 - - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + LEAQ 192(SI), SI + LEAQ 192(DI), DI + SUBQ $0xc0, BX + MOVO X12, X0 + MOVO X13, X3 + MOVO X14, X6 + MOVO 64(BP), X9 + JMP openSSETail64DecLoop + chacha20Poly1305Open_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openAVX2192 - CMPQ inl, $320 + CMPQ BX, $0x00000140 JBE openAVX2320 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, state2StoreAVX2 - VMOVDQA DD0, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, 64(BP) + VMOVDQA Y4, 192(BP) + MOVQ $0x0000000a, R9 openAVX2PreparePolyKey: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - DECQ itr2 - JNE openAVX2PreparePolyKey - - VPADDD ·chacha20Constants<>(SB), AA0, AA0 - VPADDD state1StoreAVX2, BB0, BB0 - VPADDD state2StoreAVX2, CC0, CC0 - VPADDD ctr3StoreAVX2, DD0, DD0 - - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ R9 + JNE openAVX2PreparePolyKey + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD 32(BP), Y14, Y14 + VPADDD 64(BP), Y12, Y12 + VPADDD 192(BP), Y4, Y4 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for the first 64 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 // Hash AD + first 64 bytes - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX openAVX2InitialHash64: - polyAdd(0(inp)(itr1*1)) - polyMulAVX2 - ADDQ $16, itr1 - CMPQ itr1, $64 - JNE openAVX2InitialHash64 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, CX + CMPQ CX, $0x40 + JNE openAVX2InitialHash64 // Decrypt the first 64 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), BB0, BB0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU BB0, (1*32)(oup) - LEAQ (2*32)(inp), inp - LEAQ (2*32)(oup), oup - SUBQ $64, inl + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VMOVDQU Y0, (DI) + VMOVDQU Y14, 32(DI) + LEAQ 64(SI), SI + LEAQ 64(DI), DI + SUBQ $0x40, BX openAVX2MainLoop: - CMPQ inl, $512 + CMPQ BX, $0x00000200 JB openAVX2MainLoopDone // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX openAVX2InternalLoop: - // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications - // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext - polyAdd(0*8(inp)(itr1*1)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(inp)(itr1*1)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(inp)(itr1*1)) - LEAQ (6*8)(itr1), itr1 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - CMPQ itr1, $480 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(SI)(CX*1), R10 + ADCQ 24(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(SI)(CX*1), R10 + ADCQ 40(SI)(CX*1), R11 + ADCQ $0x01, R12 + LEAQ 48(CX), CX + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + CMPQ CX, $0x000001e0 JNE openAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(480(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ 480(SI), R10 + ADCQ 488(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(496(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - LEAQ (32*16)(oup), oup - SUBQ $(32*16), inl + ADDQ 496(SI), R10 + ADCQ 504(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + LEAQ 512(DI), DI + SUBQ $0x00000200, BX JMP openAVX2MainLoop openAVX2MainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openAVX2Tail128 - CMPQ inl, $256 + CMPQ BX, $0x00000100 JBE openAVX2Tail256 - CMPQ inl, $384 + CMPQ BX, $0x00000180 JBE openAVX2Tail384 JMP openAVX2Tail512 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes openAVX2192: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 openAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE openAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 openAVX2ShortOpen: // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openAVX2ShortOpenLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 - polyAdd(2*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(SI), R10 + ADCQ 24(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP openAVX2ShortOpenLoop openAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2ShortDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes openAVX2320: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 openAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE openAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP openAVX2ShortOpen -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD1 - VMOVDQA DD1, DD0 - - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 - TESTQ itr1, itr1 - JE openAVX2Tail128LoopB + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 + VMOVDQA Y1, Y4 + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX + TESTQ CX, CX + JE openAVX2Tail128LoopB openAVX2Tail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMulAVX2 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openAVX2Tail128LoopB: - ADDQ $16, itr2 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 - JB openAVX2Tail128LoopA - CMPQ itr2, $160 - JNE openAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC1, CC1 - VPADDD DD0, DD1, DD1 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + ADDQ $0x10, R9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail128LoopA + CMPQ R9, $0xa0 + JNE openAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y13, Y13 + VPADDD Y4, Y1, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 openAVX2TailLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2Tail - SUBQ $32, inl + SUBQ $0x20, BX // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 JMP openAVX2TailLoop openAVX2Tail: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2TailDone - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2TailDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext openAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare four blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 // Compute the number of iterations that will hash data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $128, itr1 - SHRQ $4, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x80, CX + SHRQ $0x04, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 openAVX2Tail256LoopA: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX - // Perform ChaCha rounds, while hashing the remaining input openAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX JB openAVX2Tail256LoopA + CMPQ R9, $0x0a + JNE openAVX2Tail256LoopB + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX - CMPQ itr2, $10 - JNE openAVX2Tail256LoopB - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl - - // Hash the remainder of data (if any) openAVX2Tail256Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail256HashEnd - polyAdd (0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail256Hash - -// Store 128 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail256HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail256Hash + openAVX2Tail256HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - - VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 - VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) - LEAQ (4*32)(inp), inp - LEAQ (4*32)(oup), oup - SUBQ $4*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y6 + VPERM2I128 $0x02, Y12, Y4, Y10 + VPERM2I128 $0x13, Y0, Y14, Y8 + VPERM2I128 $0x13, Y12, Y4, Y2 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR (SI), Y6, Y6 + VPXOR 32(SI), Y10, Y10 + VPXOR 64(SI), Y8, Y8 + VPXOR 96(SI), Y2, Y2 + VMOVDQU Y6, (DI) + VMOVDQU Y10, 32(DI) + VMOVDQU Y8, 64(DI) + VMOVDQU Y2, 96(DI) + LEAQ 128(SI), SI + LEAQ 128(DI), DI + SUBQ $0x80, BX + JMP openAVX2TailLoop + openAVX2Tail384: // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, ctr0StoreAVX2 - VMOVDQA DD1, ctr1StoreAVX2 - VMOVDQA DD2, ctr2StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) // Compute the number of iterations that will hash two blocks of data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $256, itr1 - SHRQ $4, itr1 - ADDQ $6, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 - - // Perform ChaCha rounds, while hashing the remaining input + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x00000100, CX + SHRQ $0x04, CX + ADDQ $0x06, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + openAVX2Tail384LoopB: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX openAVX2Tail384LoopA: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - - CMPQ itr2, itr1 - JB openAVX2Tail384LoopB - - CMPQ itr2, $10 - JNE openAVX2Tail384LoopA - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + CMPQ R9, CX + JB openAVX2Tail384LoopB + CMPQ R9, $0x0a + JNE openAVX2Tail384LoopA + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX openAVX2Tail384Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail384HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail384Hash - -// Store 256 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail384HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail384Hash + openAVX2Tail384HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - LEAQ (8*32)(inp), inp - LEAQ (8*32)(oup), oup - SUBQ $8*32, inl + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openAVX2TailLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext openAVX2Tail512: - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 - MOVQ inp, itr2 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX + MOVQ SI, R9 openAVX2Tail512LoopB: - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ (2*8)(itr2), itr2 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 openAVX2Tail512LoopA: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(itr2)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(itr2)) - polyMulAVX2 - LEAQ (4*8)(itr2), itr2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - INCQ itr1 - CMPQ itr1, $4 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(R9), R10 + ADCQ 24(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(R9), R9 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + INCQ CX + CMPQ CX, $0x04 JLT openAVX2Tail512LoopB - - CMPQ itr1, $10 - JNE openAVX2Tail512LoopA - - MOVQ inl, itr1 - SUBQ $384, itr1 - ANDQ $-16, itr1 + CMPQ CX, $0x0a + JNE openAVX2Tail512LoopA + MOVQ BX, CX + SUBQ $0x00000180, CX + ANDQ $-16, CX openAVX2Tail512HashLoop: - TESTQ itr1, itr1 + TESTQ CX, CX JE openAVX2Tail512HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - SUBQ $16, itr1 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + SUBQ $0x10, CX JMP openAVX2Tail512HashLoop openAVX2Tail512HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - LEAQ (12*32)(inp), inp - LEAQ (12*32)(oup), oup - SUBQ $12*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Seal(dst, key, src, ad []byte) -TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 - // For aligned stack access + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + LEAQ 384(SI), SI + LEAQ 384(DI), DI + SUBQ $0x00000180, BX + JMP openAVX2TailLoop + +DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 +GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 + +DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff +DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc +DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff +DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff +GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 + +DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 +DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 +GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 + +DATA ·andMask<>+0(SB)/8, $0x00000000000000ff +DATA ·andMask<>+8(SB)/8, $0x0000000000000000 +DATA ·andMask<>+16(SB)/8, $0x000000000000ffff +DATA ·andMask<>+24(SB)/8, $0x0000000000000000 +DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+40(SB)/8, $0x0000000000000000 +DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+56(SB)/8, $0x0000000000000000 +DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+72(SB)/8, $0x0000000000000000 +DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+88(SB)/8, $0x0000000000000000 +DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+104(SB)/8, $0x0000000000000000 +DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+120(SB)/8, $0x0000000000000000 +DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+136(SB)/8, $0x00000000000000ff +DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+152(SB)/8, $0x000000000000ffff +DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 + +DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 +DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 + +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +DATA ·rol16<>+16(SB)/8, $0x0504070601000302 +DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a +GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 + +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b +DATA ·rol8<>+16(SB)/8, $0x0605040702010003 +DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b +GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 + +DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + +// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Seal(SB), $288-96 MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp - - CMPB ·useAVX2(SB), $1 + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Seal_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE sealSSE128 // About 15% faster + CMPQ BX, $0x80 + JBE sealSSE128 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store + MOVO X3, 32(BP) + MOVO X6, 48(BP) // Load state, increment counter blocks - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - MOVQ $10, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + MOVQ $0x0000000a, R9 sealSSEIntroLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JNE sealSSEIntroLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JNE sealSSEIntroLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore - MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 - CALL polyHashADInternal<>(SB) - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) - - MOVQ $128, itr1 - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 - - CMPQ inl, $64 - JBE sealSSE128SealHash - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) - - ADDQ $64, itr1 - SUBQ $64, inl - LEAQ 64(inp), inp - - MOVQ $2, itr1 - MOVQ $8, itr2 - - CMPQ inl, $64 - JBE sealSSETail64 - CMPQ inl, $128 - JBE sealSSETail128 - CMPQ inl, $192 - JBE sealSSETail192 + MOVQ ad_len+80(FP), R9 + CALL polyHashADInternal<>(SB) + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 64(DI) + MOVOU X5, 80(DI) + MOVOU X8, 96(DI) + MOVOU X11, 112(DI) + MOVQ $0x00000080, CX + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 + JBE sealSSE128SealHash + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 128(DI) + MOVOU X13, 144(DI) + MOVOU X14, 160(DI) + MOVOU X15, 176(DI) + ADDQ $0x40, CX + SUBQ $0x40, BX + LEAQ 64(SI), SI + MOVQ $0x00000002, CX + MOVQ $0x00000008, R9 + CMPQ BX, $0x40 + JBE sealSSETail64 + CMPQ BX, $0x80 + JBE sealSSETail128 + CMPQ BX, $0xc0 + JBE sealSSETail192 sealSSEMainLoop: // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) sealSSEInnerLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(oup)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(oup), oup - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JGE sealSSEInnerLoop - polyAdd(0(oup)) - polyMul - LEAQ (2*8)(oup), oup - DECQ itr1 - JG sealSSEInnerLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(DI), DI + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JGE sealSSEInnerLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSEInnerLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVO tmpStore, D3 - - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - ADDQ $192, inp - MOVQ $192, itr1 - SUBQ $192, inl - MOVO A3, A1 - MOVO B3, B1 - MOVO C3, C1 - MOVO D3, D1 - CMPQ inl, $64 + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVO 64(BP), X15 + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + ADDQ $0xc0, SI + MOVQ $0x000000c0, CX + SUBQ $0xc0, BX + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 JBE sealSSE128SealHash - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) - LEAQ 64(inp), inp - SUBQ $64, inl - MOVQ $6, itr1 - MOVQ $4, itr2 - CMPQ inl, $192 + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + LEAQ 64(SI), SI + SUBQ $0x40, BX + MOVQ $0x00000006, CX + MOVQ $0x00000004, R9 + CMPQ BX, $0xc0 JG sealSSEMainLoop - - MOVQ inl, itr1 - TESTQ inl, inl + MOVQ BX, CX + TESTQ BX, BX JE sealSSE128SealHash - MOVQ $6, itr1 - CMPQ inl, $64 + MOVQ $0x00000006, CX + CMPQ BX, $0x40 JBE sealSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE sealSSETail128 JMP sealSSETail192 -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of plaintext sealSSETail64: - // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A1 - MOVO state1Store, B1 - MOVO state2Store, C1 - MOVO ctr3Store, D1 - PADDL ·sseIncMask<>(SB), D1 - MOVO D1, ctr0Store + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) sealSSETail64LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail64LoopB: - chachaQR(A1, B1, C1, D1, T1) - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A1, B1, C1, D1, T1) - shiftB1Right; shiftC1Right; shiftD1Right - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - DECQ itr1 - JG sealSSETail64LoopA - - DECQ itr2 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSETail64LoopA + DECQ R9 JGE sealSSETail64LoopB - PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B1 - PADDL state2Store, C1 - PADDL ctr0Store, D1 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X4 + PADDL 48(BP), X7 + PADDL 80(BP), X10 + JMP sealSSE128Seal - JMP sealSSE128Seal - -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of plaintext sealSSETail128: - // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) sealSSETail128LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail128LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - DECQ itr1 - JG sealSSETail128LoopA - - DECQ itr2 - JGE sealSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr0Store, D0; PADDL ctr1Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - - MOVQ $64, itr1 - LEAQ 64(inp), inp - SUBQ $64, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of plaintext + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + DECQ CX + JG sealSSETail128LoopA + DECQ R9 + JGE sealSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVQ $0x00000040, CX + LEAQ 64(SI), SI + SUBQ $0x40, BX + JMP sealSSE128SealHash + sealSSETail192: - // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 112(BP) sealSSETail192LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail192LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - DECQ itr1 - JG sealSSETail192LoopA - - DECQ itr2 - JGE sealSSETail192LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - MOVO A2, A1 - MOVO B2, B1 - MOVO C2, C1 - MOVO D2, D1 - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special seal optimization for buffers smaller than 129 bytes + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ CX + JG sealSSETail192LoopA + DECQ R9 + JGE sealSSETail192LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + PADDL 112(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + JMP sealSSE128SealHash + sealSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 sealSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE sealSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE sealSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore - MOVOU B0, sStore + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealSSE128SealHash: - // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealSSE128Seal - polyAdd(0(oup)) - polyMul - - SUBQ $16, itr1 - ADDQ $16, oup - - JMP sealSSE128SealHash + CMPQ CX, $0x10 + JB sealSSE128Seal + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealSSE128SealHash sealSSE128Seal: - CMPQ inl, $16 + CMPQ BX, $0x10 JB sealSSETail - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - MOVOU (inp), T0 - PXOR T0, A1 - MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI // Extract for hashing - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP sealSSE128Seal sealSSETail: - TESTQ inl, inl + TESTQ BX, BX JE sealSSEFinalize // We can only load the PT one byte at a time to avoid read after end of buffer - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVQ inl, itr1 - LEAQ -1(inp)(inl*1), inp - XORQ t2, t2 - XORQ t3, t3 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVQ BX, CX + LEAQ -1(SI)(BX*1), SI + XORQ R15, R15 + XORQ R8, R8 XORQ AX, AX sealSSETailLoadLoop: - SHLQ $8, t2, t3 - SHLQ $8, t2 - MOVB (inp), AX - XORQ AX, t2 - LEAQ -1(inp), inp - DECQ itr1 + SHLQ $0x08, R15, R8 + SHLQ $0x08, R15 + MOVB (SI), AX + XORQ AX, R15 + LEAQ -1(SI), SI + DECQ CX JNE sealSSETailLoadLoop - MOVQ t2, 0+tmpStore - MOVQ t3, 8+tmpStore - PXOR 0+tmpStore, A1 - MOVOU A1, (oup) - MOVOU -16(t0)(itr2*1), T0 - PAND T0, A1 - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - ADDQ inl, oup + MOVQ R15, 64(BP) + MOVQ R8, 72(BP) + PXOR 64(BP), X1 + MOVOU X1, (DI) + MOVOU -16(R13)(R9*1), X12 + PAND X12, X1 + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ BX, DI sealSSEFinalize: // Hash in the buffer lengths - ADDQ ad_len+80(FP), acc0 - ADCQ src_len+56(FP), acc1 - ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally store the tag at the end of the message - MOVQ acc0, (0*8)(oup) - MOVQ acc1, (1*8)(oup) + MOVQ R10, (DI) + MOVQ R11, 8(DI) RET -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- chacha20Poly1305Seal_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers - CMPQ inl, $192 - JBE seal192AVX2 // 33% faster - CMPQ inl, $320 - JBE seal320AVX2 // 17% faster + CMPQ BX, $0x000000c0 + JBE seal192AVX2 + CMPQ BX, $0x00000140 + JBE seal320AVX2 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 - VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA Y12, 64(BP) + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, 96(BP) + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y1, 128(BP) + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, R9 sealAVX2IntroLoop: - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr2 - JNE sealAVX2IntroLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - - VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 - VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key - VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ R9 + JNE sealAVX2IntroLoop + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPERM2I128 $0x02, Y0, Y14, Y4 + VPERM2I128 $0x13, Y0, Y14, Y0 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), DD0, DD0 - VMOVDQA DD0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, (BP) // Hash AD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) // Can store at least 320 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), CC0, CC0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU CC0, (1*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 - VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 - VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) - - MOVQ $320, itr1 - SUBQ $320, inl - LEAQ 320(inp), inp - - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 - CMPQ inl, $128 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y12, Y12 + VMOVDQU Y0, (DI) + VMOVDQU Y12, 32(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 64(SI), Y0, Y0 + VPXOR 96(SI), Y14, Y14 + VPXOR 128(SI), Y12, Y12 + VPXOR 160(SI), Y4, Y4 + VMOVDQU Y0, 64(DI) + VMOVDQU Y14, 96(DI) + VMOVDQU Y12, 128(DI) + VMOVDQU Y4, 160(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 192(SI), Y0, Y0 + VPXOR 224(SI), Y14, Y14 + VPXOR 256(SI), Y12, Y12 + VPXOR 288(SI), Y4, Y4 + VMOVDQU Y0, 192(DI) + VMOVDQU Y14, 224(DI) + VMOVDQU Y12, 256(DI) + VMOVDQU Y4, 288(DI) + MOVQ $0x00000140, CX + SUBQ $0x00000140, BX + LEAQ 320(SI), SI + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, Y15, Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, Y15, Y3, Y4 + CMPQ BX, $0x80 JBE sealAVX2SealHash - - VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 - VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVQ $8, itr1 - MOVQ $2, itr2 - - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - CMPQ inl, $512 - JBE sealAVX2Tail512 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VPXOR 64(SI), Y12, Y12 + VPXOR 96(SI), Y4, Y4 + VMOVDQU Y0, 320(DI) + VMOVDQU Y14, 352(DI) + VMOVDQU Y12, 384(DI) + VMOVDQU Y4, 416(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVQ $0x00000008, CX + MOVQ $0x00000002, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + CMPQ BX, $0x00000200 + JBE sealAVX2Tail512 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - - SUBQ $16, oup // Adjust the pointer - MOVQ $9, itr1 - JMP sealAVX2InternalLoopStart + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + SUBQ $0x10, DI + MOVQ $0x00000009, CX + JMP sealAVX2InternalLoopStart sealAVX2MainLoop: - // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, CX sealAVX2InternalLoop: - polyAdd(0*8(oup)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 sealAVX2InternalLoopStart: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(oup)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(oup)) - LEAQ (6*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr1 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(DI), R10 + ADCQ 40(DI), R11 + ADCQ $0x01, R12 + LEAQ 48(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX JNE sealAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(-2*8(oup)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - SUBQ $(32*16), inl - CMPQ inl, $512 + ADDQ -16(DI), R10 + ADCQ -8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + SUBQ $0x00000200, BX + CMPQ BX, $0x00000200 JG sealAVX2MainLoop // Tail can only hash 480 bytes - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ 32(oup), oup - - MOVQ $10, itr1 - MOVQ $0, itr2 - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - JMP sealAVX2Tail512 - -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + MOVQ $0x0000000a, CX + MOVQ $0x00000000, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + seal192AVX2: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 sealAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE sealAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 sealAVX2ShortSeal: // Hash aad - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealAVX2SealHash: // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealAVX2ShortSealLoop - polyAdd(0(oup)) - polyMul - SUBQ $16, itr1 - ADDQ $16, oup - JMP sealAVX2SealHash + CMPQ CX, $0x10 + JB sealAVX2ShortSealLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealAVX2SealHash sealAVX2ShortSealLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB sealAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for encryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI // Now can hash - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (1*32)(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP sealAVX2ShortSealLoop sealAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB sealAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for encryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI // Hash - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 sealAVX2ShortDone: VZEROUPPER JMP sealSSETail -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes seal320AVX2: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 sealAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE sealAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP sealAVX2ShortSeal -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext sealAVX2Tail128: - // Need to decrypt up to 128 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0 - VMOVDQA state1StoreAVX2, BB0 - VMOVDQA state2StoreAVX2, CC0 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VMOVDQA DD0, DD1 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA 32(BP), Y14 + VMOVDQA 64(BP), Y12 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, Y1 sealAVX2Tail128LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail128LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $4, DD0, DD0, DD0 - DECQ itr1 - JG sealAVX2Tail128LoopA - DECQ itr2 - JGE sealAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA1 - VPADDD state1StoreAVX2, BB0, BB1 - VPADDD state2StoreAVX2, CC0, CC1 - VPADDD DD1, DD0, DD1 - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ CX + JG sealAVX2Tail128LoopA + DECQ R9 + JGE sealAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y5 + VPADDD 32(BP), Y14, Y9 + VPADDD 64(BP), Y12, Y13 + VPADDD Y1, Y4, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 JMP sealAVX2ShortSealLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext sealAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 sealAVX2Tail256LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr1 - JG sealAVX2Tail256LoopA - DECQ itr2 - JGE sealAVX2Tail256LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ CX + JG sealAVX2Tail256LoopA + DECQ R9 + JGE sealAVX2Tail256LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + JMP sealAVX2SealHash + sealAVX2Tail384: - // Need to decrypt up to 384 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 + VMOVDQA Y2, Y15 sealAVX2Tail384LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail384LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr1 - JG sealAVX2Tail384LoopA - DECQ itr2 - JGE sealAVX2Tail384LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0 - VPERM2I128 $0x02, CC1, DD1, TT1 - VPERM2I128 $0x13, AA1, BB1, TT2 - VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - MOVQ $256, itr1 - LEAQ 256(inp), inp - SUBQ $256, inl - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ CX + JG sealAVX2Tail384LoopA + DECQ R9 + JGE sealAVX2Tail384LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPADDD Y15, Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + MOVQ $0x00000100, CX + LEAQ 256(SI), SI + SUBQ $0x00000100, BX + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + JMP sealAVX2SealHash + sealAVX2Tail512: - // Need to decrypt up to 512 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) sealAVX2Tail512LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail512LoopB: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(oup)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - - DECQ itr1 - JG sealAVX2Tail512LoopA - DECQ itr2 - JGE sealAVX2Tail512LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3 - VPXOR (0*32)(inp), CC3, CC3 - VMOVDQU CC3, (0*32)(oup) - VPERM2I128 $0x02, CC0, DD0, CC3 - VPXOR (1*32)(inp), CC3, CC3 - VMOVDQU CC3, (1*32)(oup) - VPERM2I128 $0x13, AA0, BB0, CC3 - VPXOR (2*32)(inp), CC3, CC3 - VMOVDQU CC3, (2*32)(oup) - VPERM2I128 $0x13, CC0, DD0, CC3 - VPXOR (3*32)(inp), CC3, CC3 - VMOVDQU CC3, (3*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - - MOVQ $384, itr1 - LEAQ 384(inp), inp - SUBQ $384, inl - VPERM2I128 $0x02, AA3, BB3, AA0 - VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 - VPERM2I128 $0x13, AA3, BB3, CC0 - VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - JMP sealAVX2SealHash + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX + JG sealAVX2Tail512LoopA + DECQ R9 + JGE sealAVX2Tail512LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPXOR (SI), Y15, Y15 + VMOVDQU Y15, (DI) + VPERM2I128 $0x02, Y12, Y4, Y15 + VPXOR 32(SI), Y15, Y15 + VMOVDQU Y15, 32(DI) + VPERM2I128 $0x13, Y0, Y14, Y15 + VPXOR 64(SI), Y15, Y15 + VMOVDQU Y15, 64(DI) + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + MOVQ $0x00000180, CX + LEAQ 384(SI), SI + SUBQ $0x00000180, BX + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + JMP sealAVX2SealHash From bcb0f91bbceb3486cc7f10102ff046661fb4d364 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Sun, 21 Jul 2024 14:23:57 -0400 Subject: [PATCH 76/95] internal/poly1305: Port sum_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="internal/poly1305/sum_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: I80212c95d1b05335d7f6b73a3030b6f812f6105b Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600035 Reviewed-by: Roland Shoemaker Reviewed-by: Filippo Valsorda Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI --- internal/poly1305/_asm/go.mod | 15 +++ internal/poly1305/_asm/go.sum | 12 +++ internal/poly1305/_asm/sum_amd64_asm.go | 126 ++++++++++++++++++++++ internal/poly1305/sum_amd64.s | 133 +++++++++++------------- 4 files changed, 212 insertions(+), 74 deletions(-) create mode 100644 internal/poly1305/_asm/go.mod create mode 100644 internal/poly1305/_asm/go.sum create mode 100644 internal/poly1305/_asm/sum_amd64_asm.go diff --git a/internal/poly1305/_asm/go.mod b/internal/poly1305/_asm/go.mod new file mode 100644 index 0000000000..47f2b758ef --- /dev/null +++ b/internal/poly1305/_asm/go.mod @@ -0,0 +1,15 @@ +module internal/poly1305/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/internal/poly1305/_asm/go.sum b/internal/poly1305/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/internal/poly1305/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/internal/poly1305/_asm/sum_amd64_asm.go b/internal/poly1305/_asm/sum_amd64_asm.go new file mode 100644 index 0000000000..a445c68f01 --- /dev/null +++ b/internal/poly1305/_asm/sum_amd64_asm.go @@ -0,0 +1,126 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/sha3" +) + +//go:generate go run . -out ../sum_amd64.s -pkg poly1305 + +func main() { + Package("golang.org/x/crypto/internal/poly1305") + ConstraintExpr("gc,!purego") + update() + Generate() +} + +func update() { + Implement("update") + + Load(Param("state"), RDI) + MOVQ(NewParamAddr("msg_base", 8), RSI) + MOVQ(NewParamAddr("msg_len", 16), R15) + + MOVQ(Mem{Base: DI}.Offset(0), R8) // h0 + MOVQ(Mem{Base: DI}.Offset(8), R9) // h1 + MOVQ(Mem{Base: DI}.Offset(16), R10) // h2 + MOVQ(Mem{Base: DI}.Offset(24), R11) // r0 + MOVQ(Mem{Base: DI}.Offset(32), R12) // r1 + + CMPQ(R15, Imm(16)) + JB(LabelRef("bytes_between_0_and_15")) + + Label("loop") + POLY1305_ADD(RSI, R8, R9, R10) + + Label("multiply") + POLY1305_MUL(R8, R9, R10, R11, R12, RBX, RCX, R13, R14) + SUBQ(Imm(16), R15) + CMPQ(R15, Imm(16)) + JAE(LabelRef("loop")) + + Label("bytes_between_0_and_15") + TESTQ(R15, R15) + JZ(LabelRef("done")) + MOVQ(U32(1), RBX) + XORQ(RCX, RCX) + XORQ(R13, R13) + ADDQ(R15, RSI) + + Label("flush_buffer") + SHLQ(Imm(8), RBX, RCX) + SHLQ(Imm(8), RBX) + MOVB(Mem{Base: SI}.Offset(-1), R13B) + XORQ(R13, RBX) + DECQ(RSI) + DECQ(R15) + JNZ(LabelRef("flush_buffer")) + + ADDQ(RBX, R8) + ADCQ(RCX, R9) + ADCQ(Imm(0), R10) + MOVQ(U32(16), R15) + JMP(LabelRef("multiply")) + + Label("done") + MOVQ(R8, Mem{Base: DI}.Offset(0)) + MOVQ(R9, Mem{Base: DI}.Offset(8)) + MOVQ(R10, Mem{Base: DI}.Offset(16)) + RET() +} + +func POLY1305_ADD(msg, h0, h1, h2 GPPhysical) { + ADDQ(Mem{Base: msg}.Offset(0), h0) + ADCQ(Mem{Base: msg}.Offset(8), h1) + ADCQ(Imm(1), h2) + LEAQ(Mem{Base: msg}.Offset(16), msg) +} + +func POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3 GPPhysical) { + MOVQ(r0, RAX) + MULQ(h0) + MOVQ(RAX, t0) + MOVQ(RDX, t1) + MOVQ(r0, RAX) + MULQ(h1) + ADDQ(RAX, t1) + ADCQ(Imm(0), RDX) + MOVQ(r0, t2) + IMULQ(h2, t2) + ADDQ(RDX, t2) + + MOVQ(r1, RAX) + MULQ(h0) + ADDQ(RAX, t1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, h0) + MOVQ(r1, t3) + IMULQ(h2, t3) + MOVQ(r1, RAX) + MULQ(h1) + ADDQ(RAX, t2) + ADCQ(RDX, t3) + ADDQ(h0, t2) + ADCQ(Imm(0), t3) + + MOVQ(t0, h0) + MOVQ(t1, h1) + MOVQ(t2, h2) + ANDQ(Imm(3), h2) + MOVQ(t2, t0) + ANDQ(I32(-4), t0) + ADDQ(t0, h0) + ADCQ(t3, h1) + ADCQ(Imm(0), h2) + SHRQ(Imm(2), t3, t2) + SHRQ(Imm(2), t3) + ADDQ(t2, h0) + ADCQ(t3, h1) + ADCQ(Imm(0), h2) +} diff --git a/internal/poly1305/sum_amd64.s b/internal/poly1305/sum_amd64.s index e0d3c64756..133757384b 100644 --- a/internal/poly1305/sum_amd64.s +++ b/internal/poly1305/sum_amd64.s @@ -1,108 +1,93 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT. //go:build gc && !purego -#include "textflag.h" - -#define POLY1305_ADD(msg, h0, h1, h2) \ - ADDQ 0(msg), h0; \ - ADCQ 8(msg), h1; \ - ADCQ $1, h2; \ - LEAQ 16(msg), msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ - MOVQ r0, AX; \ - MULQ h0; \ - MOVQ AX, t0; \ - MOVQ DX, t1; \ - MOVQ r0, AX; \ - MULQ h1; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ r0, t2; \ - IMULQ h2, t2; \ - ADDQ DX, t2; \ - \ - MOVQ r1, AX; \ - MULQ h0; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ DX, h0; \ - MOVQ r1, t3; \ - IMULQ h2, t3; \ - MOVQ r1, AX; \ - MULQ h1; \ - ADDQ AX, t2; \ - ADCQ DX, t3; \ - ADDQ h0, t2; \ - ADCQ $0, t3; \ - \ - MOVQ t0, h0; \ - MOVQ t1, h1; \ - MOVQ t2, h2; \ - ANDQ $3, h2; \ - MOVQ t2, t0; \ - ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ - ADDQ t0, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2; \ - SHRQ $2, t3, t2; \ - SHRQ $2, t3; \ - ADDQ t2, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2 - -// func update(state *[7]uint64, msg []byte) +// func update(state *macState, msg []byte) TEXT ·update(SB), $0-32 MOVQ state+0(FP), DI MOVQ msg_base+8(FP), SI MOVQ msg_len+16(FP), R15 - - MOVQ 0(DI), R8 // h0 - MOVQ 8(DI), R9 // h1 - MOVQ 16(DI), R10 // h2 - MOVQ 24(DI), R11 // r0 - MOVQ 32(DI), R12 // r1 - - CMPQ R15, $16 + MOVQ (DI), R8 + MOVQ 8(DI), R9 + MOVQ 16(DI), R10 + MOVQ 24(DI), R11 + MOVQ 32(DI), R12 + CMPQ R15, $0x10 JB bytes_between_0_and_15 loop: - POLY1305_ADD(SI, R8, R9, R10) + ADDQ (SI), R8 + ADCQ 8(SI), R9 + ADCQ $0x01, R10 + LEAQ 16(SI), SI multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) - SUBQ $16, R15 - CMPQ R15, $16 - JAE loop + MOVQ R11, AX + MULQ R8 + MOVQ AX, BX + MOVQ DX, CX + MOVQ R11, AX + MULQ R9 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ R11, R13 + IMULQ R10, R13 + ADDQ DX, R13 + MOVQ R12, AX + MULQ R8 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ DX, R8 + MOVQ R12, R14 + IMULQ R10, R14 + MOVQ R12, AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R14 + ADDQ R8, R13 + ADCQ $0x00, R14 + MOVQ BX, R8 + MOVQ CX, R9 + MOVQ R13, R10 + ANDQ $0x03, R10 + MOVQ R13, BX + ANDQ $-4, BX + ADDQ BX, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SHRQ $0x02, R14, R13 + SHRQ $0x02, R14 + ADDQ R13, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SUBQ $0x10, R15 + CMPQ R15, $0x10 + JAE loop bytes_between_0_and_15: TESTQ R15, R15 JZ done - MOVQ $1, BX + MOVQ $0x00000001, BX XORQ CX, CX XORQ R13, R13 ADDQ R15, SI flush_buffer: - SHLQ $8, BX, CX - SHLQ $8, BX + SHLQ $0x08, BX, CX + SHLQ $0x08, BX MOVB -1(SI), R13 XORQ R13, BX DECQ SI DECQ R15 JNZ flush_buffer - ADDQ BX, R8 ADCQ CX, R9 - ADCQ $0, R10 - MOVQ $16, R15 + ADCQ $0x00, R10 + MOVQ $0x00000010, R15 JMP multiply done: - MOVQ R8, 0(DI) + MOVQ R8, (DI) MOVQ R9, 8(DI) MOVQ R10, 16(DI) RET From b35ab4fde0e27d900fc800ae12370c858b58ba41 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Wed, 4 Sep 2024 15:05:55 +0000 Subject: [PATCH 77/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I94bb1c6a4bb08aff8c146e84a9d4b3e353f098c2 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/610638 LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Michael Pratt Auto-Submit: Gopher Robot --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index e206587663..d3527d40d7 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.23.0 - golang.org/x/term v0.23.0 + golang.org/x/sys v0.25.0 + golang.org/x/term v0.24.0 ) -require golang.org/x/text v0.17.0 // indirect +require golang.org/x/text v0.18.0 // indirect diff --git a/go.sum b/go.sum index c7231381e6..b347167687 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= -golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= -golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= -golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= -golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM= +golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= From c9da6b9a4008902aae7c754e8f01d42e2d2cf205 Mon Sep 17 00:00:00 2001 From: Dmitri Shuralyov Date: Wed, 4 Sep 2024 17:15:43 -0400 Subject: [PATCH 78/95] all: fix printf(var) mistakes detected by latest printf checker These were problematic but previously easy to miss. They're now easy to spot thanks to build failures at Go tip as of CL 610736. For golang/go#68796. Change-Id: I167f2cce2376b4070460389c673d973e4521d3dc Reviewed-on: https://go-review.googlesource.com/c/crypto/+/610797 Auto-Submit: Dmitri Shuralyov Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Reviewed-by: Alan Donovan --- acme/autocert/internal/acmetest/ca.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/acme/autocert/internal/acmetest/ca.go b/acme/autocert/internal/acmetest/ca.go index 0a5ebe7ab7..504a9a0e07 100644 --- a/acme/autocert/internal/acmetest/ca.go +++ b/acme/autocert/internal/acmetest/ca.go @@ -308,7 +308,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { } if err := decodePayload(&req, r.Body); err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } @@ -328,7 +328,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { Identifiers []struct{ Value string } } if err := decodePayload(&req, r.Body); err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } ca.mu.Lock() @@ -352,7 +352,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { defer ca.mu.Unlock() o, err := ca.storedOrder(strings.TrimPrefix(r.URL.Path, "/orders/")) if err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } if err := json.NewEncoder(w).Encode(o); err != nil { @@ -412,7 +412,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { orderID := strings.TrimPrefix(r.URL.Path, "/new-cert/") o, err := ca.storedOrder(orderID) if err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } if o.Status != acme.StatusReady { @@ -427,7 +427,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { b, _ := base64.RawURLEncoding.DecodeString(req.CSR) csr, err := x509.ParseCertificateRequest(b) if err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } // Issue the certificate. @@ -449,7 +449,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { defer ca.mu.Unlock() o, err := ca.storedOrder(strings.TrimPrefix(r.URL.Path, "/issued-cert/")) if err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } if o.Status != acme.StatusValid { From 9e92970a1eb41e446822e037016aa89d24c0ce7a Mon Sep 17 00:00:00 2001 From: cuishuang Date: Mon, 9 Sep 2024 14:22:39 +0800 Subject: [PATCH 79/95] bn256: add missing symbols in comment Change-Id: Ibd48a070bd8ce35ef5795a8b73bc4ecac43a993e Reviewed-on: https://go-review.googlesource.com/c/crypto/+/611735 Run-TryBot: shuang cui Commit-Queue: Ian Lance Taylor LUCI-TryBot-Result: Go LUCI Auto-Submit: Ian Lance Taylor Reviewed-by: Roland Shoemaker TryBot-Result: Gopher Robot Reviewed-by: Ian Lance Taylor --- bn256/gfp12.go | 2 +- bn256/gfp2.go | 2 +- bn256/gfp6.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bn256/gfp12.go b/bn256/gfp12.go index 2b0151ebcc..b05a8b727f 100644 --- a/bn256/gfp12.go +++ b/bn256/gfp12.go @@ -5,7 +5,7 @@ package bn256 // For details of the algorithms used, see "Multiplication and Squaring on -// Pairing-Friendly Fields, Devegili et al. +// Pairing-Friendly Fields", Devegili et al. // http://eprint.iacr.org/2006/471.pdf. import ( diff --git a/bn256/gfp2.go b/bn256/gfp2.go index 97f3f1f3fa..aa39a3043b 100644 --- a/bn256/gfp2.go +++ b/bn256/gfp2.go @@ -5,7 +5,7 @@ package bn256 // For details of the algorithms used, see "Multiplication and Squaring on -// Pairing-Friendly Fields, Devegili et al. +// Pairing-Friendly Fields", Devegili et al. // http://eprint.iacr.org/2006/471.pdf. import ( diff --git a/bn256/gfp6.go b/bn256/gfp6.go index f98ae782cc..7dec5eabd6 100644 --- a/bn256/gfp6.go +++ b/bn256/gfp6.go @@ -5,7 +5,7 @@ package bn256 // For details of the algorithms used, see "Multiplication and Squaring on -// Pairing-Friendly Fields, Devegili et al. +// Pairing-Friendly Fields", Devegili et al. // http://eprint.iacr.org/2006/471.pdf. import ( From 42ee18b963777d907bbef3e59665cf80968d57e6 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sun, 25 Feb 2024 16:26:56 +0100 Subject: [PATCH 80/95] ssh: return ServerAuthError after too many auth failures if a client is disconnected due to too many authentication attempts we should return a ServerAuthError instead of a generic error. Some users check the error returned by NewServerConn to determine whether or not a client attempted to authenticate. Fixes golang/go#69191 Change-Id: If68fcecdefd6c810fe9df8256b1216e320d8a916 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/566398 Reviewed-by: Filippo Valsorda Reviewed-by: Tim King Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI Reviewed-by: Carlos Amedee --- ssh/client_auth_test.go | 25 ++++++++++++++++++------- ssh/server.go | 4 ++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/ssh/client_auth_test.go b/ssh/client_auth_test.go index bf0aa1fe23..e981cc49a6 100644 --- a/ssh/client_auth_test.go +++ b/ssh/client_auth_test.go @@ -641,17 +641,28 @@ func TestClientAuthMaxAuthTries(t *testing.T) { defer c1.Close() defer c2.Close() - go newServer(c1, serverConfig) - _, _, _, err = NewClientConn(c2, "", clientConfig) - if tries > 2 { - if err == nil { + errCh := make(chan error, 1) + + go func() { + _, err := newServer(c1, serverConfig) + errCh <- err + }() + _, _, _, cliErr := NewClientConn(c2, "", clientConfig) + srvErr := <-errCh + + if tries > serverConfig.MaxAuthTries { + if cliErr == nil { t.Fatalf("client: got no error, want %s", expectedErr) - } else if err.Error() != expectedErr.Error() { + } else if cliErr.Error() != expectedErr.Error() { t.Fatalf("client: got %s, want %s", err, expectedErr) } + var authErr *ServerAuthError + if !errors.As(srvErr, &authErr) { + t.Errorf("expected ServerAuthError, got: %v", srvErr) + } } else { - if err != nil { - t.Fatalf("client: got %s, want no error", err) + if cliErr != nil { + t.Fatalf("client: got %s, want no error", cliErr) } } } diff --git a/ssh/server.go b/ssh/server.go index 3ca9e89e22..c0d1c29e6f 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -510,8 +510,8 @@ userAuthLoop: if err := s.transport.writePacket(Marshal(discMsg)); err != nil { return nil, err } - - return nil, discMsg + authErrs = append(authErrs, discMsg) + return nil, &ServerAuthError{Errors: authErrs} } var userAuthReq userAuthRequestMsg From a0819fbb0244af70857f03b6984e1d4f93e6cabf Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Mon, 11 Mar 2024 23:44:23 +0000 Subject: [PATCH 81/95] sha3: fix cSHAKE initialization for extremely large N and or S While both impractical and unlikely, the multiplication could overflow on 32-bit architectures. The 64-bit architecture case is unaffected by both the maximum length of Go slices being too small to trigger the overflow (everything except s390), and it being safe to assume no machine has more than 2 EiB of memory. Fixes golang/go#66232 Change-Id: I19c15d42d2d6af35e296697159d43d02f513e614 GitHub-Last-Rev: 503e180debfdc93ab99977172af2b64290cb80e8 GitHub-Pull-Request: golang/crypto#286 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/570876 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase Reviewed-by: Filippo Valsorda Auto-Submit: Filippo Valsorda Reviewed-by: Michael Knyszek --- sha3/shake.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sha3/shake.go b/sha3/shake.go index 1ea9275b8b..a01ef43577 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -85,9 +85,9 @@ func newCShake(N, S []byte, rate, outputLen int, dsbyte byte) ShakeHash { // leftEncode returns max 9 bytes c.initBlock = make([]byte, 0, 9*2+len(N)+len(S)) - c.initBlock = append(c.initBlock, leftEncode(uint64(len(N)*8))...) + c.initBlock = append(c.initBlock, leftEncode(uint64(len(N))*8)...) c.initBlock = append(c.initBlock, N...) - c.initBlock = append(c.initBlock, leftEncode(uint64(len(S)*8))...) + c.initBlock = append(c.initBlock, leftEncode(uint64(len(S))*8)...) c.initBlock = append(c.initBlock, S...) c.Write(bytepad(c.initBlock, c.rate)) return &c From adef4cc1a8c2ca4da1b1f4e6c976b59ca22dbfb8 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Fri, 4 Oct 2024 15:23:27 +0000 Subject: [PATCH 82/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: Id321d3b5909ecb66c0311ba86008509c7895863b Reviewed-on: https://go-review.googlesource.com/c/crypto/+/617958 Auto-Submit: Gopher Robot Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index d3527d40d7..ecf61b3e18 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.25.0 - golang.org/x/term v0.24.0 + golang.org/x/sys v0.26.0 + golang.org/x/term v0.25.0 ) -require golang.org/x/text v0.18.0 // indirect +require golang.org/x/text v0.19.0 // indirect diff --git a/go.sum b/go.sum index b347167687..5b12ab8eee 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= -golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM= -golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8= -golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= -golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= From 6c2174895805a9c4273cf0052c9ff61ba3e75812 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Wed, 18 Sep 2024 20:36:04 +0000 Subject: [PATCH 83/95] internal/poly1305: extend ppc64le support to ppc64 The cipher needs to load the stream in LE order. Use the byte reversing loads on BE. Also, remove the unused variable poly1305Mask in the PPC64 asm file too. Change-Id: Ie90fe7bb0ea7a3bcb76583e0cf9c1e4133499541 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/614298 Reviewed-by: Michael Knyszek Reviewed-by: Archana Ravindar LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase --- internal/poly1305/mac_noasm.go | 2 +- .../{sum_ppc64le.go => sum_ppc64x.go} | 2 +- .../poly1305/{sum_ppc64le.s => sum_ppc64x.s} | 30 ++++++++++++------- 3 files changed, 21 insertions(+), 13 deletions(-) rename internal/poly1305/{sum_ppc64le.go => sum_ppc64x.go} (95%) rename internal/poly1305/{sum_ppc64le.s => sum_ppc64x.s} (89%) diff --git a/internal/poly1305/mac_noasm.go b/internal/poly1305/mac_noasm.go index 333da285b3..bd896bdc76 100644 --- a/internal/poly1305/mac_noasm.go +++ b/internal/poly1305/mac_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego +//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego package poly1305 diff --git a/internal/poly1305/sum_ppc64le.go b/internal/poly1305/sum_ppc64x.go similarity index 95% rename from internal/poly1305/sum_ppc64le.go rename to internal/poly1305/sum_ppc64x.go index 4aec4874b5..1a1679aaad 100644 --- a/internal/poly1305/sum_ppc64le.go +++ b/internal/poly1305/sum_ppc64x.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && (ppc64 || ppc64le) package poly1305 diff --git a/internal/poly1305/sum_ppc64le.s b/internal/poly1305/sum_ppc64x.s similarity index 89% rename from internal/poly1305/sum_ppc64le.s rename to internal/poly1305/sum_ppc64x.s index b3c1699bff..6899a1dabc 100644 --- a/internal/poly1305/sum_ppc64le.s +++ b/internal/poly1305/sum_ppc64x.s @@ -2,15 +2,25 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && (ppc64 || ppc64le) #include "textflag.h" // This was ported from the amd64 implementation. +#ifdef GOARCH_ppc64le +#define LE_MOVD MOVD +#define LE_MOVWZ MOVWZ +#define LE_MOVHZ MOVHZ +#else +#define LE_MOVD MOVDBR +#define LE_MOVWZ MOVWBR +#define LE_MOVHZ MOVHBR +#endif + #define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \ - MOVD (msg), t0; \ - MOVD 8(msg), t1; \ + LE_MOVD (msg)( R0), t0; \ + LE_MOVD (msg)(R24), t1; \ MOVD $1, t2; \ ADDC t0, h0, h0; \ ADDE t1, h1, h1; \ @@ -50,10 +60,6 @@ ADDE t3, h1, h1; \ ADDZE h2 -DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -GLOBL ·poly1305Mask<>(SB), RODATA, $16 - // func update(state *[7]uint64, msg []byte) TEXT ·update(SB), $0-32 MOVD state+0(FP), R3 @@ -66,6 +72,8 @@ TEXT ·update(SB), $0-32 MOVD 24(R3), R11 // r0 MOVD 32(R3), R12 // r1 + MOVD $8, R24 + CMP R5, $16 BLT bytes_between_0_and_15 @@ -94,7 +102,7 @@ flush_buffer: // Greater than 8 -- load the rightmost remaining bytes in msg // and put into R17 (h1) - MOVD (R4)(R21), R17 + LE_MOVD (R4)(R21), R17 MOVD $16, R22 // Find the offset to those bytes @@ -118,7 +126,7 @@ just1: BLT less8 // Exactly 8 - MOVD (R4), R16 + LE_MOVD (R4), R16 CMP R17, $0 @@ -133,7 +141,7 @@ less8: MOVD $0, R22 // shift count CMP R5, $4 BLT less4 - MOVWZ (R4), R16 + LE_MOVWZ (R4), R16 ADD $4, R4 ADD $-4, R5 MOVD $32, R22 @@ -141,7 +149,7 @@ less8: less4: CMP R5, $2 BLT less2 - MOVHZ (R4), R21 + LE_MOVHZ (R4), R21 SLD R22, R21, R21 OR R16, R21, R16 ADD $16, R22 From b61b08db44b86a0fb8510036a4655fc4a3d37cd3 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Fri, 13 Sep 2024 19:58:45 +0000 Subject: [PATCH 84/95] chacha20: extend ppc64le support to ppc64 This requires fixing an incorrect save of the counter. It is a word value. It happens to work on LE because length is limited to u32. Refactor the constant table to load correctly independent of byte ordering. Add byte order swapping where output needs converted to LE ordering for storage. Change-Id: Ic7e09bd1c769bb77dd6e817f5a8639ba765f4c0f Reviewed-on: https://go-review.googlesource.com/c/crypto/+/614297 Reviewed-by: Cherry Mui Reviewed-by: Michael Knyszek Reviewed-by: Archana Ravindar LUCI-TryBot-Result: Go LUCI --- chacha20/chacha_noasm.go | 2 +- .../{chacha_ppc64le.go => chacha_ppc64x.go} | 2 +- .../{chacha_ppc64le.s => chacha_ppc64x.s} | 114 +++++++++++++----- 3 files changed, 88 insertions(+), 30 deletions(-) rename chacha20/{chacha_ppc64le.go => chacha_ppc64x.go} (89%) rename chacha20/{chacha_ppc64le.s => chacha_ppc64x.s} (76%) diff --git a/chacha20/chacha_noasm.go b/chacha20/chacha_noasm.go index db42e6676a..c709b72847 100644 --- a/chacha20/chacha_noasm.go +++ b/chacha20/chacha_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego +//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego package chacha20 diff --git a/chacha20/chacha_ppc64le.go b/chacha20/chacha_ppc64x.go similarity index 89% rename from chacha20/chacha_ppc64le.go rename to chacha20/chacha_ppc64x.go index 3a4287f990..bd183d9ba1 100644 --- a/chacha20/chacha_ppc64le.go +++ b/chacha20/chacha_ppc64x.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && (ppc64 || ppc64le) package chacha20 diff --git a/chacha20/chacha_ppc64le.s b/chacha20/chacha_ppc64x.s similarity index 76% rename from chacha20/chacha_ppc64le.s rename to chacha20/chacha_ppc64x.s index c672ccf698..a660b4112f 100644 --- a/chacha20/chacha_ppc64le.s +++ b/chacha20/chacha_ppc64x.s @@ -19,7 +19,7 @@ // The differences in this and the original implementation are // due to the calling conventions and initialization of constants. -//go:build gc && !purego +//go:build gc && !purego && (ppc64 || ppc64le) #include "textflag.h" @@ -36,32 +36,68 @@ // for VPERMXOR #define MASK R18 -DATA consts<>+0x00(SB)/8, $0x3320646e61707865 -DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 -DATA consts<>+0x10(SB)/8, $0x0000000000000001 -DATA consts<>+0x18(SB)/8, $0x0000000000000000 -DATA consts<>+0x20(SB)/8, $0x0000000000000004 -DATA consts<>+0x28(SB)/8, $0x0000000000000000 -DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d -DATA consts<>+0x38(SB)/8, $0x0203000106070405 -DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c -DATA consts<>+0x48(SB)/8, $0x0102030005060704 -DATA consts<>+0x50(SB)/8, $0x6170786561707865 -DATA consts<>+0x58(SB)/8, $0x6170786561707865 -DATA consts<>+0x60(SB)/8, $0x3320646e3320646e -DATA consts<>+0x68(SB)/8, $0x3320646e3320646e -DATA consts<>+0x70(SB)/8, $0x79622d3279622d32 -DATA consts<>+0x78(SB)/8, $0x79622d3279622d32 -DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 -DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 -DATA consts<>+0x90(SB)/8, $0x0000000100000000 -DATA consts<>+0x98(SB)/8, $0x0000000300000002 -DATA consts<>+0xa0(SB)/8, $0x5566774411223300 -DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88 -DATA consts<>+0xb0(SB)/8, $0x6677445522330011 -DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899 +DATA consts<>+0x00(SB)/4, $0x61707865 +DATA consts<>+0x04(SB)/4, $0x3320646e +DATA consts<>+0x08(SB)/4, $0x79622d32 +DATA consts<>+0x0c(SB)/4, $0x6b206574 +DATA consts<>+0x10(SB)/4, $0x00000001 +DATA consts<>+0x14(SB)/4, $0x00000000 +DATA consts<>+0x18(SB)/4, $0x00000000 +DATA consts<>+0x1c(SB)/4, $0x00000000 +DATA consts<>+0x20(SB)/4, $0x00000004 +DATA consts<>+0x24(SB)/4, $0x00000000 +DATA consts<>+0x28(SB)/4, $0x00000000 +DATA consts<>+0x2c(SB)/4, $0x00000000 +DATA consts<>+0x30(SB)/4, $0x0e0f0c0d +DATA consts<>+0x34(SB)/4, $0x0a0b0809 +DATA consts<>+0x38(SB)/4, $0x06070405 +DATA consts<>+0x3c(SB)/4, $0x02030001 +DATA consts<>+0x40(SB)/4, $0x0d0e0f0c +DATA consts<>+0x44(SB)/4, $0x090a0b08 +DATA consts<>+0x48(SB)/4, $0x05060704 +DATA consts<>+0x4c(SB)/4, $0x01020300 +DATA consts<>+0x50(SB)/4, $0x61707865 +DATA consts<>+0x54(SB)/4, $0x61707865 +DATA consts<>+0x58(SB)/4, $0x61707865 +DATA consts<>+0x5c(SB)/4, $0x61707865 +DATA consts<>+0x60(SB)/4, $0x3320646e +DATA consts<>+0x64(SB)/4, $0x3320646e +DATA consts<>+0x68(SB)/4, $0x3320646e +DATA consts<>+0x6c(SB)/4, $0x3320646e +DATA consts<>+0x70(SB)/4, $0x79622d32 +DATA consts<>+0x74(SB)/4, $0x79622d32 +DATA consts<>+0x78(SB)/4, $0x79622d32 +DATA consts<>+0x7c(SB)/4, $0x79622d32 +DATA consts<>+0x80(SB)/4, $0x6b206574 +DATA consts<>+0x84(SB)/4, $0x6b206574 +DATA consts<>+0x88(SB)/4, $0x6b206574 +DATA consts<>+0x8c(SB)/4, $0x6b206574 +DATA consts<>+0x90(SB)/4, $0x00000000 +DATA consts<>+0x94(SB)/4, $0x00000001 +DATA consts<>+0x98(SB)/4, $0x00000002 +DATA consts<>+0x9c(SB)/4, $0x00000003 +DATA consts<>+0xa0(SB)/4, $0x11223300 +DATA consts<>+0xa4(SB)/4, $0x55667744 +DATA consts<>+0xa8(SB)/4, $0x99aabb88 +DATA consts<>+0xac(SB)/4, $0xddeeffcc +DATA consts<>+0xb0(SB)/4, $0x22330011 +DATA consts<>+0xb4(SB)/4, $0x66774455 +DATA consts<>+0xb8(SB)/4, $0xaabb8899 +DATA consts<>+0xbc(SB)/4, $0xeeffccdd GLOBL consts<>(SB), RODATA, $0xc0 +#ifdef GOARCH_ppc64 +#define BE_XXBRW_INIT() \ + LVSL (R0)(R0), V24 \ + VSPLTISB $3, V25 \ + VXOR V24, V25, V24 \ + +#define BE_XXBRW(vr) VPERM vr, vr, V24, vr +#else +#define BE_XXBRW_INIT() +#define BE_XXBRW(vr) +#endif + //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD out+0(FP), OUT @@ -94,6 +130,8 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 // Clear V27 VXOR V27, V27, V27 + BE_XXBRW_INIT() + // V28 LXVW4X (CONSTBASE)(R11), VS60 @@ -299,6 +337,11 @@ loop_vsx: VADDUWM V8, V18, V8 VADDUWM V12, V19, V12 + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + CMPU LEN, $64 BLT tail_vsx @@ -327,6 +370,11 @@ loop_vsx: VADDUWM V9, V18, V8 VADDUWM V13, V19, V12 + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + CMPU LEN, $64 BLT tail_vsx @@ -334,8 +382,8 @@ loop_vsx: LXVW4X (INP)(R8), VS60 LXVW4X (INP)(R9), VS61 LXVW4X (INP)(R10), VS62 - VXOR V27, V0, V27 + VXOR V27, V0, V27 VXOR V28, V4, V28 VXOR V29, V8, V29 VXOR V30, V12, V30 @@ -354,6 +402,11 @@ loop_vsx: VADDUWM V10, V18, V8 VADDUWM V14, V19, V12 + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + CMPU LEN, $64 BLT tail_vsx @@ -381,6 +434,11 @@ loop_vsx: VADDUWM V11, V18, V8 VADDUWM V15, V19, V12 + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + CMPU LEN, $64 BLT tail_vsx @@ -408,9 +466,9 @@ loop_vsx: done_vsx: // Increment counter by number of 64 byte blocks - MOVD (CNT), R14 + MOVWZ (CNT), R14 ADD BLOCKS, R14 - MOVD R14, (CNT) + MOVWZ R14, (CNT) RET tail_vsx: From 7cfb9161e8d828fd6d9f34560e78460435b63503 Mon Sep 17 00:00:00 2001 From: samiponkanen Date: Wed, 16 Oct 2024 01:53:41 +0000 Subject: [PATCH 85/95] ssh: return unexpected msg error when server fails keyboard-interactive auth early Seems the OpenSSH server running on windows fails keyboard-interactive auth this way without sending any prompt to client. In such case the golang ssh client should not retry keyboard-interactive auth when the auth method is wrapped in a RetryableAuthMethod(). Rather the auth method should be immediately marked as tried&failed and the client auth process should move on to next available and acceptable auth method. Fixes golang/go#67855 Change-Id: I6c64ae58ff8325774e37af716601b112f8833d8f GitHub-Last-Rev: 7fafc4d1c81284b31000d7d6ccadd934dda26d24 GitHub-Pull-Request: golang/crypto#297 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/590956 LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Ian Lance Taylor Auto-Submit: Nicola Murino Reviewed-by: Nicola Murino --- ssh/client_auth.go | 5 +++ ssh/client_auth_test.go | 89 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/ssh/client_auth.go b/ssh/client_auth.go index b93961010d..b86dde151d 100644 --- a/ssh/client_auth.go +++ b/ssh/client_auth.go @@ -555,6 +555,7 @@ func (cb KeyboardInteractiveChallenge) auth(session []byte, user string, c packe } gotMsgExtInfo := false + gotUserAuthInfoRequest := false for { packet, err := c.readPacket() if err != nil { @@ -585,6 +586,9 @@ func (cb KeyboardInteractiveChallenge) auth(session []byte, user string, c packe if msg.PartialSuccess { return authPartialSuccess, msg.Methods, nil } + if !gotUserAuthInfoRequest { + return authFailure, msg.Methods, unexpectedMessageError(msgUserAuthInfoRequest, packet[0]) + } return authFailure, msg.Methods, nil case msgUserAuthSuccess: return authSuccess, nil, nil @@ -596,6 +600,7 @@ func (cb KeyboardInteractiveChallenge) auth(session []byte, user string, c packe if err := Unmarshal(packet, &msg); err != nil { return authFailure, nil, err } + gotUserAuthInfoRequest = true // Manually unpack the prompt/echo pairs. rest := msg.Prompts diff --git a/ssh/client_auth_test.go b/ssh/client_auth_test.go index e981cc49a6..f11eeb590b 100644 --- a/ssh/client_auth_test.go +++ b/ssh/client_auth_test.go @@ -1293,3 +1293,92 @@ func TestCertAuthOpenSSHCompat(t *testing.T) { t.Fatalf("unable to dial remote side: %s", err) } } + +func TestKeyboardInteractiveAuthEarlyFail(t *testing.T) { + const maxAuthTries = 2 + + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + + // Start testserver + serverConfig := &ServerConfig{ + MaxAuthTries: maxAuthTries, + KeyboardInteractiveCallback: func(c ConnMetadata, + client KeyboardInteractiveChallenge) (*Permissions, error) { + // Fail keyboard-interactive authentication early before + // any prompt is sent to client. + return nil, errors.New("keyboard-interactive auth failed") + }, + PasswordCallback: func(c ConnMetadata, + pass []byte) (*Permissions, error) { + if string(pass) == clientPassword { + return nil, nil + } + return nil, errors.New("password auth failed") + }, + } + serverConfig.AddHostKey(testSigners["rsa"]) + + serverDone := make(chan struct{}) + go func() { + defer func() { serverDone <- struct{}{} }() + conn, chans, reqs, err := NewServerConn(c2, serverConfig) + if err != nil { + return + } + _ = conn.Close() + + discarderDone := make(chan struct{}) + go func() { + defer func() { discarderDone <- struct{}{} }() + DiscardRequests(reqs) + }() + for newChannel := range chans { + newChannel.Reject(Prohibited, + "testserver not accepting requests") + } + + <-discarderDone + }() + + // Connect to testserver, expect KeyboardInteractive() to be not called, + // PasswordCallback() to be called and connection to succeed. + passwordCallbackCalled := false + clientConfig := &ClientConfig{ + User: "testuser", + Auth: []AuthMethod{ + RetryableAuthMethod(KeyboardInteractive(func(name, + instruction string, questions []string, + echos []bool) ([]string, error) { + t.Errorf("unexpected call to KeyboardInteractive()") + return []string{clientPassword}, nil + }), maxAuthTries), + RetryableAuthMethod(PasswordCallback(func() (secret string, + err error) { + t.Logf("PasswordCallback()") + passwordCallbackCalled = true + return clientPassword, nil + }), maxAuthTries), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + conn, _, _, err := NewClientConn(c1, "", clientConfig) + if err != nil { + t.Errorf("unexpected NewClientConn() error: %v", err) + } + if conn != nil { + conn.Close() + } + + // Wait for server to finish. + <-serverDone + + if !passwordCallbackCalled { + t.Errorf("expected PasswordCallback() to be called") + } +} From c17aa50fbd32393e5d52fa65ca51cbfff0a75aea Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 29 Sep 2024 23:27:16 +0200 Subject: [PATCH 86/95] sha3: avoid buffer copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the package worked by copying the input (or the output) into a buffer, and then XOR'ing (or copying) it into (or out of) the state. (Except for an input fast path.) There's no need for that! We can XOR straight into the state, and copy straight out of it, at least on little endian machines. This is a bit faster, almost halves the state size, and will make it easier to implement marshaling, but most importantly look at how much simpler it makes the code! go: go1.23.0 goos: linux goarch: amd64 pkg: golang.org/x/crypto/sha3 cpu: AMD Ryzen 7 PRO 8700GE w/ Radeon 780M Graphics │ v0.27.0-2-g42ee18b9637 │ v0.27.0-2-g42ee18b9637-dirty │ │ sec/op │ sec/op vs base │ PermutationFunction-8 270.8n ± 0% 270.4n ± 0% ~ (p=0.099 n=10) Sha3_512_MTU-8 5.762µ ± 0% 5.658µ ± 0% -1.80% (p=0.000 n=10) Sha3_384_MTU-8 4.179µ ± 0% 4.070µ ± 0% -2.60% (p=0.000 n=10) Sha3_256_MTU-8 3.316µ ± 0% 3.214µ ± 0% -3.08% (p=0.000 n=10) Sha3_224_MTU-8 3.175µ ± 0% 3.061µ ± 0% -3.61% (p=0.000 n=10) Shake128_MTU-8 2.779µ ± 0% 2.681µ ± 0% -3.51% (p=0.000 n=10) Shake256_MTU-8 2.947µ ± 0% 2.957µ ± 0% +0.32% (p=0.000 n=10) Shake256_16x-8 44.15µ ± 0% 44.45µ ± 0% +0.67% (p=0.000 n=10) Shake256_1MiB-8 2.319m ± 0% 2.274m ± 0% -1.93% (p=0.000 n=10) Sha3_512_1MiB-8 4.204m ± 0% 4.219m ± 0% +0.34% (p=0.000 n=10) geomean 13.75µ 13.54µ -1.55% │ v0.27.0-2-g42ee18b9637 │ v0.27.0-2-g42ee18b9637-dirty │ │ B/s │ B/s vs base │ PermutationFunction-8 704.3Mi ± 0% 705.4Mi ± 0% ~ (p=0.105 n=10) Sha3_512_MTU-8 223.5Mi ± 0% 227.6Mi ± 0% +1.83% (p=0.000 n=10) Sha3_384_MTU-8 308.1Mi ± 0% 316.4Mi ± 0% +2.67% (p=0.000 n=10) Sha3_256_MTU-8 388.2Mi ± 0% 400.5Mi ± 0% +3.17% (p=0.000 n=10) Sha3_224_MTU-8 405.5Mi ± 0% 420.7Mi ± 0% +3.73% (p=0.000 n=10) Shake128_MTU-8 463.4Mi ± 0% 480.2Mi ± 0% +3.64% (p=0.000 n=10) Shake256_MTU-8 436.9Mi ± 0% 435.5Mi ± 0% -0.32% (p=0.000 n=10) Shake256_16x-8 353.9Mi ± 0% 351.5Mi ± 0% -0.66% (p=0.000 n=10) Shake256_1MiB-8 431.2Mi ± 0% 439.7Mi ± 0% +1.97% (p=0.000 n=10) Sha3_512_1MiB-8 237.8Mi ± 0% 237.1Mi ± 0% -0.33% (p=0.000 n=10) geomean 375.7Mi 381.6Mi +1.57% Even stronger effect when patched on top of CL 616555 (forced on). go: go1.23.0 goos: darwin goarch: arm64 pkg: golang.org/x/crypto/sha3 cpu: Apple M2 │ old │ new │ │ sec/op │ sec/op vs base │ PermutationFunction-8 154.7n ± 2% 153.8n ± 1% ~ (p=0.469 n=10) Sha3_512_MTU-8 3.260µ ± 2% 3.143µ ± 2% -3.60% (p=0.000 n=10) Sha3_384_MTU-8 2.389µ ± 2% 2.244µ ± 2% -6.07% (p=0.000 n=10) Sha3_256_MTU-8 1.950µ ± 2% 1.758µ ± 1% -9.87% (p=0.000 n=10) Sha3_224_MTU-8 1.874µ ± 2% 1.686µ ± 1% -10.06% (p=0.000 n=10) Shake128_MTU-8 1.827µ ± 3% 1.447µ ± 1% -20.80% (p=0.000 n=10) Shake256_MTU-8 1.665µ ± 3% 1.604µ ± 3% -3.63% (p=0.003 n=10) Shake256_16x-8 25.14µ ± 1% 25.23µ ± 2% ~ (p=0.912 n=10) Shake256_1MiB-8 1.236m ± 2% 1.243m ± 2% ~ (p=0.631 n=10) Sha3_512_1MiB-8 2.296m ± 2% 2.305m ± 1% ~ (p=0.315 n=10) geomean 7.906µ 7.467µ -5.56% │ old │ new │ │ B/op │ B/op vs base │ PermutationFunction-8 1.204Gi ± 2% 1.212Gi ± 1% ~ (p=0.529 n=10) Sha3_512_MTU-8 394.9Mi ± 2% 409.7Mi ± 2% +3.73% (p=0.000 n=10) Sha3_384_MTU-8 539.0Mi ± 2% 573.8Mi ± 2% +6.45% (p=0.000 n=10) Sha3_256_MTU-8 660.3Mi ± 2% 732.6Mi ± 1% +10.95% (p=0.000 n=10) Sha3_224_MTU-8 687.1Mi ± 2% 763.9Mi ± 1% +11.17% (p=0.000 n=10) Shake128_MTU-8 704.7Mi ± 2% 889.6Mi ± 2% +26.24% (p=0.000 n=10) Shake256_MTU-8 773.4Mi ± 3% 802.5Mi ± 3% +3.76% (p=0.004 n=10) Shake256_16x-8 621.6Mi ± 1% 619.3Mi ± 2% ~ (p=0.912 n=10) Shake256_1MiB-8 809.1Mi ± 2% 804.7Mi ± 2% ~ (p=0.631 n=10) Sha3_512_1MiB-8 435.6Mi ± 2% 433.9Mi ± 1% ~ (p=0.315 n=10) geomean 653.6Mi 692.0Mi +5.88% Change-Id: I33a0a1ddf305c395f99bf17f81473e2f42c5ce42 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/616575 Reviewed-by: Daniel McCarney Reviewed-by: Michael Pratt Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Auto-Submit: Filippo Valsorda Reviewed-by: Andrew Ekstedt --- sha3/sha3.go | 113 +++++++++++++++++++++++---------------------------- sha3/xor.go | 40 ------------------ 2 files changed, 50 insertions(+), 103 deletions(-) delete mode 100644 sha3/xor.go diff --git a/sha3/sha3.go b/sha3/sha3.go index afedde5abf..bda574e9d6 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -4,6 +4,14 @@ package sha3 +import ( + "crypto/subtle" + "encoding/binary" + "unsafe" + + "golang.org/x/sys/cpu" +) + // spongeDirection indicates the direction bytes are flowing through the sponge. type spongeDirection int @@ -14,16 +22,13 @@ const ( spongeSqueezing ) -const ( - // maxRate is the maximum size of the internal buffer. SHAKE-256 - // currently needs the largest buffer. - maxRate = 168 -) - type state struct { - // Generic sponge components. - a [25]uint64 // main state of the hash - rate int // the number of bytes of state to use + a [1600 / 8]byte // main state of the hash + + // a[n:rate] is the buffer. If absorbing, it's the remaining space to XOR + // into before running the permutation. If squeezing, it's the remaining + // output to produce before running the permutation. + n, rate int // dsbyte contains the "domain separation" bits and the first bit of // the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the @@ -39,10 +44,6 @@ type state struct { // Extendable-Output Functions (May 2014)" dsbyte byte - i, n int // storage[i:n] is the buffer, i is only used while squeezing - storage [maxRate]byte - - // Specific to SHA-3 and SHAKE. outputLen int // the default output size in bytes state spongeDirection // whether the sponge is absorbing or squeezing } @@ -61,7 +62,7 @@ func (d *state) Reset() { d.a[i] = 0 } d.state = spongeAbsorbing - d.i, d.n = 0, 0 + d.n = 0 } func (d *state) clone() *state { @@ -69,22 +70,25 @@ func (d *state) clone() *state { return &ret } -// permute applies the KeccakF-1600 permutation. It handles -// any input-output buffering. +// permute applies the KeccakF-1600 permutation. func (d *state) permute() { - switch d.state { - case spongeAbsorbing: - // If we're absorbing, we need to xor the input into the state - // before applying the permutation. - xorIn(d, d.storage[:d.rate]) - d.n = 0 - keccakF1600(&d.a) - case spongeSqueezing: - // If we're squeezing, we need to apply the permutation before - // copying more output. - keccakF1600(&d.a) - d.i = 0 - copyOut(d, d.storage[:d.rate]) + var a *[25]uint64 + if cpu.IsBigEndian { + a = new([25]uint64) + for i := range a { + a[i] = binary.LittleEndian.Uint64(d.a[i*8:]) + } + } else { + a = (*[25]uint64)(unsafe.Pointer(&d.a)) + } + + keccakF1600(a) + d.n = 0 + + if cpu.IsBigEndian { + for i := range a { + binary.LittleEndian.PutUint64(d.a[i*8:], a[i]) + } } } @@ -92,53 +96,36 @@ func (d *state) permute() { // the multi-bitrate 10..1 padding rule, and permutes the state. func (d *state) padAndPermute() { // Pad with this instance's domain-separator bits. We know that there's - // at least one byte of space in d.buf because, if it were full, + // at least one byte of space in the sponge because, if it were full, // permute would have been called to empty it. dsbyte also contains the // first one bit for the padding. See the comment in the state struct. - d.storage[d.n] = d.dsbyte - d.n++ - for d.n < d.rate { - d.storage[d.n] = 0 - d.n++ - } + d.a[d.n] ^= d.dsbyte // This adds the final one bit for the padding. Because of the way that // bits are numbered from the LSB upwards, the final bit is the MSB of // the last byte. - d.storage[d.rate-1] ^= 0x80 + d.a[d.rate-1] ^= 0x80 // Apply the permutation d.permute() d.state = spongeSqueezing - d.n = d.rate - copyOut(d, d.storage[:d.rate]) } // Write absorbs more data into the hash's state. It panics if any // output has already been read. -func (d *state) Write(p []byte) (written int, err error) { +func (d *state) Write(p []byte) (n int, err error) { if d.state != spongeAbsorbing { panic("sha3: Write after Read") } - written = len(p) + + n = len(p) for len(p) > 0 { - if d.n == 0 && len(p) >= d.rate { - // The fast path; absorb a full "rate" bytes of input and apply the permutation. - xorIn(d, p[:d.rate]) - p = p[d.rate:] - keccakF1600(&d.a) - } else { - // The slow path; buffer the input until we can fill the sponge, and then xor it in. - todo := d.rate - d.n - if todo > len(p) { - todo = len(p) - } - d.n += copy(d.storage[d.n:], p[:todo]) - p = p[todo:] - - // If the sponge is full, apply the permutation. - if d.n == d.rate { - d.permute() - } + x := subtle.XORBytes(d.a[d.n:d.rate], d.a[d.n:d.rate], p) + d.n += x + p = p[x:] + + // If the sponge is full, apply the permutation. + if d.n == d.rate { + d.permute() } } @@ -156,12 +143,12 @@ func (d *state) Read(out []byte) (n int, err error) { // Now, do the squeezing. for len(out) > 0 { - n := copy(out, d.storage[d.i:d.n]) - d.i += n - out = out[n:] + x := copy(out, d.a[d.n:d.rate]) + d.n += x + out = out[x:] // Apply the permutation if we've squeezed the sponge dry. - if d.i == d.rate { + if d.n == d.rate { d.permute() } } diff --git a/sha3/xor.go b/sha3/xor.go deleted file mode 100644 index 6ada5c9574..0000000000 --- a/sha3/xor.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package sha3 - -import ( - "crypto/subtle" - "encoding/binary" - "unsafe" - - "golang.org/x/sys/cpu" -) - -// xorIn xors the bytes in buf into the state. -func xorIn(d *state, buf []byte) { - if cpu.IsBigEndian { - for i := 0; len(buf) >= 8; i++ { - a := binary.LittleEndian.Uint64(buf) - d.a[i] ^= a - buf = buf[8:] - } - } else { - ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) - subtle.XORBytes(ab[:], ab[:], buf) - } -} - -// copyOut copies uint64s to a byte buffer. -func copyOut(d *state, b []byte) { - if cpu.IsBigEndian { - for i := 0; len(b) >= 8; i++ { - binary.LittleEndian.PutUint64(b, d.a[i]) - b = b[8:] - } - } else { - ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) - copy(b, ab[:]) - } -} From 80ea76eb17c0c52f5d5d04e833d6aeb6b062d81d Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 30 Sep 2024 00:57:48 +0200 Subject: [PATCH 87/95] sha3: fix padding for long cSHAKE parameters We used to compute the incorrect value if len(initBlock) % rate == 0. Also, add a test vector for golang/go#66232, confirmed to fail on GOARCH=386 without CL 570876. Fixes golang/go#69169 Change-Id: I3f2400926fca111dd0ca1327d6b5975e51b28f96 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/616576 Reviewed-by: Andrew Ekstedt Reviewed-by: Daniel McCarney Reviewed-by: Michael Pratt LUCI-TryBot-Result: Go LUCI Auto-Submit: Filippo Valsorda Reviewed-by: Roland Shoemaker --- sha3/sha3_test.go | 111 ++++++++++++++++++++++++++++++++++++++++++++++ sha3/shake.go | 45 ++++++++++--------- 2 files changed, 134 insertions(+), 22 deletions(-) diff --git a/sha3/sha3_test.go b/sha3/sha3_test.go index 21e8cbad7b..8347b60d10 100644 --- a/sha3/sha3_test.go +++ b/sha3/sha3_test.go @@ -17,6 +17,7 @@ import ( "encoding/json" "fmt" "hash" + "io" "math/rand" "os" "strings" @@ -375,6 +376,116 @@ func TestClone(t *testing.T) { } } +func TestCSHAKEAccumulated(t *testing.T) { + // Generated with pycryptodome@3.20.0 + // + // from Crypto.Hash import cSHAKE128 + // rng = cSHAKE128.new() + // acc = cSHAKE128.new() + // for n in range(200): + // N = rng.read(n) + // for s in range(200): + // S = rng.read(s) + // c = cSHAKE128.cSHAKE_XOF(data=None, custom=S, capacity=256, function=N) + // c.update(rng.read(100)) + // acc.update(c.read(200)) + // c = cSHAKE128.cSHAKE_XOF(data=None, custom=S, capacity=256, function=N) + // c.update(rng.read(168)) + // acc.update(c.read(200)) + // c = cSHAKE128.cSHAKE_XOF(data=None, custom=S, capacity=256, function=N) + // c.update(rng.read(200)) + // acc.update(c.read(200)) + // print(acc.read(32).hex()) + // + // and with @noble/hashes@v1.5.0 + // + // import { bytesToHex } from "@noble/hashes/utils"; + // import { cshake128 } from "@noble/hashes/sha3-addons"; + // const rng = cshake128.create(); + // const acc = cshake128.create(); + // for (let n = 0; n < 200; n++) { + // const N = rng.xof(n); + // for (let s = 0; s < 200; s++) { + // const S = rng.xof(s); + // let c = cshake128.create({ NISTfn: N, personalization: S }); + // c.update(rng.xof(100)); + // acc.update(c.xof(200)); + // c = cshake128.create({ NISTfn: N, personalization: S }); + // c.update(rng.xof(168)); + // acc.update(c.xof(200)); + // c = cshake128.create({ NISTfn: N, personalization: S }); + // c.update(rng.xof(200)); + // acc.update(c.xof(200)); + // } + // } + // console.log(bytesToHex(acc.xof(32))); + // + t.Run("cSHAKE128", func(t *testing.T) { + testCSHAKEAccumulated(t, NewCShake128, rate128, + "bb14f8657c6ec5403d0b0e2ef3d3393497e9d3b1a9a9e8e6c81dbaa5fd809252") + }) + t.Run("cSHAKE256", func(t *testing.T) { + testCSHAKEAccumulated(t, NewCShake256, rate256, + "0baaf9250c6e25f0c14ea5c7f9bfde54c8a922c8276437db28f3895bdf6eeeef") + }) +} + +func testCSHAKEAccumulated(t *testing.T, newCShake func(N, S []byte) ShakeHash, rate int64, exp string) { + rnd := newCShake(nil, nil) + acc := newCShake(nil, nil) + for n := 0; n < 200; n++ { + N := make([]byte, n) + rnd.Read(N) + for s := 0; s < 200; s++ { + S := make([]byte, s) + rnd.Read(S) + + c := newCShake(N, S) + io.CopyN(c, rnd, 100 /* < rate */) + io.CopyN(acc, c, 200) + + c.Reset() + io.CopyN(c, rnd, rate) + io.CopyN(acc, c, 200) + + c.Reset() + io.CopyN(c, rnd, 200 /* > rate */) + io.CopyN(acc, c, 200) + } + } + if got := hex.EncodeToString(acc.Sum(nil)[:32]); got != exp { + t.Errorf("got %s, want %s", got, exp) + } +} + +func TestCSHAKELargeS(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + // See https://go.dev/issue/66232. + const s = (1<<32)/8 + 1000 // s * 8 > 2^32 + S := make([]byte, s) + rnd := NewShake128() + rnd.Read(S) + c := NewCShake128(nil, S) + io.CopyN(c, rnd, 1000) + + // Generated with pycryptodome@3.20.0 + // + // from Crypto.Hash import cSHAKE128 + // rng = cSHAKE128.new() + // S = rng.read(536871912) + // c = cSHAKE128.new(custom=S) + // c.update(rng.read(1000)) + // print(c.read(32).hex()) + // + exp := "2cb9f237767e98f2614b8779cf096a52da9b3a849280bbddec820771ae529cf0" + if got := hex.EncodeToString(c.Sum(nil)); got != exp { + t.Errorf("got %s, want %s", got, exp) + } +} + // BenchmarkPermutationFunction measures the speed of the permutation function // with no input data. func BenchmarkPermutationFunction(b *testing.B) { diff --git a/sha3/shake.go b/sha3/shake.go index a01ef43577..6d75811fd4 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -19,6 +19,7 @@ import ( "encoding/binary" "hash" "io" + "math/bits" ) // ShakeHash defines the interface to hash functions that support @@ -58,33 +59,33 @@ const ( rate256 = 136 ) -func bytepad(input []byte, w int) []byte { - // leftEncode always returns max 9 bytes - buf := make([]byte, 0, 9+len(input)+w) - buf = append(buf, leftEncode(uint64(w))...) - buf = append(buf, input...) - padlen := w - (len(buf) % w) - return append(buf, make([]byte, padlen)...) -} - -func leftEncode(value uint64) []byte { - var b [9]byte - binary.BigEndian.PutUint64(b[1:], value) - // Trim all but last leading zero bytes - i := byte(1) - for i < 8 && b[i] == 0 { - i++ +func bytepad(data []byte, rate int) []byte { + out := make([]byte, 0, 9+len(data)+rate-1) + out = append(out, leftEncode(uint64(rate))...) + out = append(out, data...) + if padlen := rate - len(out)%rate; padlen < rate { + out = append(out, make([]byte, padlen)...) } - // Prepend number of encoded bytes - b[i-1] = 9 - i - return b[i-1:] + return out +} + +func leftEncode(x uint64) []byte { + // Let n be the smallest positive integer for which 2^(8n) > x. + n := (bits.Len64(x) + 7) / 8 + if n == 0 { + n = 1 + } + // Return n || x with n as a byte and x an n bytes in big-endian order. + b := make([]byte, 9) + binary.BigEndian.PutUint64(b[1:], x) + b = b[9-n-1:] + b[0] = byte(n) + return b } func newCShake(N, S []byte, rate, outputLen int, dsbyte byte) ShakeHash { c := cshakeState{state: &state{rate: rate, outputLen: outputLen, dsbyte: dsbyte}} - - // leftEncode returns max 9 bytes - c.initBlock = make([]byte, 0, 9*2+len(N)+len(S)) + c.initBlock = make([]byte, 0, 9+len(N)+9+len(S)) // leftEncode returns max 9 bytes c.initBlock = append(c.initBlock, leftEncode(uint64(len(N))*8)...) c.initBlock = append(c.initBlock, N...) c.initBlock = append(c.initBlock, leftEncode(uint64(len(S))*8)...) From 36b172546bd03a74c79e109ec84c599b672ea9e4 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Wed, 2 Oct 2024 12:44:13 +0200 Subject: [PATCH 88/95] sha3: avoid trailing permutation If you read a multiple of the rate, and then stop, there is no point in running the final permutation. Change-Id: Ic95e70f78b6e139aca1d3e3c11e09d2bbcf54f6c Reviewed-on: https://go-review.googlesource.com/c/crypto/+/620555 Reviewed-by: Daniel McCarney Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Pratt Auto-Submit: Filippo Valsorda --- sha3/sha3.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sha3/sha3.go b/sha3/sha3.go index bda574e9d6..4f5caddb01 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -143,14 +143,14 @@ func (d *state) Read(out []byte) (n int, err error) { // Now, do the squeezing. for len(out) > 0 { - x := copy(out, d.a[d.n:d.rate]) - d.n += x - out = out[x:] - // Apply the permutation if we've squeezed the sponge dry. if d.n == d.rate { d.permute() } + + x := copy(out, d.a[d.n:d.rate]) + d.n += x + out = out[x:] } return From 750a45fe5e473d5afa193e9088f3d135e64eca26 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 30 Sep 2024 13:39:09 +0200 Subject: [PATCH 89/95] sha3: add MarshalBinary, AppendBinary, and UnmarshalBinary Fixes golang/go#24617 Change-Id: I1d9d529950aa8a5953435e8d3412cda44b075d55 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/616635 Reviewed-by: Roland Shoemaker Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Daniel McCarney Reviewed-by: Michael Pratt --- sha3/doc.go | 4 +++ sha3/hashes.go | 31 ++++++++++++++++---- sha3/sha3.go | 72 +++++++++++++++++++++++++++++++++++++++++++++++ sha3/sha3_test.go | 42 +++++++++++++++++++++++++-- sha3/shake.go | 42 +++++++++++++++++++-------- 5 files changed, 171 insertions(+), 20 deletions(-) diff --git a/sha3/doc.go b/sha3/doc.go index 7e02309070..bbf391fe6e 100644 --- a/sha3/doc.go +++ b/sha3/doc.go @@ -5,6 +5,10 @@ // Package sha3 implements the SHA-3 fixed-output-length hash functions and // the SHAKE variable-output-length hash functions defined by FIPS-202. // +// All types in this package also implement [encoding.BinaryMarshaler], +// [encoding.BinaryAppender] and [encoding.BinaryUnmarshaler] to marshal and +// unmarshal the internal state of the hash. +// // Both types of hash function use the "sponge" construction and the Keccak // permutation. For a detailed specification see http://keccak.noekeon.org/ // diff --git a/sha3/hashes.go b/sha3/hashes.go index c544b29e5f..31fffbe044 100644 --- a/sha3/hashes.go +++ b/sha3/hashes.go @@ -48,33 +48,52 @@ func init() { crypto.RegisterHash(crypto.SHA3_512, New512) } +const ( + dsbyteSHA3 = 0b00000110 + dsbyteKeccak = 0b00000001 + dsbyteShake = 0b00011111 + dsbyteCShake = 0b00000100 + + // rateK[c] is the rate in bytes for Keccak[c] where c is the capacity in + // bits. Given the sponge size is 1600 bits, the rate is 1600 - c bits. + rateK256 = (1600 - 256) / 8 + rateK448 = (1600 - 448) / 8 + rateK512 = (1600 - 512) / 8 + rateK768 = (1600 - 768) / 8 + rateK1024 = (1600 - 1024) / 8 +) + func new224Generic() *state { - return &state{rate: 144, outputLen: 28, dsbyte: 0x06} + return &state{rate: rateK448, outputLen: 28, dsbyte: dsbyteSHA3} } func new256Generic() *state { - return &state{rate: 136, outputLen: 32, dsbyte: 0x06} + return &state{rate: rateK512, outputLen: 32, dsbyte: dsbyteSHA3} } func new384Generic() *state { - return &state{rate: 104, outputLen: 48, dsbyte: 0x06} + return &state{rate: rateK768, outputLen: 48, dsbyte: dsbyteSHA3} } func new512Generic() *state { - return &state{rate: 72, outputLen: 64, dsbyte: 0x06} + return &state{rate: rateK1024, outputLen: 64, dsbyte: dsbyteSHA3} } // NewLegacyKeccak256 creates a new Keccak-256 hash. // // Only use this function if you require compatibility with an existing cryptosystem // that uses non-standard padding. All other users should use New256 instead. -func NewLegacyKeccak256() hash.Hash { return &state{rate: 136, outputLen: 32, dsbyte: 0x01} } +func NewLegacyKeccak256() hash.Hash { + return &state{rate: rateK512, outputLen: 32, dsbyte: dsbyteKeccak} +} // NewLegacyKeccak512 creates a new Keccak-512 hash. // // Only use this function if you require compatibility with an existing cryptosystem // that uses non-standard padding. All other users should use New512 instead. -func NewLegacyKeccak512() hash.Hash { return &state{rate: 72, outputLen: 64, dsbyte: 0x01} } +func NewLegacyKeccak512() hash.Hash { + return &state{rate: rateK1024, outputLen: 64, dsbyte: dsbyteKeccak} +} // Sum224 returns the SHA3-224 digest of the data. func Sum224(data []byte) (digest [28]byte) { diff --git a/sha3/sha3.go b/sha3/sha3.go index 4f5caddb01..6658c44479 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -7,6 +7,7 @@ package sha3 import ( "crypto/subtle" "encoding/binary" + "errors" "unsafe" "golang.org/x/sys/cpu" @@ -170,3 +171,74 @@ func (d *state) Sum(in []byte) []byte { dup.Read(hash) return append(in, hash...) } + +const ( + magicSHA3 = "sha\x08" + magicShake = "sha\x09" + magicCShake = "sha\x0a" + magicKeccak = "sha\x0b" + // magic || rate || main state || n || sponge direction + marshaledSize = len(magicSHA3) + 1 + 200 + 1 + 1 +) + +func (d *state) MarshalBinary() ([]byte, error) { + return d.AppendBinary(make([]byte, 0, marshaledSize)) +} + +func (d *state) AppendBinary(b []byte) ([]byte, error) { + switch d.dsbyte { + case dsbyteSHA3: + b = append(b, magicSHA3...) + case dsbyteShake: + b = append(b, magicShake...) + case dsbyteCShake: + b = append(b, magicCShake...) + case dsbyteKeccak: + b = append(b, magicKeccak...) + default: + panic("unknown dsbyte") + } + // rate is at most 168, and n is at most rate. + b = append(b, byte(d.rate)) + b = append(b, d.a[:]...) + b = append(b, byte(d.n), byte(d.state)) + return b, nil +} + +func (d *state) UnmarshalBinary(b []byte) error { + if len(b) != marshaledSize { + return errors.New("sha3: invalid hash state") + } + + magic := string(b[:len(magicSHA3)]) + b = b[len(magicSHA3):] + switch { + case magic == magicSHA3 && d.dsbyte == dsbyteSHA3: + case magic == magicShake && d.dsbyte == dsbyteShake: + case magic == magicCShake && d.dsbyte == dsbyteCShake: + case magic == magicKeccak && d.dsbyte == dsbyteKeccak: + default: + return errors.New("sha3: invalid hash state identifier") + } + + rate := int(b[0]) + b = b[1:] + if rate != d.rate { + return errors.New("sha3: invalid hash state function") + } + + copy(d.a[:], b) + b = b[len(d.a):] + + n, state := int(b[0]), spongeDirection(b[1]) + if n > d.rate { + return errors.New("sha3: invalid hash state") + } + d.n = n + if state != spongeAbsorbing && state != spongeSqueezing { + return errors.New("sha3: invalid hash state") + } + d.state = state + + return nil +} diff --git a/sha3/sha3_test.go b/sha3/sha3_test.go index 8347b60d10..d97a970b1a 100644 --- a/sha3/sha3_test.go +++ b/sha3/sha3_test.go @@ -13,6 +13,7 @@ package sha3 import ( "bytes" "compress/flate" + "encoding" "encoding/hex" "encoding/json" "fmt" @@ -421,11 +422,11 @@ func TestCSHAKEAccumulated(t *testing.T) { // console.log(bytesToHex(acc.xof(32))); // t.Run("cSHAKE128", func(t *testing.T) { - testCSHAKEAccumulated(t, NewCShake128, rate128, + testCSHAKEAccumulated(t, NewCShake128, rateK256, "bb14f8657c6ec5403d0b0e2ef3d3393497e9d3b1a9a9e8e6c81dbaa5fd809252") }) t.Run("cSHAKE256", func(t *testing.T) { - testCSHAKEAccumulated(t, NewCShake256, rate256, + testCSHAKEAccumulated(t, NewCShake256, rateK512, "0baaf9250c6e25f0c14ea5c7f9bfde54c8a922c8276437db28f3895bdf6eeeef") }) } @@ -486,6 +487,43 @@ func TestCSHAKELargeS(t *testing.T) { } } +func TestMarshalUnmarshal(t *testing.T) { + t.Run("SHA3-224", func(t *testing.T) { testMarshalUnmarshal(t, New224()) }) + t.Run("SHA3-256", func(t *testing.T) { testMarshalUnmarshal(t, New256()) }) + t.Run("SHA3-384", func(t *testing.T) { testMarshalUnmarshal(t, New384()) }) + t.Run("SHA3-512", func(t *testing.T) { testMarshalUnmarshal(t, New512()) }) + t.Run("SHAKE128", func(t *testing.T) { testMarshalUnmarshal(t, NewShake128()) }) + t.Run("SHAKE256", func(t *testing.T) { testMarshalUnmarshal(t, NewShake256()) }) + t.Run("cSHAKE128", func(t *testing.T) { testMarshalUnmarshal(t, NewCShake128([]byte("N"), []byte("S"))) }) + t.Run("cSHAKE256", func(t *testing.T) { testMarshalUnmarshal(t, NewCShake256([]byte("N"), []byte("S"))) }) + t.Run("Keccak-256", func(t *testing.T) { testMarshalUnmarshal(t, NewLegacyKeccak256()) }) + t.Run("Keccak-512", func(t *testing.T) { testMarshalUnmarshal(t, NewLegacyKeccak512()) }) +} + +// TODO(filippo): move this to crypto/internal/cryptotest. +func testMarshalUnmarshal(t *testing.T, h hash.Hash) { + buf := make([]byte, 200) + rand.Read(buf) + n := rand.Intn(200) + h.Write(buf) + want := h.Sum(nil) + h.Reset() + h.Write(buf[:n]) + b, err := h.(encoding.BinaryMarshaler).MarshalBinary() + if err != nil { + t.Errorf("MarshalBinary: %v", err) + } + h.Write(bytes.Repeat([]byte{0}, 200)) + if err := h.(encoding.BinaryUnmarshaler).UnmarshalBinary(b); err != nil { + t.Errorf("UnmarshalBinary: %v", err) + } + h.Write(buf[n:]) + got := h.Sum(nil) + if !bytes.Equal(got, want) { + t.Errorf("got %x, want %x", got, want) + } +} + // BenchmarkPermutationFunction measures the speed of the permutation function // with no input data. func BenchmarkPermutationFunction(b *testing.B) { diff --git a/sha3/shake.go b/sha3/shake.go index 6d75811fd4..a6b3a4281f 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -16,7 +16,9 @@ package sha3 // [2] https://doi.org/10.6028/NIST.SP.800-185 import ( + "bytes" "encoding/binary" + "errors" "hash" "io" "math/bits" @@ -51,14 +53,6 @@ type cshakeState struct { initBlock []byte } -// Consts for configuring initial SHA-3 state -const ( - dsbyteShake = 0x1f - dsbyteCShake = 0x04 - rate128 = 168 - rate256 = 136 -) - func bytepad(data []byte, rate int) []byte { out := make([]byte, 0, 9+len(data)+rate-1) out = append(out, leftEncode(uint64(rate))...) @@ -112,6 +106,30 @@ func (c *state) Clone() ShakeHash { return c.clone() } +func (c *cshakeState) MarshalBinary() ([]byte, error) { + return c.AppendBinary(make([]byte, 0, marshaledSize+len(c.initBlock))) +} + +func (c *cshakeState) AppendBinary(b []byte) ([]byte, error) { + b, err := c.state.AppendBinary(b) + if err != nil { + return nil, err + } + b = append(b, c.initBlock...) + return b, nil +} + +func (c *cshakeState) UnmarshalBinary(b []byte) error { + if len(b) <= marshaledSize { + return errors.New("sha3: invalid hash state") + } + if err := c.state.UnmarshalBinary(b[:marshaledSize]); err != nil { + return err + } + c.initBlock = bytes.Clone(b[marshaledSize:]) + return nil +} + // NewShake128 creates a new SHAKE128 variable-output-length ShakeHash. // Its generic security strength is 128 bits against all attacks if at // least 32 bytes of its output are used. @@ -127,11 +145,11 @@ func NewShake256() ShakeHash { } func newShake128Generic() *state { - return &state{rate: rate128, outputLen: 32, dsbyte: dsbyteShake} + return &state{rate: rateK256, outputLen: 32, dsbyte: dsbyteShake} } func newShake256Generic() *state { - return &state{rate: rate256, outputLen: 64, dsbyte: dsbyteShake} + return &state{rate: rateK512, outputLen: 64, dsbyte: dsbyteShake} } // NewCShake128 creates a new instance of cSHAKE128 variable-output-length ShakeHash, @@ -144,7 +162,7 @@ func NewCShake128(N, S []byte) ShakeHash { if len(N) == 0 && len(S) == 0 { return NewShake128() } - return newCShake(N, S, rate128, 32, dsbyteCShake) + return newCShake(N, S, rateK256, 32, dsbyteCShake) } // NewCShake256 creates a new instance of cSHAKE256 variable-output-length ShakeHash, @@ -157,7 +175,7 @@ func NewCShake256(N, S []byte) ShakeHash { if len(N) == 0 && len(S) == 0 { return NewShake256() } - return newCShake(N, S, rate256, 64, dsbyteCShake) + return newCShake(N, S, rateK512, 64, dsbyteCShake) } // ShakeSum128 writes an arbitrary-length digest of data into hash. From 71ed71b4faf97caafd1863fed003e9ac311f10ee Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 31 Oct 2024 15:31:36 -0700 Subject: [PATCH 90/95] README: don't recommend go get These days people will just import the packages and the go tool will do the right thing. We don't need to explain it. Add a pointer to the git repo, though. For golang/go#62645 Change-Id: I8b1e4a877bd83fe6891688a44d27a6c7902c8979 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/624155 LUCI-TryBot-Result: Go LUCI Commit-Queue: Ian Lance Taylor Reviewed-by: Ian Lance Taylor Auto-Submit: Ian Lance Taylor Reviewed-by: Roland Shoemaker --- README.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 92f73cdfbf..f69c623ff8 100644 --- a/README.md +++ b/README.md @@ -2,20 +2,17 @@ [![Go Reference](https://pkg.go.dev/badge/golang.org/x/crypto.svg)](https://pkg.go.dev/golang.org/x/crypto) -This repository holds supplementary Go cryptography libraries. - -## Download/Install - -The easiest way to install is to run `go get -u golang.org/x/crypto/...`. You -can also manually git clone the repository to `$GOPATH/src/golang.org/x/crypto`. +This repository holds supplementary Go cryptography packages. ## Report Issues / Send Patches This repository uses Gerrit for code changes. To learn how to submit changes to -this repository, see https://golang.org/doc/contribute.html. +this repository, see https://go.dev/doc/contribute. + +The git repository is https://go.googlesource.com/crypto. The main issue tracker for the crypto repository is located at -https://github.com/golang/go/issues. Prefix your issue with "x/crypto:" in the +https://go.dev/issues. Prefix your issue with "x/crypto:" in the subject line, so it is easy to find. Note that contributions to the cryptography package receive additional scrutiny From 6018723c74059e3b91c84268b212c2f6cdab1f64 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Thu, 7 Nov 2024 22:09:40 +0000 Subject: [PATCH 91/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: Ib4976eb0b062bcd71c208afc9ff53e8c3068fbf9 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/626377 Reviewed-by: David Chase Auto-Submit: Gopher Robot Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index ecf61b3e18..d4bb2e13f3 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.26.0 - golang.org/x/term v0.25.0 + golang.org/x/sys v0.27.0 + golang.org/x/term v0.26.0 ) -require golang.org/x/text v0.19.0 // indirect +require golang.org/x/text v0.20.0 // indirect diff --git a/go.sum b/go.sum index 5b12ab8eee..b626a3d76d 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= -golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= -golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= -golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= -golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= +golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= +golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= +golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= From 8c4e668694ccbaa1be4785da7e7a40f2ef93152b Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 25 Nov 2024 16:01:21 +0000 Subject: [PATCH 92/95] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: Ic5267bf9d66b676e1cfc5fc2ae153afb8f33b29c Reviewed-on: https://go-review.googlesource.com/c/crypto/+/631635 Auto-Submit: Gopher Robot Reviewed-by: Dmitri Shuralyov Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI --- x509roots/fallback/bundle.go | 290 ----------------------------------- 1 file changed, 290 deletions(-) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index 80ce10fe3c..e025bd5f54 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -161,98 +161,6 @@ ZCzJJ7VLkn5l/9Mt4blOvH+kQSGQQXemOR/qnuOf0GZvBeyqdn6/axag67XH/JJU LysRJyU3eExRarDzzFhdFPFqSBX/wge2sY0PjlxQRrM9vwGYT7JZVEc+NHt4bVaT LnPqZih4zR0Uv6CPLy64Lo7yFIrM6bV8+2ydDKXhlg== -----END CERTIFICATE----- -# CN=AffirmTrust Commercial,O=AffirmTrust,C=US -# 0376ab1d54c5f9803ce4b2e201a0ee7eef7b57b636e8a93c9b8d4860c96f5fa7 ------BEGIN CERTIFICATE----- -MIIDTDCCAjSgAwIBAgIId3cGJyapsXwwDQYJKoZIhvcNAQELBQAwRDELMAkGA1UE -BhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MR8wHQYDVQQDDBZBZmZpcm1UcnVz -dCBDb21tZXJjaWFsMB4XDTEwMDEyOTE0MDYwNloXDTMwMTIzMTE0MDYwNlowRDEL -MAkGA1UEBhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MR8wHQYDVQQDDBZBZmZp -cm1UcnVzdCBDb21tZXJjaWFsMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC -AQEA9htPZwcroRX1BiLLHwGy43NFBkRJLLtJJRTWzsO3qyxPxkEylFf6EqdbDuKP -Hx6GGaeqtS25Xw2Kwq+FNXkyLbscYjfysVtKPcrNcV/pQr6U6Mje+SJIZMblq8Yr -ba0F8PrVC8+a5fBQpIs7R6UjW3p6+DM/uO+Zl+MgwdYoic+U+7lF7eNAFxHUdPAL -MeIrJmqbTFeurCA+ukV6BfO9m2kVrn1OIGPENXY6BwLJN/3HR+7o8XYdcxXyl6S1 -yHp52UKqK39c/s4mT6NmgTWvRLpUHhwwMmWd5jyTXlBOeuM61G7MGvv50jeuJCqr -VwMiKA1JdX+3KNp1v47j3A55MQIDAQABo0IwQDAdBgNVHQ4EFgQUnZPGU4teyq8/ -nx4P5ZmVvCT2lI8wDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwDQYJ -KoZIhvcNAQELBQADggEBAFis9AQOzcAN/wr91LoWXym9e2iZWEnStB03TX8nfUYG -XUPGhi4+c7ImfU+TqbbEKpqrIZcUsd6M06uJFdhrJNTxFq7YpFzUf1GO7RgBsZNj -vbz4YYCanrHOQnDiqX0GJX0nof5v7LMeJNrjS1UaADs1tDvZ110w/YETifLCBivt -Z8SOyUOyXGsViQK8YvxO8rUzqrJv0wqiUOP2O+guRMLbZjipM1ZI8W0bM40NjD9g -N53Tym1+NH4Nn3J2ixufcv1SNUFFApYvHLKac0khsUlHRUe072o0EclNmsxZt9YC -nlpOZbWUrhvfKbAW8b8Angc6F2S1BLUjIZkKlTuXfO8= ------END CERTIFICATE----- -# CN=AffirmTrust Networking,O=AffirmTrust,C=US -# 0a81ec5a929777f145904af38d5d509f66b5e2c58fcdb531058b0e17f3f0b41b ------BEGIN CERTIFICATE----- -MIIDTDCCAjSgAwIBAgIIfE8EORzUmS0wDQYJKoZIhvcNAQEFBQAwRDELMAkGA1UE -BhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MR8wHQYDVQQDDBZBZmZpcm1UcnVz -dCBOZXR3b3JraW5nMB4XDTEwMDEyOTE0MDgyNFoXDTMwMTIzMTE0MDgyNFowRDEL -MAkGA1UEBhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MR8wHQYDVQQDDBZBZmZp -cm1UcnVzdCBOZXR3b3JraW5nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC -AQEAtITMMxcua5Rsa2FSoOujz3mUTOWUgJnLVWREZY9nZOIG41w3SfYvm4SEHi3y -YJ0wTsyEheIszx6e/jarM3c1RNg1lho9Nuh6DtjVR6FqaYvZ/Ls6rnla1fTWcbua -kCNrmreIdIcMHl+5ni36q1Mr3Lt2PpNMCAiMHqIjHNRqrSK6mQEubWXLviRmVSRL -QESxG9fhwoXA3hA/Pe24/PHxI1Pcv2WXb9n5QHGNfb2V1M6+oF4nI979ptAmDgAp -6zxG8D1gvz9Q0twmQVGeFDdCBKNwV6gbh+0t+nvujArjqWaJGctB+d1ENmHP4ndG -yH329JKBNv3bNPFyfvMMFr20FQIDAQABo0IwQDAdBgNVHQ4EFgQUBx/S55zawm6i -QLSwelAQUHTEyL0wDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwDQYJ -KoZIhvcNAQEFBQADggEBAIlXshZ6qML91tmbmzTCnLQyFE2npN/svqe++EPbkTfO -tDIuUFUaNU52Q3Eg75N3ThVwLofDwR1t3Mu1J9QsVtFSUzpE0nPIxBsFZVpikpzu -QY0x2+c06lkh1QF612S4ZDnNye2v7UsDSKegmQGA3GWjNq5lWUhPgkvIZfFXHeVZ -Lgo/bNjR9eUJtGxUAArgFU2HdW23WJZa3W3SAKD0m0i+wzekujbgfIeFlxoVot4u -olu9rxj5kFDNcFn4J2dHy8egBzp90SxdbBk6ZrV9/ZFvgrG+CJPbFEfxojfHRZ48 -x3evZKiT3/Zpg4Jg8klCNO1aAFSFHBY2kgxc+qatv9s= ------END CERTIFICATE----- -# CN=AffirmTrust Premium ECC,O=AffirmTrust,C=US -# bd71fdf6da97e4cf62d1647add2581b07d79adf8397eb4ecba9c5e8488821423 ------BEGIN CERTIFICATE----- -MIIB/jCCAYWgAwIBAgIIdJclisc/elQwCgYIKoZIzj0EAwMwRTELMAkGA1UEBhMC -VVMxFDASBgNVBAoMC0FmZmlybVRydXN0MSAwHgYDVQQDDBdBZmZpcm1UcnVzdCBQ -cmVtaXVtIEVDQzAeFw0xMDAxMjkxNDIwMjRaFw00MDEyMzExNDIwMjRaMEUxCzAJ -BgNVBAYTAlVTMRQwEgYDVQQKDAtBZmZpcm1UcnVzdDEgMB4GA1UEAwwXQWZmaXJt -VHJ1c3QgUHJlbWl1bSBFQ0MwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAAQNMF4bFZ0D -0KF5Nbc6PJJ6yhUczWLznCZcBz3lVPqj1swS6vQUX+iOGasvLkjmrBhDeKzQN8O9 -ss0s5kfiGuZjuD0uL3jET9v0D6RoTFVya5UdThhClXjMNzyR4ptlKymjQjBAMB0G -A1UdDgQWBBSaryl6wBE1NSZRMADDav5A1a7WPDAPBgNVHRMBAf8EBTADAQH/MA4G -A1UdDwEB/wQEAwIBBjAKBggqhkjOPQQDAwNnADBkAjAXCfOHiFBar8jAQr9HX/Vs -aobgxCd05DhT1wV/GzTjxi+zygk8N53X57hG8f2h4nECMEJZh0PUUd+60wkyWs6I -flc9nF9Ca/UHLbXwgpP5WW+uZPpY5Yse42O+tYHNbwKMeQ== ------END CERTIFICATE----- -# CN=AffirmTrust Premium,O=AffirmTrust,C=US -# 70a73f7f376b60074248904534b11482d5bf0e698ecc498df52577ebf2e93b9a ------BEGIN CERTIFICATE----- -MIIFRjCCAy6gAwIBAgIIbYwURrGmCu4wDQYJKoZIhvcNAQEMBQAwQTELMAkGA1UE -BhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MRwwGgYDVQQDDBNBZmZpcm1UcnVz -dCBQcmVtaXVtMB4XDTEwMDEyOTE0MTAzNloXDTQwMTIzMTE0MTAzNlowQTELMAkG -A1UEBhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MRwwGgYDVQQDDBNBZmZpcm1U -cnVzdCBQcmVtaXVtMIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIICCgKCAgEAxBLf -qV/+Qd3d9Z+K4/as4Tx4mrzY8H96oDMq3I0gW64tb+eT2TZwamjPjlGjhVtnBKAQ -JG9dKILBl1fYSCkTtuG+kU3fhQxTGJoeJKJPj/CihQvL9Cl/0qRY7iZNyaqoe5rZ -+jjeRFcV5fiMyNlI4g0WJx0eyIOFJbe6qlVBzAMiSy2RjYvmia9mx+n/K+k8rNrS -s8PhaJyJ+HoAVt70VZVs+7pk3WKL3wt3MutizCaam7uqYoNMtAZ6MMgpv+0GTZe5 -HMQxK9VfvFMSF5yZVylmd2EhMQcuJUmdGPLu8ytxjLW6OQdJd/zvLpKQBY0tL3d7 -70O/Nbua2Plzpyzy0FfuKE4mX4+QaAkvuPjcBukumj5Rp9EixAqnOEhss/n/fauG -V+O61oV4d7pD6kh/9ti+I20ev9E2bFhc8e6kGVQa9QPSdubhjL08s9NIS+LI+H+S -qHZGnEJlPqQewQcDWkYtuJfzt9WyVSHvutxMAJf7FJUnM7/oQ0dG0giZFmA7mn7S -5u046uwBHjxIVkkJx0w3AJ6IDsBz4W9m6XJHMD4Q5QsDyZpCAGzFlH5hxIrff4Ia -C1nEWTJ3s7xgaVY5/bQGeyzWZDbZvUjthB9+pSKPKrhC9IK31FOQeE4tGv2Bb0TX -OwF0lkLgAOIua+rF7nKsu7/+6qqo+Nz2snmKtmcCAwEAAaNCMEAwHQYDVR0OBBYE -FJ3AZ6YMItkm9UWrpmVSESfYRaxjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/ -BAQDAgEGMA0GCSqGSIb3DQEBDAUAA4ICAQCzV00QYk465KzquByvMiPIs0laUZx2 -KI15qldGF9X1Uva3ROgIRL8YhNILgM3FEv0AVQVhh0HctSSePMTYyPtwni94loMg -Nt58D2kTiKV1NpgIpsbfrM7jWNa3Pt668+s0QNiigfV4Py/VpfzZotReBA4Xrf5B -8OWycvpEgjNC6C1Y91aMYj+6QrCcDFx+LmUmXFNPALJ4fqENmS2NuB2OosSw/WDQ -MKSOyARiqcTtNd56l+0OOF6SL5Nwpamcb6d9Ex1+xghIsV5n61EIJenmJWtSKZGc -0jlzCFfemQa0W50QBuHCAKi4HEoCChTQwUHK+4w1IX2COPKpVJEZNZOUbWo6xbLQ -u4mGk+ibyQ86p3q4ofB4Rvr8Ny/lioTz3/4E2aFooC8k4gmVBtWVyuEklut89pMF -u+1z6S3RdTnX5yTb2E5fQ4+e0BQ5v1VwSJlXMbSc7kqYA5YwH2AG7hsj/oFgIxpH -YoWlzBk0gG+zrBrjn/B7SK3VAdlntqlyk+otZrWyuOQ9PLLvTIzq6we/qzWaVYa8 -GKa1qF60g2xraUDTn9zxw2lrueFtCfTxqlB2Cnp9ehehVZZCmTEJ3WARjQUwfuaO -RtGdFNrHF+QFlozEJLUbzxQHskD4o55BhrwE0GuWyCqANP2/7waj3VjFhT0+j/6e -KeC2uAloGRwYQw== ------END CERTIFICATE----- # CN=Amazon Root CA 1,O=Amazon,C=US # 8ecde6884f3d87b1125ba31ac3fcb13d7016de7f57cc904fe1cb97c6ae98196e -----BEGIN CERTIFICATE----- @@ -1385,147 +1293,6 @@ r/OSmbaz5mEP0oUA51Aa5BuVnRmhuZyxm7EAHu/QD09CbMkKvO5D+jpxpchNJqU1 /YldvIViHTLSoCtU7ZpXwdv6EM8Zt4tKG48BtieVU+i2iW1bvGjUI+iLUaJW+fCm gKDWHrO8Dw9TdSmq6hN35N6MgSGtBxBHEa2HPQfRdbzP82Z+ -----END CERTIFICATE----- -# CN=Entrust Root Certification Authority - EC1,OU=See www.entrust.net/legal-terms+OU=(c) 2012 Entrust\, Inc. - for authorized use only,O=Entrust\, Inc.,C=US -# 02ed0eb28c14da45165c566791700d6451d7fb56f0b2ab1d3b8eb070e56edff5 ------BEGIN CERTIFICATE----- -MIIC+TCCAoCgAwIBAgINAKaLeSkAAAAAUNCR+TAKBggqhkjOPQQDAzCBvzELMAkG -A1UEBhMCVVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3 -d3cuZW50cnVzdC5uZXQvbGVnYWwtdGVybXMxOTA3BgNVBAsTMChjKSAyMDEyIEVu -dHJ1c3QsIEluYy4gLSBmb3IgYXV0aG9yaXplZCB1c2Ugb25seTEzMDEGA1UEAxMq -RW50cnVzdCBSb290IENlcnRpZmljYXRpb24gQXV0aG9yaXR5IC0gRUMxMB4XDTEy -MTIxODE1MjUzNloXDTM3MTIxODE1NTUzNlowgb8xCzAJBgNVBAYTAlVTMRYwFAYD -VQQKEw1FbnRydXN0LCBJbmMuMSgwJgYDVQQLEx9TZWUgd3d3LmVudHJ1c3QubmV0 -L2xlZ2FsLXRlcm1zMTkwNwYDVQQLEzAoYykgMjAxMiBFbnRydXN0LCBJbmMuIC0g -Zm9yIGF1dGhvcml6ZWQgdXNlIG9ubHkxMzAxBgNVBAMTKkVudHJ1c3QgUm9vdCBD -ZXJ0aWZpY2F0aW9uIEF1dGhvcml0eSAtIEVDMTB2MBAGByqGSM49AgEGBSuBBAAi -A2IABIQTydC6bUF74mzQ61VfZgIaJPRbiWlH47jCffHyAsWfoPZb1YsGGYZPUxBt -ByQnoaD41UcZYUx9ypMn6nQM72+WCf5j7HBdNq1nd67JnXxVRDqiY1Ef9eNi1KlH -Bz7MIKNCMEAwDgYDVR0PAQH/BAQDAgEGMA8GA1UdEwEB/wQFMAMBAf8wHQYDVR0O -BBYEFLdj5xrdjekIplWDpOBqUEFlEUJJMAoGCCqGSM49BAMDA2cAMGQCMGF52OVC -R98crlOZF7ZvHH3hvxGU0QOIdeSNiaSKd0bebWHvAvX7td/M/k7//qnmpwIwW5nX -hTcGtXsI/esni0qU+eH6p44mCOh8kmhtc9hvJqwhAriZtyZBWyVgrtBIGu4G ------END CERTIFICATE----- -# CN=Entrust Root Certification Authority - G2,OU=See www.entrust.net/legal-terms+OU=(c) 2009 Entrust\, Inc. - for authorized use only,O=Entrust\, Inc.,C=US -# 43df5774b03e7fef5fe40d931a7bedf1bb2e6b42738c4e6d3841103d3aa7f339 ------BEGIN CERTIFICATE----- -MIIEPjCCAyagAwIBAgIESlOMKDANBgkqhkiG9w0BAQsFADCBvjELMAkGA1UEBhMC -VVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50 -cnVzdC5uZXQvbGVnYWwtdGVybXMxOTA3BgNVBAsTMChjKSAyMDA5IEVudHJ1c3Qs -IEluYy4gLSBmb3IgYXV0aG9yaXplZCB1c2Ugb25seTEyMDAGA1UEAxMpRW50cnVz -dCBSb290IENlcnRpZmljYXRpb24gQXV0aG9yaXR5IC0gRzIwHhcNMDkwNzA3MTcy -NTU0WhcNMzAxMjA3MTc1NTU0WjCBvjELMAkGA1UEBhMCVVMxFjAUBgNVBAoTDUVu -dHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50cnVzdC5uZXQvbGVnYWwt -dGVybXMxOTA3BgNVBAsTMChjKSAyMDA5IEVudHJ1c3QsIEluYy4gLSBmb3IgYXV0 -aG9yaXplZCB1c2Ugb25seTEyMDAGA1UEAxMpRW50cnVzdCBSb290IENlcnRpZmlj -YXRpb24gQXV0aG9yaXR5IC0gRzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK -AoIBAQC6hLZy254Ma+KZ6TABp3bqMriVQRrJ2mFOWHLP/vaCeb9zYQYKpSfYs1/T -RU4cctZOMvJyig/3gxnQaoCAAEUesMfnmr8SVycco2gvCoe9amsOXmXzHHfV1IWN -cCG0szLni6LVhjkCsbjSR87kyUnEO6fe+1R9V77w6G7CebI6C1XiUJgWMhNcL3hW -wcKUs/Ja5CeanyTXxuzQmyWC48zCxEXFjJd6BmsqEZ+pCm5IO2/b1BEZQvePB7/1 -U1+cPvQXLOZprE4yTGJ36rfo5bs0vBmLrpxR57d+tVOxMyLlbc9wPBr64ptntoP0 -jaWvYkxN4FisZDQSA/i2jZRjJKRxAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAP -BgNVHRMBAf8EBTADAQH/MB0GA1UdDgQWBBRqciZ60B7vfec7aVHUbI2fkBJmqzAN -BgkqhkiG9w0BAQsFAAOCAQEAeZ8dlsa2eT8ijYfThwMEYGprmi5ZiXMRrEPR9RP/ -jTkrwPK9T3CMqS/qF8QLVJ7UG5aYMzyorWKiAHarWWluBh1+xLlEjZivEtRh2woZ -Rkfz6/djwUAFQKXSt/S1mja/qYh2iARVBCuch38aNzx+LaUa2NSJXsq9rD1s2G2v -1fN2D807iDginWyTmsQ9v4IbZT+mD12q/OWyFcq1rca8PdCE6OoGcrBNOTJ4vz4R -nAuknZoh8/CbCzB428Hch0P+vGOaysXCHMnHjf87ElgI5rY97HosTvuDls4MPGmH -VHOkc8KT/1EQrBVUAdj8BbGJoX90g5pJ19xOe4pIb4tF9g== ------END CERTIFICATE----- -# CN=Entrust Root Certification Authority - G4,OU=See www.entrust.net/legal-terms+OU=(c) 2015 Entrust\, Inc. - for authorized use only,O=Entrust\, Inc.,C=US -# db3517d1f6732a2d5ab97c533ec70779ee3270a62fb4ac4238372460e6f01e88 ------BEGIN CERTIFICATE----- -MIIGSzCCBDOgAwIBAgIRANm1Q3+vqTkPAAAAAFVlrVgwDQYJKoZIhvcNAQELBQAw -gb4xCzAJBgNVBAYTAlVTMRYwFAYDVQQKEw1FbnRydXN0LCBJbmMuMSgwJgYDVQQL -Ex9TZWUgd3d3LmVudHJ1c3QubmV0L2xlZ2FsLXRlcm1zMTkwNwYDVQQLEzAoYykg -MjAxNSBFbnRydXN0LCBJbmMuIC0gZm9yIGF1dGhvcml6ZWQgdXNlIG9ubHkxMjAw -BgNVBAMTKUVudHJ1c3QgUm9vdCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eSAtIEc0 -MB4XDTE1MDUyNzExMTExNloXDTM3MTIyNzExNDExNlowgb4xCzAJBgNVBAYTAlVT -MRYwFAYDVQQKEw1FbnRydXN0LCBJbmMuMSgwJgYDVQQLEx9TZWUgd3d3LmVudHJ1 -c3QubmV0L2xlZ2FsLXRlcm1zMTkwNwYDVQQLEzAoYykgMjAxNSBFbnRydXN0LCBJ -bmMuIC0gZm9yIGF1dGhvcml6ZWQgdXNlIG9ubHkxMjAwBgNVBAMTKUVudHJ1c3Qg -Um9vdCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eSAtIEc0MIICIjANBgkqhkiG9w0B -AQEFAAOCAg8AMIICCgKCAgEAsewsQu7i0TD/pZJH4i3DumSXbcr3DbVZwbPLqGgZ -2K+EbTBwXX7zLtJTmeH+H17ZSK9dE43b/2MzTdMAArzE+NEGCJR5WIoV3imz/f3E -T+iq4qA7ec2/a0My3dl0ELn39GjUu9CH1apLiipvKgS1sqbHoHrmSKvS0VnM1n4j -5pds8ELl3FFLFUHtSUrJ3hCX1nbB76W1NhSXNdh4IjVS70O92yfbYVaCNNzLiGAM -C1rlLAHGVK/XqsEQe9IFWrhAnoanw5CGAlZSCXqc0ieCU0plUmr1POeo8pyvi73T -DtTUXm6Hnmo9RR3RXRv06QqsYJn7ibT/mCzPfB3pAqoEmh643IhuJbNsZvc8kPNX -wbMv9W3y+8qh+CmdRouzavbmZwe+LGcKKh9asj5XxNMhIWNlUpEbsZmOeX7m640A -2Vqq6nPopIICR5b+W45UYaPrL0swsIsjdXJ8ITzI9vF01Bx7owVV7rtNOzK+mndm -nqxpkCIHH2E6lr7lmk/MBTwoWdPBDFSoWWG9yHJM6Nyfh3+9nEg2XpWjDrk4JFX8 -dWbrAuMINClKxuMrLzOg2qOGpRKX/YAr2hRC45K9PvJdXmd0LhyIRyk0X+IyqJwl -N4y6mACXi0mWHv0liqzc2thddG5msP9E36EYxr5ILzeUePiVSj9/E15dWf10hkNj -c0kCAwEAAaNCMEAwDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwHQYD -VR0OBBYEFJ84xFYjwznooHFs6FRM5Og6sb9nMA0GCSqGSIb3DQEBCwUAA4ICAQAS -5UKme4sPDORGpbZgQIeMJX6tuGguW8ZAdjwD+MlZ9POrYs4QjbRaZIxowLByQzTS -Gwv2LFPSypBLhmb8qoMi9IsabyZIrHZ3CL/FmFz0Jomee8O5ZDIBf9PD3Vht7LGr -hFV0d4QEJ1JrhkzO3bll/9bGXp+aEJlLdWr+aumXIOTkdnrG0CSqkM0gkLpHZPt/ -B7NTeLUKYvJzQ85BK4FqLoUWlFPUa19yIqtRLULVAJyZv967lDtX/Zr1hstWO1uI -AeV8KEsD+UmDfLJ/fOPtjqF/YFOOVZ1QNBIPt5d7bIdKROf1beyAN/BYGW5KaHbw -H5Lk6rWS02FREAutp9lfx1/cH6NcjKF+m7ee01ZvZl4HliDtC3T7Zk6LERXpgUl+ -b7DUUH8i119lAg2m9IUe2K4GS0qn0jFmwvjO5QimpAKWRGhXxNUzzxkvFMSUHHuk -2fCfDrGA4tGeEWSpiBE6doLlYsKA2KSD7ZPvfC+QsDJMlhVoSFLUmQjAJOgc47Ol -IQ6SwJAfzyBfyjs4x7dtOvPmRLgOMWuIjnDrnBdSqEGULoe256YSxXXfW8AKbnuk -5F6G+TaU33fD6Q3AOfF5u0aOq0NZJ7cguyPpVkAh7DE9ZapD8j3fcEThuk0mEDuY -n/PIjhs4ViFqUZPTkcpG2om3PVODLAgfi49T3f+sHw== ------END CERTIFICATE----- -# CN=Entrust Root Certification Authority,OU=www.entrust.net/CPS is incorporated by reference+OU=(c) 2006 Entrust\, Inc.,O=Entrust\, Inc.,C=US -# 73c176434f1bc6d5adf45b0e76e727287c8de57616c1e6e6141a2b2cbc7d8e4c ------BEGIN CERTIFICATE----- -MIIEkTCCA3mgAwIBAgIERWtQVDANBgkqhkiG9w0BAQUFADCBsDELMAkGA1UEBhMC -VVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xOTA3BgNVBAsTMHd3dy5lbnRydXN0 -Lm5ldC9DUFMgaXMgaW5jb3Jwb3JhdGVkIGJ5IHJlZmVyZW5jZTEfMB0GA1UECxMW -KGMpIDIwMDYgRW50cnVzdCwgSW5jLjEtMCsGA1UEAxMkRW50cnVzdCBSb290IENl -cnRpZmljYXRpb24gQXV0aG9yaXR5MB4XDTA2MTEyNzIwMjM0MloXDTI2MTEyNzIw -NTM0MlowgbAxCzAJBgNVBAYTAlVTMRYwFAYDVQQKEw1FbnRydXN0LCBJbmMuMTkw -NwYDVQQLEzB3d3cuZW50cnVzdC5uZXQvQ1BTIGlzIGluY29ycG9yYXRlZCBieSBy -ZWZlcmVuY2UxHzAdBgNVBAsTFihjKSAyMDA2IEVudHJ1c3QsIEluYy4xLTArBgNV -BAMTJEVudHJ1c3QgUm9vdCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTCCASIwDQYJ -KoZIhvcNAQEBBQADggEPADCCAQoCggEBALaVtkNC+sZtKm9I35RMOVcF7sN5EUFo -Nu3s/poBj6E4KPz3EEZmLk0eGrEaTsbRwJWIsMn/MYszA9u3g3s+IIRe7bJWKKf4 -4LlAcTfFy0cOlypowCKVYhXbR9n10Cv/gkvJrT7eTNuQgFA/CYqEAOwwCj0Yzfv9 -KlmaI5UXLEWeH25DeW0MXJj+SKfFI0dcXv1u5x609mhF0YaDW6KKjbHjKYD+JXGI -rb68j6xSlkuqUY3kEzEZ6E5Nn9uss2rVvDlUccp6en+Q3X0dgNmBu1kmwhH+5pPi -94DkZfs0Nw4pgHBNrziGLp5/V6+eF67rHMsoIV+2HNjnogQi+dPa2MsCAwEAAaOB -sDCBrTAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUwAwEB/zArBgNVHRAEJDAi -gA8yMDA2MTEyNzIwMjM0MlqBDzIwMjYxMTI3MjA1MzQyWjAfBgNVHSMEGDAWgBRo -kORnpKZTgMeGZqTx90tD+4S9bTAdBgNVHQ4EFgQUaJDkZ6SmU4DHhmak8fdLQ/uE -vW0wHQYJKoZIhvZ9B0EABBAwDhsIVjcuMTo0LjADAgSQMA0GCSqGSIb3DQEBBQUA -A4IBAQCT1DCw1wMgKtD5Y+iRDAUgqV8ZyntyTtSx29CW+1RaGSwMCPeyvIWonX9t -O1KzKtvn1ISMY/YPyyYBkVBs9F8U4pN0wBOeMDpQ47RgxRzwIkSNcUesyBrJ6Zua -AGAT/3B+XxFNSRuzFVJ7yVTav52Vr2ua2J7p8eRDjeIRRDq/r72DQnNSi6q7pynP -9WQcCk3RvKqsnyrQ/39/2n3qse0wJcGE2jTSW3iDVuycNsMm4hH2Z0kdkquM++v/ -eu6FSqdQgPCnXEqULl8FmTxSQeDNtGPPAUO6nIPcj2A781q0tHuu2guQOHXvgR1m -0vdXcDazv/wor3ElhVsT/h5/WrQ8 ------END CERTIFICATE----- -# CN=Entrust.net Certification Authority (2048),OU=www.entrust.net/CPS_2048 incorp. by ref. (limits liab.)+OU=(c) 1999 Entrust.net Limited,O=Entrust.net -# 6dc47172e01cbcb0bf62580d895fe2b8ac9ad4f873801e0c10b9c837d21eb177 ------BEGIN CERTIFICATE----- -MIIEKjCCAxKgAwIBAgIEOGPe+DANBgkqhkiG9w0BAQUFADCBtDEUMBIGA1UEChML -RW50cnVzdC5uZXQxQDA+BgNVBAsUN3d3dy5lbnRydXN0Lm5ldC9DUFNfMjA0OCBp -bmNvcnAuIGJ5IHJlZi4gKGxpbWl0cyBsaWFiLikxJTAjBgNVBAsTHChjKSAxOTk5 -IEVudHJ1c3QubmV0IExpbWl0ZWQxMzAxBgNVBAMTKkVudHJ1c3QubmV0IENlcnRp -ZmljYXRpb24gQXV0aG9yaXR5ICgyMDQ4KTAeFw05OTEyMjQxNzUwNTFaFw0yOTA3 -MjQxNDE1MTJaMIG0MRQwEgYDVQQKEwtFbnRydXN0Lm5ldDFAMD4GA1UECxQ3d3d3 -LmVudHJ1c3QubmV0L0NQU18yMDQ4IGluY29ycC4gYnkgcmVmLiAobGltaXRzIGxp -YWIuKTElMCMGA1UECxMcKGMpIDE5OTkgRW50cnVzdC5uZXQgTGltaXRlZDEzMDEG -A1UEAxMqRW50cnVzdC5uZXQgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkgKDIwNDgp -MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArU1LqRKGsuqjIAcVFmQq -K0vRvwtKTY7tgHalZ7d4QMBzQshowNtTK91euHaYNZOLGp18EzoOH1u3Hs/lJBQe -sYGpjX24zGtLA/ECDNyrpUAkAH90lKGdCCmziAv1h3edVc3kw37XamSrhRSGlVuX -MlBvPci6Zgzj/L24ScF2iUkZ/cCovYmjZy/Gn7xxGWC4LeksyZB2ZnuU4q941mVT -XTzWnLLPKQP5L6RQstRIzgUyVYr9smRMDuSYB3Xbf9+5CFVghTAp+XtIpGmG4zU/ -HoZdenoVve8AjhUiVBcAkCaTvA5JaJG/+EfTnZVCwQ5N328mz8MYIWJmQ3DW1cAH -4QIDAQABo0IwQDAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAdBgNV -HQ4EFgQUVeSB0RGAvtiJuQijMfmhJAkWuXAwDQYJKoZIhvcNAQEFBQADggEBADub -j1abMOdTmXx6eadNl9cZlZD7Bh/KM3xGY4+WZiT6QBshJ8rmcnPyT/4xmf3IDExo -U8aAghOY+rat2l098c5u9hURlIIM7j+VrxGrD9cv3h8Dj1csHsm7mhpElesYT6Yf -zX1XEC+bBAlahLVu2B064dae0Wx5XnkcFMXj0EyTO2U87d89vqbllRrDtRnDvV5b -u/8j72gZyxKTJ1wDLW8w0B62GqzeWvfRqqgnpv55gcR5mTNXuhKwqeBCbJPKVt7+ -bYQLCIt+jerXmCHG8+c8eS9enNFMFY3h7CI3zJpDC5fcgJCNs2ebb0gIFVbPv/Er -fF6adulZkMV8gzURZVE= ------END CERTIFICATE----- # CN=FIRMAPROFESIONAL CA ROOT-A WEB,O=Firmaprofesional SA,C=ES,2.5.4.97=#130f56415445532d413632363334303638 # bef256daf26e9c69bdec1602359798f3caf71821a03e018257c53c65617f3d4a -----BEGIN CERTIFICATE----- @@ -2867,29 +2634,6 @@ Af8wDgYDVR0PAQH/BAQDAgEGMB0GA1UdDgQWBBTrQciu/NWeUUj1vYv0hyCTQSvT 4P9mLQlO4E/0BdGF9jVg3PVys0Z9AjBEmEYagoUeYWmJSwdLZrWeqrqgHkHZAXQ6 bkU6iYAZezKYVWOr62Nuk22rGwlgMU4= -----END CERTIFICATE----- -# CN=SecureSign RootCA11,O=Japan Certification Services\, Inc.,C=JP -# bf0feefb9e3a581ad5f9e9db7589985743d261085c4d314f6f5d7259aa421612 ------BEGIN CERTIFICATE----- -MIIDbTCCAlWgAwIBAgIBATANBgkqhkiG9w0BAQUFADBYMQswCQYDVQQGEwJKUDEr -MCkGA1UEChMiSmFwYW4gQ2VydGlmaWNhdGlvbiBTZXJ2aWNlcywgSW5jLjEcMBoG -A1UEAxMTU2VjdXJlU2lnbiBSb290Q0ExMTAeFw0wOTA0MDgwNDU2NDdaFw0yOTA0 -MDgwNDU2NDdaMFgxCzAJBgNVBAYTAkpQMSswKQYDVQQKEyJKYXBhbiBDZXJ0aWZp -Y2F0aW9uIFNlcnZpY2VzLCBJbmMuMRwwGgYDVQQDExNTZWN1cmVTaWduIFJvb3RD -QTExMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA/XeqpRyQBTvLTJsz -i1oURaTnkBbR31fSIRCkF/3frNYfp+TbfPfs37gD2pRY/V1yfIw/XwFndBWW4wI8 -h9uuywGOwvNmxoVF9ALGOrVisq/6nL+k5tSAMJjzDbaTj6nU2DbysPyKyiyhFTOV -MdrAG/LuYpmGYz+/3ZMqg6h2uRMft85OQoWPIucuGvKVCbIFtUROd6EgvanyTgp9 -UK31BQ1FT0Zx/Sg+U/sE2C3XZR1KG/rPO7AxmjVuyIsG0wCR8pQIZUyxNAYAeoni -8McDWc/V1uinMrPmmECGxc0nEovMe863ETxiYAcjPitAbpSACW22s293bzUIUPsC -h8U+iQIDAQABo0IwQDAdBgNVHQ4EFgQUW/hNT7KlhtQ60vFjmqC+CfZXt94wDgYD -VR0PAQH/BAQDAgEGMA8GA1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEFBQADggEB -AKChOBZmLqdWHyGcBvod7bkixTgm2E5P7KN/ed5GIaGHd48HCJqypMWvDzKYC3xm -KbabfSVSSUOrTC4rbnpwrxYO4wJs+0LmGJ1F2FXI6Dvd5+H0LgscNFxsWEr7jIhQ -X5Ucv+2rIrVls4W6ng+4reV6G4pQOh29Dbx7VFALuUKvVaAYga1lme++5Jy/xIWr -QbJUb9wlze144o4MjQlJ3WN7WmmWAiGovVJZ6X01y8hSyn+B/tlr0/cR7SXf+Of5 -pPpyl4RTDaXQMhhRdlkUbA/r7F+AjHVDg8OFmP9Mni0N5HeDk061lgeLKBObjBmN -QSdJQO7e5iNEOdyhIta6A/I= ------END CERTIFICATE----- # CN=SecureTrust CA,O=SecureTrust Corporation,C=US # f1c1b50ae5a20dd8030ec9f6bc24823dd367b5255759b4e71b61fce9f7375d73 -----BEGIN CERTIFICATE----- @@ -2930,40 +2674,6 @@ BAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAKBggqhkjOPQQDAwNoADBlAjAVXUI9/Lbu 9zuxNuie9sRGKEkz0FhDKmMpzE2xtHqiuQ04pV1IKv3LsnNdo4gIxwwCMQDAqy0O be0YottT6SXbVQjgUMzfRGEWgqtJsLKB7HOHeLRMsmIbEvoWTSVLY70eN9k= -----END CERTIFICATE----- -# CN=Security Communication RootCA3,O=SECOM Trust Systems CO.\,LTD.,C=JP -# 24a55c2ab051442d0617766541239a4ad032d7c55175aa34ffde2fbc4f5c5294 ------BEGIN CERTIFICATE----- -MIIFfzCCA2egAwIBAgIJAOF8N0D9G/5nMA0GCSqGSIb3DQEBDAUAMF0xCzAJBgNV -BAYTAkpQMSUwIwYDVQQKExxTRUNPTSBUcnVzdCBTeXN0ZW1zIENPLixMVEQuMScw -JQYDVQQDEx5TZWN1cml0eSBDb21tdW5pY2F0aW9uIFJvb3RDQTMwHhcNMTYwNjE2 -MDYxNzE2WhcNMzgwMTE4MDYxNzE2WjBdMQswCQYDVQQGEwJKUDElMCMGA1UEChMc -U0VDT00gVHJ1c3QgU3lzdGVtcyBDTy4sTFRELjEnMCUGA1UEAxMeU2VjdXJpdHkg -Q29tbXVuaWNhdGlvbiBSb290Q0EzMIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIIC -CgKCAgEA48lySfcw3gl8qUCBWNO0Ot26YQ+TUG5pPDXC7ltzkBtnTCHsXzW7OT4r -CmDvu20rhvtxosis5FaU+cmvsXLUIKx00rgVrVH+hXShuRD+BYD5UpOzQD11EKzA -lrenfna84xtSGc4RHwsENPXY9Wk8d/Nk9A2qhd7gCVAEF5aEt8iKvE1y/By7z/MG -TfmfZPd+pmaGNXHIEYBMwXFAWB6+oHP2/D5Q4eAvJj1+XCO1eXDe+uDRpdYMQXF7 -9+qMHIjH7Iv10S9VlkZ8WjtYO/u62C21Jdp6Ts9EriGmnpjKIG58u4iFW/vAEGK7 -8vknR+/RiTlDxN/e4UG/VHMgly1s2vPUB6PmudhvrvyMGS7TZ2crldtYXLVqAvO4 -g160a75BflcJdURQVc1aEWEhCmHCqYj9E7wtiS/NYeCVvsq1e+F7NGcLH7YMx3we -GVPKp7FKFSBWFHA9K4IsD50VHUeAR/94mQ4xr28+j+2GaR57GIgUssL8gjMunEst -+3A7caoreyYn8xrC3PsXuKHqy6C0rtOUfnrQq8PsOC0RLoi/1D+tEjtCrI8Cbn3M -0V9hvqG8OmpI6iZVIhZdXw3/JzOfGAN0iltSIEdrRU0id4xVJ/CvHozJgyJUt5rQ -T9nO/NkuHJYosQLTA70lUhw0Zk8jq/R3gpYd0VcwCBEF/VfR2ccCAwEAAaNCMEAw -HQYDVR0OBBYEFGQUfPxYchamCik0FW8qy7z8r6irMA4GA1UdDwEB/wQEAwIBBjAP -BgNVHRMBAf8EBTADAQH/MA0GCSqGSIb3DQEBDAUAA4ICAQDcAiMI4u8hOscNtybS -YpOnpSNyByCCYN8Y11StaSWSntkUz5m5UoHPrmyKO1o5yGwBQ8IibQLwYs1OY0PA -FNr0Y/Dq9HHuTofjcan0yVflLl8cebsjqodEV+m9NU1Bu0soo5iyG9kLFwfl9+qd -9XbXv8S2gVj/yP9kaWJ5rW4OH3/uHWnlt3Jxs/6lATWUVCvAUm2PVcTJ0rjLyjQI -UYWg9by0F1jqClx6vWPGOi//lkkZhOpn2ASxYfQAW0q3nHE3GYV5v4GwxxMOdnE+ -OoAGrgYWp421wsTL/0ClXI2lyTrtcoHKXJg80jQDdwj98ClZXSEIx2C/pHF7uNke -gr4Jr2VvKKu/S7XuPghHJ6APbw+LP6yVGPO5DtxnVW5inkYO0QR4ynKudtml+LLf -iAlhi+8kTtFZP1rUPcmTPCtk9YENFpb3ksP+MW/oKjJ0DvRMmEoYDjBU1cXrvMUV -nuiZIesnKwkK2/HmcBhWuwzkvvnoEKQTkrgc4NtnHVMDpCKn3F2SEDzq//wbEBrD -2NCcnWXL0CsnMQMeNuE9dnUM/0Umud1RvCPHX9jYhxBAEg09ODfnRDwYwFMJZI// -1ZqmfHAuc1Uh6N//g7kdPjIe1qZ9LPFm6Vwdp6POXiUyK+OVrCoHzrQoeIY8Laad -TdJ0MN1kURXbg4NR16/9M51NZg== ------END CERTIFICATE----- # CN=Starfield Root Certificate Authority - G2,O=Starfield Technologies\, Inc.,L=Scottsdale,ST=Arizona,C=US # 2ce1cb0bf9d2f9e102993fbe215152c3b2dd0cabde1c68e5319b839154dbb7f5 -----BEGIN CERTIFICATE----- From 3e90321ac7bcee3d924ed63ed3ad97be2079cb56 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Wed, 4 Dec 2024 16:27:45 +0000 Subject: [PATCH 93/95] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I580d412fc4a135696d4054f8007593cfa4f64224 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/633480 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Auto-Submit: Gopher Robot --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index d4bb2e13f3..4a13082a28 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.27.0 - golang.org/x/term v0.26.0 + golang.org/x/sys v0.28.0 + golang.org/x/term v0.27.0 ) -require golang.org/x/text v0.20.0 // indirect +require golang.org/x/text v0.21.0 // indirect diff --git a/go.sum b/go.sum index b626a3d76d..41808cb739 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= -golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= -golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= -golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= -golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= From 7042ebcbe097f305ba3a93f9a22b4befa4b83d29 Mon Sep 17 00:00:00 2001 From: Roland Shoemaker Date: Wed, 4 Dec 2024 10:46:07 -0800 Subject: [PATCH 94/95] openpgp/clearsign: just use rand.Reader in tests Instead of a convoluted fake rand, it is _basically_ just as fast, and fixes errors that pop up due to bad entropy. Fixes golang/go#70682 Change-Id: Ib0f605398d1092b516b03135f602c644be2a060f Reviewed-on: https://go-review.googlesource.com/c/crypto/+/633655 Reviewed-by: Tatiana Bradley LUCI-TryBot-Result: Go LUCI Auto-Submit: Roland Shoemaker Reviewed-by: Filippo Valsorda --- openpgp/clearsign/clearsign_test.go | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/openpgp/clearsign/clearsign_test.go b/openpgp/clearsign/clearsign_test.go index 051b8f16ff..821c35ff47 100644 --- a/openpgp/clearsign/clearsign_test.go +++ b/openpgp/clearsign/clearsign_test.go @@ -123,24 +123,12 @@ func TestSigning(t *testing.T) { } } -// We use this to make test keys, so that they aren't all the same. -type quickRand byte - -func (qr *quickRand) Read(p []byte) (int, error) { - for i := range p { - p[i] = byte(*qr) - } - *qr++ - return len(p), nil -} - func TestMultiSign(t *testing.T) { if testing.Short() { t.Skip("skipping long test in -short mode") } - zero := quickRand(0) - config := packet.Config{Rand: &zero} + var config packet.Config for nKeys := 0; nKeys < 4; nKeys++ { nextTest: From b4f1988a35dee11ec3e05d6bf3e90b695fbd8909 Mon Sep 17 00:00:00 2001 From: Roland Shoemaker Date: Tue, 3 Dec 2024 09:03:03 -0800 Subject: [PATCH 95/95] ssh: make the public key cache a 1-entry FIFO cache Users of the the ssh package seem to extremely commonly misuse the PublicKeyCallback API, assuming that the key passed in the last call before a connection is established is the key used for authentication. Some users then make authorization decisions based on this key. This property is not documented, and may not be correct, due to the caching behavior of the package, resulting in users making incorrect authorization decisions about the connection. This change makes the cache a one entry FIFO cache, making the assumed property, that the last call to PublicKeyCallback represents the key actually used for authentication, actually hold. Thanks to Damien Tournoud, Patrick Dawkins, Vince Parker, and Jules Duvivier from the Platform.sh / Upsun engineering team for reporting this issue. Fixes golang/go#70779 Fixes CVE-2024-45337 Change-Id: Ife7c7b4045d8b6bcd7e3a417bdfae370c709797f Reviewed-on: https://go-review.googlesource.com/c/crypto/+/635315 Reviewed-by: Roland Shoemaker Auto-Submit: Gopher Robot Reviewed-by: Damien Neil Reviewed-by: Nicola Murino LUCI-TryBot-Result: Go LUCI --- ssh/server.go | 15 ++++++++++---- ssh/server_test.go | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/ssh/server.go b/ssh/server.go index c0d1c29e6f..5b5ccd96f4 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -149,7 +149,7 @@ func (s *ServerConfig) AddHostKey(key Signer) { } // cachedPubKey contains the results of querying whether a public key is -// acceptable for a user. +// acceptable for a user. This is a FIFO cache. type cachedPubKey struct { user string pubKeyData []byte @@ -157,7 +157,13 @@ type cachedPubKey struct { perms *Permissions } -const maxCachedPubKeys = 16 +// maxCachedPubKeys is the number of cache entries we store. +// +// Due to consistent misuse of the PublicKeyCallback API, we have reduced this +// to 1, such that the only key in the cache is the most recently seen one. This +// forces the behavior that the last call to PublicKeyCallback will always be +// with the key that is used for authentication. +const maxCachedPubKeys = 1 // pubKeyCache caches tests for public keys. Since SSH clients // will query whether a public key is acceptable before attempting to @@ -179,9 +185,10 @@ func (c *pubKeyCache) get(user string, pubKeyData []byte) (cachedPubKey, bool) { // add adds the given tuple to the cache. func (c *pubKeyCache) add(candidate cachedPubKey) { - if len(c.keys) < maxCachedPubKeys { - c.keys = append(c.keys, candidate) + if len(c.keys) >= maxCachedPubKeys { + c.keys = c.keys[1:] } + c.keys = append(c.keys, candidate) } // ServerConn is an authenticated SSH connection, as seen from the diff --git a/ssh/server_test.go b/ssh/server_test.go index b6d8ab3333..ba1bd10e82 100644 --- a/ssh/server_test.go +++ b/ssh/server_test.go @@ -5,6 +5,7 @@ package ssh import ( + "bytes" "errors" "fmt" "io" @@ -299,6 +300,54 @@ func TestBannerError(t *testing.T) { } } +func TestPublicKeyCallbackLastSeen(t *testing.T) { + var lastSeenKey PublicKey + + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + serverConf := &ServerConfig{ + PublicKeyCallback: func(conn ConnMetadata, key PublicKey) (*Permissions, error) { + lastSeenKey = key + fmt.Printf("seen %#v\n", key) + if _, ok := key.(*dsaPublicKey); !ok { + return nil, errors.New("nope") + } + return nil, nil + }, + } + serverConf.AddHostKey(testSigners["ecdsap256"]) + + done := make(chan struct{}) + go func() { + defer close(done) + NewServerConn(c1, serverConf) + }() + + clientConf := ClientConfig{ + User: "user", + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"], testSigners["dsa"], testSigners["ed25519"]), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + _, _, _, err = NewClientConn(c2, "", &clientConf) + if err != nil { + t.Fatal(err) + } + <-done + + expectedPublicKey := testSigners["dsa"].PublicKey().Marshal() + lastSeenMarshalled := lastSeenKey.Marshal() + if !bytes.Equal(lastSeenMarshalled, expectedPublicKey) { + t.Errorf("unexpected key: got %#v, want %#v", lastSeenKey, testSigners["dsa"].PublicKey()) + } +} + type markerConn struct { closed uint32 used uint32