From b91329d961d4ba7270e79ed661eb324c299ab812 Mon Sep 17 00:00:00 2001 From: cui fliter Date: Tue, 5 Mar 2024 13:01:48 +0800 Subject: [PATCH 01/57] all: remove redundant words in comments and fix some typos Change-Id: I3078492dc020770aca630e0b362df0212bd41e32 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/569156 Auto-Submit: Ian Lance Taylor Reviewed-by: David Chase Reviewed-by: Nicola Murino Reviewed-by: Ian Lance Taylor Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI --- internal/testenv/exec.go | 4 ++-- md4/md4.go | 2 +- ssh/certs_test.go | 18 +++++++++--------- ssh/example_test.go | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/internal/testenv/exec.go b/internal/testenv/exec.go index 4bacdc3ce8..df5d3c20df 100644 --- a/internal/testenv/exec.go +++ b/internal/testenv/exec.go @@ -57,8 +57,8 @@ func CommandContext(t testing.TB, ctx context.Context, name string, args ...stri // grace periods to clean up: one for the delay between the first // termination signal being sent (via the Cancel callback when the Context // expires) and the process being forcibly terminated (via the WaitDelay - // field), and a second one for the delay becween the process being - // terminated and and the test logging its output for debugging. + // field), and a second one for the delay between the process being + // terminated and the test logging its output for debugging. // // (We want to ensure that the test process itself has enough time to // log the output before it is also terminated.) diff --git a/md4/md4.go b/md4/md4.go index 59d3480693..d1911c2e86 100644 --- a/md4/md4.go +++ b/md4/md4.go @@ -4,7 +4,7 @@ // Package md4 implements the MD4 hash algorithm as defined in RFC 1320. // -// Deprecated: MD4 is cryptographically broken and should should only be used +// Deprecated: MD4 is cryptographically broken and should only be used // where compatibility with legacy systems, not security, is the goal. Instead, // use a secure hash like SHA-256 (from crypto/sha256). package md4 // import "golang.org/x/crypto/md4" diff --git a/ssh/certs_test.go b/ssh/certs_test.go index 97b7486d0c..66000f19a2 100644 --- a/ssh/certs_test.go +++ b/ssh/certs_test.go @@ -367,21 +367,21 @@ func TestCertTypes(t *testing.T) { func TestCertSignWithMultiAlgorithmSigner(t *testing.T) { type testcase struct { - sigAlgo string - algoritms []string + sigAlgo string + algorithms []string } cases := []testcase{ { - sigAlgo: KeyAlgoRSA, - algoritms: []string{KeyAlgoRSA, KeyAlgoRSASHA512}, + sigAlgo: KeyAlgoRSA, + algorithms: []string{KeyAlgoRSA, KeyAlgoRSASHA512}, }, { - sigAlgo: KeyAlgoRSASHA256, - algoritms: []string{KeyAlgoRSASHA256, KeyAlgoRSA, KeyAlgoRSASHA512}, + sigAlgo: KeyAlgoRSASHA256, + algorithms: []string{KeyAlgoRSASHA256, KeyAlgoRSA, KeyAlgoRSASHA512}, }, { - sigAlgo: KeyAlgoRSASHA512, - algoritms: []string{KeyAlgoRSASHA512, KeyAlgoRSASHA256}, + sigAlgo: KeyAlgoRSASHA512, + algorithms: []string{KeyAlgoRSASHA512, KeyAlgoRSASHA256}, }, } @@ -393,7 +393,7 @@ func TestCertSignWithMultiAlgorithmSigner(t *testing.T) { for _, c := range cases { t.Run(c.sigAlgo, func(t *testing.T) { - signer, err := NewSignerWithAlgorithms(testSigners["rsa"].(AlgorithmSigner), c.algoritms) + signer, err := NewSignerWithAlgorithms(testSigners["rsa"].(AlgorithmSigner), c.algorithms) if err != nil { t.Fatalf("NewSignerWithAlgorithms error: %v", err) } diff --git a/ssh/example_test.go b/ssh/example_test.go index 3920832c1a..97b3b6aba6 100644 --- a/ssh/example_test.go +++ b/ssh/example_test.go @@ -384,7 +384,7 @@ func ExampleCertificate_SignCert() { } mas, err := ssh.NewSignerWithAlgorithms(signer.(ssh.AlgorithmSigner), []string{ssh.KeyAlgoRSASHA256}) if err != nil { - log.Fatal("unable to create signer with algoritms: ", err) + log.Fatal("unable to create signer with algorithms: ", err) } certificate := ssh.Certificate{ Key: publicKey, From 8d0d405eedb9e8f9d0d381a5e85235684df0a868 Mon Sep 17 00:00:00 2001 From: Jayanth Krishnamurthy Date: Mon, 19 Feb 2024 13:17:46 -0600 Subject: [PATCH 02/57] x/crypto/chacha20: cleanup chacha_ppc64le.s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Adding PCALIGN before the loops - Changing WORD directive with corresponding Vector Merge EVEN/ODD word instructions - Replacing Branch Conditional (BC) with its extended mnemonic form BDNZ - VPERMXOR instruction usage in place of VXOR instructions followed by VRLW (rotate left) for cases of rotating in multiples of 8. This replacements give performace improvement both in time and space of around 7%-8% as listed below using benchstat tool. goos: linux goarch: ppc64le pkg: golang.org/x/crypto/chacha20 cpu: POWER10 | chacha20.prev.out | chacha20.new.out | | sec/op | sec/op vs base | ChaCha20/64 171.9n ± 0% 156.6n ± 1% -8.90% (p=0.002 n=6) ChaCha20/256 165.5n ± 0% 152.4n ± 0% -7.92% (p=0.002 n=6) ChaCha20/10x25 505.8n ± 0% 504.3n ± 2% -0.32% (p=0.589 n=6) ChaCha20/4096 2.265µ ± 0% 2.052µ ± 0% -9.40% (p=0.002 n=6) ChaCha20/100x40 5.359µ ± 3% 5.018µ ± 2% -6.37% (p=0.002 n=6) ChaCha20/65536 35.71µ ± 0% 32.29µ ± 0% -9.57% (p=0.002 n=6) ChaCha20/1000x65 44.63µ ± 0% 41.05µ ± 0% -8.02% (p=0.002 n=6) geomean 2.235µ 2.073µ -7.26% | chacha20.prev.out | chacha20.new.out | | B/s | B/s vs base | ChaCha20/64 355.1Mi ± 0% 389.8Mi ± 1% +9.78% (p=0.002 n=6) ChaCha20/256 1.440Gi ± 0% 1.565Gi ± 0% +8.62% (p=0.002 n=6) ChaCha20/10x25 471.3Mi ± 0% 472.8Mi ± 2% +0.31% (p=0.589 n=6) ChaCha20/4096 1.684Gi ± 0% 1.859Gi ± 0% +10.38% (p=0.002 n=6) ChaCha20/100x40 711.8Mi ± 3% 760.3Mi ± 2% +6.80% (p=0.002 n=6) ChaCha20/65536 1.709Gi ± 0% 1.890Gi ± 0% +10.59% (p=0.002 n=6) ChaCha20/1000x65 1.356Gi ± 0% 1.475Gi ± 0% +8.72% (p=0.002 n=6) geomean 957.3Mi 1.008Gi +7.83% Change-Id: Ib31cb10a2a11eacdacf0272fbfd887eb5ccd8bcb Reviewed-on: https://go-review.googlesource.com/c/crypto/+/564797 Reviewed-by: Lynn Boger Run-TryBot: Paul Murphy TryBot-Result: Gopher Robot Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Run-TryBot: Lynn Boger Reviewed-by: Cherry Mui --- chacha20/chacha_ppc64le.s | 110 ++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 58 deletions(-) diff --git a/chacha20/chacha_ppc64le.s b/chacha20/chacha_ppc64le.s index 66aebae258..c672ccf698 100644 --- a/chacha20/chacha_ppc64le.s +++ b/chacha20/chacha_ppc64le.s @@ -33,6 +33,9 @@ #define CONSTBASE R16 #define BLOCKS R17 +// for VPERMXOR +#define MASK R18 + DATA consts<>+0x00(SB)/8, $0x3320646e61707865 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 DATA consts<>+0x10(SB)/8, $0x0000000000000001 @@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 DATA consts<>+0x90(SB)/8, $0x0000000100000000 DATA consts<>+0x98(SB)/8, $0x0000000300000002 -GLOBL consts<>(SB), RODATA, $0xa0 +DATA consts<>+0xa0(SB)/8, $0x5566774411223300 +DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88 +DATA consts<>+0xb0(SB)/8, $0x6677445522330011 +DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899 +GLOBL consts<>(SB), RODATA, $0xc0 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 @@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $48, R10 MOVD $64, R11 SRD $6, LEN, BLOCKS + // for VPERMXOR + MOVD $consts<>+0xa0(SB), MASK + MOVD $16, R20 // V16 LXVW4X (CONSTBASE)(R0), VS48 ADD $80,CONSTBASE @@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 // V28 LXVW4X (CONSTBASE)(R11), VS60 + // Load mask constants for VPERMXOR + LXVW4X (MASK)(R0), V20 + LXVW4X (MASK)(R20), V21 + // splat slot from V19 -> V26 VSPLTW $0, V19, V26 @@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $10, R14 MOVD R14, CTR - + PCALIGN $16 loop_outer_vsx: // V0, V1, V2, V3 LXVW4X (R0)(CONSTBASE), VS32 @@ -128,22 +142,17 @@ loop_outer_vsx: VSPLTISW $12, V28 VSPLTISW $8, V29 VSPLTISW $7, V30 - + PCALIGN $16 loop_vsx: VADDUWM V0, V4, V0 VADDUWM V1, V5, V1 VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 - VRLW V15, V27, V15 + VPERMXOR V12, V0, V21, V12 + VPERMXOR V13, V1, V21, V13 + VPERMXOR V14, V2, V21, V14 + VPERMXOR V15, V3, V21, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -165,15 +174,10 @@ loop_vsx: VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 - VRLW V15, V29, V15 + VPERMXOR V12, V0, V20, V12 + VPERMXOR V13, V1, V20, V13 + VPERMXOR V14, V2, V20, V14 + VPERMXOR V15, V3, V20, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -195,15 +199,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V27, V15 - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 + VPERMXOR V15, V0, V21, V15 + VPERMXOR V12, V1, V21, V12 + VPERMXOR V13, V2, V21, V13 + VPERMXOR V14, V3, V21, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -225,15 +224,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V29, V15 - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 + VPERMXOR V15, V0, V20, V15 + VPERMXOR V12, V1, V20, V12 + VPERMXOR V13, V2, V20, V13 + VPERMXOR V14, V3, V20, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -249,48 +243,48 @@ loop_vsx: VRLW V6, V30, V6 VRLW V7, V30, V7 VRLW V4, V30, V4 - BC 16, LT, loop_vsx + BDNZ loop_vsx VADDUWM V12, V26, V12 - WORD $0x13600F8C // VMRGEW V0, V1, V27 - WORD $0x13821F8C // VMRGEW V2, V3, V28 + VMRGEW V0, V1, V27 + VMRGEW V2, V3, V28 - WORD $0x10000E8C // VMRGOW V0, V1, V0 - WORD $0x10421E8C // VMRGOW V2, V3, V2 + VMRGOW V0, V1, V0 + VMRGOW V2, V3, V2 - WORD $0x13A42F8C // VMRGEW V4, V5, V29 - WORD $0x13C63F8C // VMRGEW V6, V7, V30 + VMRGEW V4, V5, V29 + VMRGEW V6, V7, V30 XXPERMDI VS32, VS34, $0, VS33 XXPERMDI VS32, VS34, $3, VS35 XXPERMDI VS59, VS60, $0, VS32 XXPERMDI VS59, VS60, $3, VS34 - WORD $0x10842E8C // VMRGOW V4, V5, V4 - WORD $0x10C63E8C // VMRGOW V6, V7, V6 + VMRGOW V4, V5, V4 + VMRGOW V6, V7, V6 - WORD $0x13684F8C // VMRGEW V8, V9, V27 - WORD $0x138A5F8C // VMRGEW V10, V11, V28 + VMRGEW V8, V9, V27 + VMRGEW V10, V11, V28 XXPERMDI VS36, VS38, $0, VS37 XXPERMDI VS36, VS38, $3, VS39 XXPERMDI VS61, VS62, $0, VS36 XXPERMDI VS61, VS62, $3, VS38 - WORD $0x11084E8C // VMRGOW V8, V9, V8 - WORD $0x114A5E8C // VMRGOW V10, V11, V10 + VMRGOW V8, V9, V8 + VMRGOW V10, V11, V10 - WORD $0x13AC6F8C // VMRGEW V12, V13, V29 - WORD $0x13CE7F8C // VMRGEW V14, V15, V30 + VMRGEW V12, V13, V29 + VMRGEW V14, V15, V30 XXPERMDI VS40, VS42, $0, VS41 XXPERMDI VS40, VS42, $3, VS43 XXPERMDI VS59, VS60, $0, VS40 XXPERMDI VS59, VS60, $3, VS42 - WORD $0x118C6E8C // VMRGOW V12, V13, V12 - WORD $0x11CE7E8C // VMRGOW V14, V15, V14 + VMRGOW V12, V13, V12 + VMRGOW V14, V15, V14 VSPLTISW $4, V27 VADDUWM V26, V27, V26 @@ -431,7 +425,7 @@ tail_vsx: ADD $-1, R11, R12 ADD $-1, INP ADD $-1, OUT - + PCALIGN $16 looptail_vsx: // Copying the result to OUT // in bytes. @@ -439,7 +433,7 @@ looptail_vsx: MOVBZU 1(INP), TMP XOR KEY, TMP, KEY MOVBU KEY, 1(OUT) - BC 16, LT, looptail_vsx + BDNZ looptail_vsx // Clear the stack values STXVW4X VS48, (R11)(R0) From 6f79b5a20cd106b083a02ec17482450d25a1a8f3 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sat, 5 Aug 2023 15:38:29 +0200 Subject: [PATCH 03/57] ssh: add server side multi-step authentication Add support for sending back partial success to the client while handling authentication in the server. This is implemented by a special error that can be returned by any of the authentication methods, which contains the authentication methods to offer next. This patch is based on CL 399075 with some minor changes and the addition of test cases. Fixes golang/go#17889 Fixes golang/go#61447 Fixes golang/go#64974 Co-authored-by: Peter Verraedt Change-Id: I05c8f913bb407d22c2e41c4cbe965e36ab4739b0 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/516355 Reviewed-by: Andrew Lytvynov Reviewed-by: Than McIntosh Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda Auto-Submit: Filippo Valsorda --- ssh/server.go | 168 +++++++++----- ssh/server_multi_auth_test.go | 412 ++++++++++++++++++++++++++++++++++ 2 files changed, 530 insertions(+), 50 deletions(-) create mode 100644 ssh/server_multi_auth_test.go diff --git a/ssh/server.go b/ssh/server.go index c2dfe3268c..92d73233ba 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -426,6 +426,35 @@ func (l ServerAuthError) Error() string { return "[" + strings.Join(errs, ", ") + "]" } +// ServerAuthCallbacks defines server-side authentication callbacks. +type ServerAuthCallbacks struct { + // PasswordCallback behaves like [ServerConfig.PasswordCallback]. + PasswordCallback func(conn ConnMetadata, password []byte) (*Permissions, error) + + // PublicKeyCallback behaves like [ServerConfig.PublicKeyCallback]. + PublicKeyCallback func(conn ConnMetadata, key PublicKey) (*Permissions, error) + + // KeyboardInteractiveCallback behaves like [ServerConfig.KeyboardInteractiveCallback]. + KeyboardInteractiveCallback func(conn ConnMetadata, client KeyboardInteractiveChallenge) (*Permissions, error) + + // GSSAPIWithMICConfig behaves like [ServerConfig.GSSAPIWithMICConfig]. + GSSAPIWithMICConfig *GSSAPIWithMICConfig +} + +// PartialSuccessError can be returned by any of the [ServerConfig] +// authentication callbacks to indicate to the client that authentication has +// partially succeeded, but further steps are required. +type PartialSuccessError struct { + // Next defines the authentication callbacks to apply to further steps. The + // available methods communicated to the client are based on the non-nil + // ServerAuthCallbacks fields. + Next ServerAuthCallbacks +} + +func (p *PartialSuccessError) Error() string { + return "ssh: authenticated with partial success" +} + // ErrNoAuth is the error value returned if no // authentication method has been passed yet. This happens as a normal // part of the authentication loop, since the client first tries @@ -441,6 +470,15 @@ func (s *connection) serverAuthenticate(config *ServerConfig) (*Permissions, err authFailures := 0 var authErrs []error var displayedBanner bool + partialSuccessReturned := false + // Set the initial authentication callbacks from the config. They can be + // changed if a PartialSuccessError is returned. + authConfig := ServerAuthCallbacks{ + PasswordCallback: config.PasswordCallback, + PublicKeyCallback: config.PublicKeyCallback, + KeyboardInteractiveCallback: config.KeyboardInteractiveCallback, + GSSAPIWithMICConfig: config.GSSAPIWithMICConfig, + } userAuthLoop: for { @@ -471,6 +509,11 @@ userAuthLoop: return nil, errors.New("ssh: client attempted to negotiate for unknown service: " + userAuthReq.Service) } + if s.user != userAuthReq.User && partialSuccessReturned { + return nil, fmt.Errorf("ssh: client changed the user after a partial success authentication, previous user %q, current user %q", + s.user, userAuthReq.User) + } + s.user = userAuthReq.User if !displayedBanner && config.BannerCallback != nil { @@ -491,20 +534,17 @@ userAuthLoop: switch userAuthReq.Method { case "none": - if config.NoClientAuth { + // We don't allow none authentication after a partial success + // response. + if config.NoClientAuth && !partialSuccessReturned { if config.NoClientAuthCallback != nil { perms, authErr = config.NoClientAuthCallback(s) } else { authErr = nil } } - - // allow initial attempt of 'none' without penalty - if authFailures == 0 { - authFailures-- - } case "password": - if config.PasswordCallback == nil { + if authConfig.PasswordCallback == nil { authErr = errors.New("ssh: password auth not configured") break } @@ -518,17 +558,17 @@ userAuthLoop: return nil, parseError(msgUserAuthRequest) } - perms, authErr = config.PasswordCallback(s, password) + perms, authErr = authConfig.PasswordCallback(s, password) case "keyboard-interactive": - if config.KeyboardInteractiveCallback == nil { + if authConfig.KeyboardInteractiveCallback == nil { authErr = errors.New("ssh: keyboard-interactive auth not configured") break } prompter := &sshClientKeyboardInteractive{s} - perms, authErr = config.KeyboardInteractiveCallback(s, prompter.Challenge) + perms, authErr = authConfig.KeyboardInteractiveCallback(s, prompter.Challenge) case "publickey": - if config.PublicKeyCallback == nil { + if authConfig.PublicKeyCallback == nil { authErr = errors.New("ssh: publickey auth not configured") break } @@ -562,11 +602,18 @@ userAuthLoop: if !ok { candidate.user = s.user candidate.pubKeyData = pubKeyData - candidate.perms, candidate.result = config.PublicKeyCallback(s, pubKey) - if candidate.result == nil && candidate.perms != nil && candidate.perms.CriticalOptions != nil && candidate.perms.CriticalOptions[sourceAddressCriticalOption] != "" { - candidate.result = checkSourceAddress( + candidate.perms, candidate.result = authConfig.PublicKeyCallback(s, pubKey) + _, isPartialSuccessError := candidate.result.(*PartialSuccessError) + + if (candidate.result == nil || isPartialSuccessError) && + candidate.perms != nil && + candidate.perms.CriticalOptions != nil && + candidate.perms.CriticalOptions[sourceAddressCriticalOption] != "" { + if err := checkSourceAddress( s.RemoteAddr(), - candidate.perms.CriticalOptions[sourceAddressCriticalOption]) + candidate.perms.CriticalOptions[sourceAddressCriticalOption]); err != nil { + candidate.result = err + } } cache.add(candidate) } @@ -578,8 +625,8 @@ userAuthLoop: if len(payload) > 0 { return nil, parseError(msgUserAuthRequest) } - - if candidate.result == nil { + _, isPartialSuccessError := candidate.result.(*PartialSuccessError) + if candidate.result == nil || isPartialSuccessError { okMsg := userAuthPubKeyOkMsg{ Algo: algo, PubKey: pubKeyData, @@ -629,11 +676,11 @@ userAuthLoop: perms = candidate.perms } case "gssapi-with-mic": - if config.GSSAPIWithMICConfig == nil { + if authConfig.GSSAPIWithMICConfig == nil { authErr = errors.New("ssh: gssapi-with-mic auth not configured") break } - gssapiConfig := config.GSSAPIWithMICConfig + gssapiConfig := authConfig.GSSAPIWithMICConfig userAuthRequestGSSAPI, err := parseGSSAPIPayload(userAuthReq.Payload) if err != nil { return nil, parseError(msgUserAuthRequest) @@ -689,49 +736,70 @@ userAuthLoop: break userAuthLoop } - authFailures++ - if config.MaxAuthTries > 0 && authFailures >= config.MaxAuthTries { - // If we have hit the max attempts, don't bother sending the - // final SSH_MSG_USERAUTH_FAILURE message, since there are - // no more authentication methods which can be attempted, - // and this message may cause the client to re-attempt - // authentication while we send the disconnect message. - // Continue, and trigger the disconnect at the start of - // the loop. - // - // The SSH specification is somewhat confusing about this, - // RFC 4252 Section 5.1 requires each authentication failure - // be responded to with a respective SSH_MSG_USERAUTH_FAILURE - // message, but Section 4 says the server should disconnect - // after some number of attempts, but it isn't explicit which - // message should take precedence (i.e. should there be a failure - // message than a disconnect message, or if we are going to - // disconnect, should we only send that message.) - // - // Either way, OpenSSH disconnects immediately after the last - // failed authnetication attempt, and given they are typically - // considered the golden implementation it seems reasonable - // to match that behavior. - continue + var failureMsg userAuthFailureMsg + + if partialSuccess, ok := authErr.(*PartialSuccessError); ok { + // After a partial success error we don't allow changing the user + // name and execute the NoClientAuthCallback. + partialSuccessReturned = true + + // In case a partial success is returned, the server may send + // a new set of authentication methods. + authConfig = partialSuccess.Next + + // Reset pubkey cache, as the new PublicKeyCallback might + // accept a different set of public keys. + cache = pubKeyCache{} + + // Send back a partial success message to the user. + failureMsg.PartialSuccess = true + } else { + // Allow initial attempt of 'none' without penalty. + if authFailures > 0 || userAuthReq.Method != "none" { + authFailures++ + } + if config.MaxAuthTries > 0 && authFailures >= config.MaxAuthTries { + // If we have hit the max attempts, don't bother sending the + // final SSH_MSG_USERAUTH_FAILURE message, since there are + // no more authentication methods which can be attempted, + // and this message may cause the client to re-attempt + // authentication while we send the disconnect message. + // Continue, and trigger the disconnect at the start of + // the loop. + // + // The SSH specification is somewhat confusing about this, + // RFC 4252 Section 5.1 requires each authentication failure + // be responded to with a respective SSH_MSG_USERAUTH_FAILURE + // message, but Section 4 says the server should disconnect + // after some number of attempts, but it isn't explicit which + // message should take precedence (i.e. should there be a failure + // message than a disconnect message, or if we are going to + // disconnect, should we only send that message.) + // + // Either way, OpenSSH disconnects immediately after the last + // failed authnetication attempt, and given they are typically + // considered the golden implementation it seems reasonable + // to match that behavior. + continue + } } - var failureMsg userAuthFailureMsg - if config.PasswordCallback != nil { + if authConfig.PasswordCallback != nil { failureMsg.Methods = append(failureMsg.Methods, "password") } - if config.PublicKeyCallback != nil { + if authConfig.PublicKeyCallback != nil { failureMsg.Methods = append(failureMsg.Methods, "publickey") } - if config.KeyboardInteractiveCallback != nil { + if authConfig.KeyboardInteractiveCallback != nil { failureMsg.Methods = append(failureMsg.Methods, "keyboard-interactive") } - if config.GSSAPIWithMICConfig != nil && config.GSSAPIWithMICConfig.Server != nil && - config.GSSAPIWithMICConfig.AllowLogin != nil { + if authConfig.GSSAPIWithMICConfig != nil && authConfig.GSSAPIWithMICConfig.Server != nil && + authConfig.GSSAPIWithMICConfig.AllowLogin != nil { failureMsg.Methods = append(failureMsg.Methods, "gssapi-with-mic") } if len(failureMsg.Methods) == 0 { - return nil, errors.New("ssh: no authentication methods configured but NoClientAuth is also false") + return nil, errors.New("ssh: no authentication methods available") } if err := s.transport.writePacket(Marshal(&failureMsg)); err != nil { diff --git a/ssh/server_multi_auth_test.go b/ssh/server_multi_auth_test.go new file mode 100644 index 0000000000..3b39802437 --- /dev/null +++ b/ssh/server_multi_auth_test.go @@ -0,0 +1,412 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssh + +import ( + "bytes" + "errors" + "fmt" + "strings" + "testing" +) + +func doClientServerAuth(t *testing.T, serverConfig *ServerConfig, clientConfig *ClientConfig) ([]error, error) { + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + + var serverAuthErrors []error + + serverConfig.AddHostKey(testSigners["rsa"]) + serverConfig.AuthLogCallback = func(conn ConnMetadata, method string, err error) { + serverAuthErrors = append(serverAuthErrors, err) + } + go newServer(c1, serverConfig) + c, _, _, err := NewClientConn(c2, "", clientConfig) + if err == nil { + c.Close() + } + return serverAuthErrors, err +} + +func TestMultiStepAuth(t *testing.T) { + // This user can login with password, public key or public key + password. + username := "testuser" + // This user can login with public key + password only. + usernameSecondFactor := "testuser_second_factor" + errPwdAuthFailed := errors.New("password auth failed") + errWrongSequence := errors.New("wrong sequence") + + serverConfig := &ServerConfig{ + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if conn.User() == usernameSecondFactor { + return nil, errWrongSequence + } + if conn.User() == username && string(password) == clientPassword { + return nil, nil + } + return nil, errPwdAuthFailed + }, + PublicKeyCallback: func(conn ConnMetadata, key PublicKey) (*Permissions, error) { + if bytes.Equal(key.Marshal(), testPublicKeys["rsa"].Marshal()) { + if conn.User() == usernameSecondFactor { + return nil, &PartialSuccessError{ + Next: ServerAuthCallbacks{ + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if string(password) == clientPassword { + return nil, nil + } + return nil, errPwdAuthFailed + }, + }, + } + } + return nil, nil + } + return nil, fmt.Errorf("pubkey for %q not acceptable", conn.User()) + }, + } + + clientConfig := &ClientConfig{ + User: usernameSecondFactor, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err := doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + + // The error sequence is: + // - no auth passed yet + // - partial success + // - nil + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatalf("expected partial success error, got: %v", serverAuthErrors[1]) + } + // Now test a wrong sequence. + clientConfig.Auth = []AuthMethod{ + Password(clientPassword), + PublicKeys(testSigners["rsa"]), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("client login with wrong sequence must fail") + } + // The error sequence is: + // - no auth passed yet + // - wrong sequence + // - partial success + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if serverAuthErrors[1] != errWrongSequence { + t.Fatal("server not returned wrong sequence") + } + if _, ok := serverAuthErrors[2].(*PartialSuccessError); !ok { + t.Fatalf("expected partial success error, got: %v", serverAuthErrors[2]) + } + // Now test using a correct sequence but a wrong password before the right + // one. + n := 0 + passwords := []string{"WRONG", "WRONG", clientPassword} + clientConfig.Auth = []AuthMethod{ + PublicKeys(testSigners["rsa"]), + RetryableAuthMethod(PasswordCallback(func() (string, error) { + p := passwords[n] + n++ + return p, nil + }), 3), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + // The error sequence is: + // - no auth passed yet + // - partial success + // - wrong password + // - wrong password + // - nil + if len(serverAuthErrors) != 5 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + if serverAuthErrors[2] != errPwdAuthFailed { + t.Fatal("server not returned password authentication failed") + } + if serverAuthErrors[3] != errPwdAuthFailed { + t.Fatal("server not returned password authentication failed") + } + // Only password authentication should fail. + clientConfig.Auth = []AuthMethod{ + Password(clientPassword), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("client login with password only must fail") + } + // The error sequence is: + // - no auth passed yet + // - wrong sequence + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if serverAuthErrors[1] != errWrongSequence { + t.Fatal("server not returned wrong sequence") + } + + // Only public key authentication should fail. + clientConfig.Auth = []AuthMethod{ + PublicKeys(testSigners["rsa"]), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("client login with public key only must fail") + } + // The error sequence is: + // - no auth passed yet + // - partial success + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + + // Public key and wrong password. + clientConfig.Auth = []AuthMethod{ + PublicKeys(testSigners["rsa"]), + Password("WRONG"), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("client login with wrong password after public key must fail") + } + // The error sequence is: + // - no auth passed yet + // - partial success + // - password auth failed + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + if serverAuthErrors[2] != errPwdAuthFailed { + t.Fatal("server not returned password authentication failed") + } + + // Public key, public key again and then correct password. Public key + // authentication is attempted only once because the partial success error + // returns only "password" as the allowed authentication method. + clientConfig.Auth = []AuthMethod{ + PublicKeys(testSigners["rsa"]), + PublicKeys(testSigners["rsa"]), + Password(clientPassword), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + // The error sequence is: + // - no auth passed yet + // - partial success + // - nil + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[1].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + + // The unrestricted username can do anything + clientConfig = &ClientConfig{ + User: username, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + _, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("unrestricted client login error: %s", err) + } + + clientConfig = &ClientConfig{ + User: username, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + _, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("unrestricted client login error: %s", err) + } + + clientConfig = &ClientConfig{ + User: username, + Auth: []AuthMethod{ + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + _, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("unrestricted client login error: %s", err) + } +} + +func TestDynamicAuthCallbacks(t *testing.T) { + user1 := "user1" + user2 := "user2" + errInvalidCredentials := errors.New("invalid credentials") + + serverConfig := &ServerConfig{ + NoClientAuth: true, + NoClientAuthCallback: func(conn ConnMetadata) (*Permissions, error) { + switch conn.User() { + case user1: + return nil, &PartialSuccessError{ + Next: ServerAuthCallbacks{ + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if conn.User() == user1 && string(password) == clientPassword { + return nil, nil + } + return nil, errInvalidCredentials + }, + }, + } + case user2: + return nil, &PartialSuccessError{ + Next: ServerAuthCallbacks{ + PublicKeyCallback: func(conn ConnMetadata, key PublicKey) (*Permissions, error) { + if bytes.Equal(key.Marshal(), testPublicKeys["rsa"].Marshal()) { + if conn.User() == user2 { + return nil, nil + } + } + return nil, errInvalidCredentials + }, + }, + } + default: + return nil, errInvalidCredentials + } + }, + } + + clientConfig := &ClientConfig{ + User: user1, + Auth: []AuthMethod{ + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err := doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + // The error sequence is: + // - partial success + // - nil + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[0].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + + clientConfig = &ClientConfig{ + User: user2, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + // The error sequence is: + // - partial success + // - nil + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[0].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + + // user1 cannot login with public key + clientConfig = &ClientConfig{ + User: user1, + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("user1 login with public key must fail") + } + if !strings.Contains(err.Error(), "no supported methods remain") { + t.Errorf("got %v, expected 'no supported methods remain'", err) + } + if len(serverAuthErrors) != 1 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[0].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } + // user2 cannot login with password + clientConfig = &ClientConfig{ + User: user2, + Auth: []AuthMethod{ + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err = doClientServerAuth(t, serverConfig, clientConfig) + if err == nil { + t.Fatal("user2 login with password must fail") + } + if !strings.Contains(err.Error(), "no supported methods remain") { + t.Errorf("got %v, expected 'no supported methods remain'", err) + } + if len(serverAuthErrors) != 1 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if _, ok := serverAuthErrors[0].(*PartialSuccessError); !ok { + t.Fatal("server not returned partial success") + } +} From b92bf9480d25521fc2b8965f86fe152bdfb43f28 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Wed, 3 Apr 2024 11:21:09 +0200 Subject: [PATCH 04/57] ssh: respect MaxAuthTries also for "none" auth attempts Only the first "none" auth attempt is allowed without penality Change-Id: Ibe776e968ba406445eeb94e8f1959383b88c98f7 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/575995 Reviewed-by: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Auto-Submit: Nicola Murino Commit-Queue: Nicola Murino Reviewed-by: Than McIntosh --- ssh/server.go | 6 ++- ssh/server_test.go | 129 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 2 deletions(-) diff --git a/ssh/server.go b/ssh/server.go index 92d73233ba..e2ae4f891b 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -468,6 +468,7 @@ func (s *connection) serverAuthenticate(config *ServerConfig) (*Permissions, err var perms *Permissions authFailures := 0 + noneAuthCount := 0 var authErrs []error var displayedBanner bool partialSuccessReturned := false @@ -534,6 +535,7 @@ userAuthLoop: switch userAuthReq.Method { case "none": + noneAuthCount++ // We don't allow none authentication after a partial success // response. if config.NoClientAuth && !partialSuccessReturned { @@ -755,7 +757,7 @@ userAuthLoop: failureMsg.PartialSuccess = true } else { // Allow initial attempt of 'none' without penalty. - if authFailures > 0 || userAuthReq.Method != "none" { + if authFailures > 0 || userAuthReq.Method != "none" || noneAuthCount != 1 { authFailures++ } if config.MaxAuthTries > 0 && authFailures >= config.MaxAuthTries { @@ -777,7 +779,7 @@ userAuthLoop: // disconnect, should we only send that message.) // // Either way, OpenSSH disconnects immediately after the last - // failed authnetication attempt, and given they are typically + // failed authentication attempt, and given they are typically // considered the golden implementation it seems reasonable // to match that behavior. continue diff --git a/ssh/server_test.go b/ssh/server_test.go index a9b2bce0c1..5b47b9e0af 100644 --- a/ssh/server_test.go +++ b/ssh/server_test.go @@ -5,8 +5,10 @@ package ssh import ( + "errors" "io" "net" + "strings" "sync/atomic" "testing" "time" @@ -62,6 +64,133 @@ func TestClientAuthRestrictedPublicKeyAlgos(t *testing.T) { } } +func TestMaxAuthTriesNoneMethod(t *testing.T) { + username := "testuser" + serverConfig := &ServerConfig{ + MaxAuthTries: 2, + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if conn.User() == username && string(password) == clientPassword { + return nil, nil + } + return nil, errors.New("invalid credentials") + }, + } + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + + var serverAuthErrors []error + + serverConfig.AddHostKey(testSigners["rsa"]) + serverConfig.AuthLogCallback = func(conn ConnMetadata, method string, err error) { + serverAuthErrors = append(serverAuthErrors, err) + } + go newServer(c1, serverConfig) + + clientConfig := ClientConfig{ + User: username, + HostKeyCallback: InsecureIgnoreHostKey(), + } + clientConfig.SetDefaults() + // Our client will send 'none' auth only once, so we need to send the + // requests manually. + c := &connection{ + sshConn: sshConn{ + conn: c2, + user: username, + clientVersion: []byte(packageVersion), + }, + } + c.serverVersion, err = exchangeVersions(c.sshConn.conn, c.clientVersion) + if err != nil { + t.Fatalf("unable to exchange version: %v", err) + } + c.transport = newClientTransport( + newTransport(c.sshConn.conn, clientConfig.Rand, true /* is client */), + c.clientVersion, c.serverVersion, &clientConfig, "", c.sshConn.RemoteAddr()) + if err := c.transport.waitSession(); err != nil { + t.Fatalf("unable to wait session: %v", err) + } + c.sessionID = c.transport.getSessionID() + if err := c.transport.writePacket(Marshal(&serviceRequestMsg{serviceUserAuth})); err != nil { + t.Fatalf("unable to send ssh-userauth message: %v", err) + } + packet, err := c.transport.readPacket() + if err != nil { + t.Fatal(err) + } + if len(packet) > 0 && packet[0] == msgExtInfo { + packet, err = c.transport.readPacket() + if err != nil { + t.Fatal(err) + } + } + var serviceAccept serviceAcceptMsg + if err := Unmarshal(packet, &serviceAccept); err != nil { + t.Fatal(err) + } + for i := 0; i <= serverConfig.MaxAuthTries; i++ { + auth := new(noneAuth) + _, _, err := auth.auth(c.sessionID, clientConfig.User, c.transport, clientConfig.Rand, nil) + if i < serverConfig.MaxAuthTries { + if err != nil { + t.Fatal(err) + } + continue + } + if err == nil { + t.Fatal("client: got no error") + } else if !strings.Contains(err.Error(), "too many authentication failures") { + t.Fatalf("client: got unexpected error: %v", err) + } + } + if len(serverAuthErrors) != 3 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + for _, err := range serverAuthErrors { + if !errors.Is(err, ErrNoAuth) { + t.Errorf("go error: %v; want: %v", err, ErrNoAuth) + } + } +} + +func TestMaxAuthTriesFirstNoneAuthErrorIgnored(t *testing.T) { + username := "testuser" + serverConfig := &ServerConfig{ + MaxAuthTries: 1, + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + if conn.User() == username && string(password) == clientPassword { + return nil, nil + } + return nil, errors.New("invalid credentials") + }, + } + clientConfig := &ClientConfig{ + User: username, + Auth: []AuthMethod{ + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + serverAuthErrors, err := doClientServerAuth(t, serverConfig, clientConfig) + if err != nil { + t.Fatalf("client login error: %s", err) + } + if len(serverAuthErrors) != 2 { + t.Fatalf("unexpected number of server auth errors: %v, errors: %+v", len(serverAuthErrors), serverAuthErrors) + } + if !errors.Is(serverAuthErrors[0], ErrNoAuth) { + t.Errorf("go error: %v; want: %v", serverAuthErrors[0], ErrNoAuth) + } + if serverAuthErrors[1] != nil { + t.Errorf("unexpected error: %v", serverAuthErrors[1]) + } +} + func TestNewServerConnValidationErrors(t *testing.T) { serverConf := &ServerConfig{ PublicKeyAuthAlgorithms: []string{CertAlgoRSAv01}, From d042a396a6de487c29b6907508ba7e86925f6e09 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Thu, 4 Apr 2024 16:13:36 +0000 Subject: [PATCH 05/57] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: Ib20227810c7e72942dd6fc33731fc613784aedec Reviewed-on: https://go-review.googlesource.com/c/crypto/+/576515 Reviewed-by: Than McIntosh Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Auto-Submit: Gopher Robot --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 9dee0fe087..4a12d7a426 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.18.0 - golang.org/x/term v0.18.0 + golang.org/x/sys v0.19.0 + golang.org/x/term v0.19.0 ) require golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index e8784c3e2e..fab3a4a74d 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= -golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= -golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= +golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.19.0 h1:+ThwsDv+tYfnJFhF4L8jITxu1tdTWRTZpdsWgEgjL6Q= +golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From 5defcc193aabc79299b09bc1e2e30445a3f78d4e Mon Sep 17 00:00:00 2001 From: Michael Munday Date: Sun, 14 Apr 2024 00:53:16 +0100 Subject: [PATCH 06/57] sha3: fix Sum results for SHAKE functions on s390x Sum was taking the digest from the state which is correct for SHA-3 functions but not for SHAKE functions. Updates golang/go#66804 Change-Id: If782464d773262075950e3168128c0d46e4a6530 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/578715 TryBot-Result: Gopher Robot Reviewed-by: Cherry Mui Reviewed-by: Than McIntosh LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda Run-TryBot: Michael Munday --- sha3/sha3_s390x.go | 19 +++++++++++++++++-- sha3/sha3_test.go | 28 ++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/sha3/sha3_s390x.go b/sha3/sha3_s390x.go index d861bca528..b4fbbf8695 100644 --- a/sha3/sha3_s390x.go +++ b/sha3/sha3_s390x.go @@ -143,6 +143,12 @@ func (s *asmState) Write(b []byte) (int, error) { // Read squeezes an arbitrary number of bytes from the sponge. func (s *asmState) Read(out []byte) (n int, err error) { + // The 'compute last message digest' instruction only stores the digest + // at the first operand (dst) for SHAKE functions. + if s.function != shake_128 && s.function != shake_256 { + panic("sha3: can only call Read for SHAKE functions") + } + n = len(out) // need to pad if we were absorbing @@ -202,8 +208,17 @@ func (s *asmState) Sum(b []byte) []byte { // Hash the buffer. Note that we don't clear it because we // aren't updating the state. - klmd(s.function, &a, nil, s.buf) - return append(b, a[:s.outputLen]...) + switch s.function { + case sha3_224, sha3_256, sha3_384, sha3_512: + klmd(s.function, &a, nil, s.buf) + return append(b, a[:s.outputLen]...) + case shake_128, shake_256: + d := make([]byte, s.outputLen, 64) + klmd(s.function, &a, d, s.buf) + return append(b, d[:s.outputLen]...) + default: + panic("sha3: unknown function") + } } // Reset resets the Hash to its initial state. diff --git a/sha3/sha3_test.go b/sha3/sha3_test.go index 83bd6195d6..afcb722e7a 100644 --- a/sha3/sha3_test.go +++ b/sha3/sha3_test.go @@ -188,6 +188,34 @@ func TestKeccak(t *testing.T) { } } +// TestShakeSum tests that the output of Sum matches the output of Read. +func TestShakeSum(t *testing.T) { + tests := [...]struct { + name string + hash ShakeHash + expectedLen int + }{ + {"SHAKE128", NewShake128(), 32}, + {"SHAKE256", NewShake256(), 64}, + {"cSHAKE128", NewCShake128([]byte{'X'}, nil), 32}, + {"cSHAKE256", NewCShake256([]byte{'X'}, nil), 64}, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := test.hash.Sum(nil) + if len(s) != test.expectedLen { + t.Errorf("Unexpected digest length: got %d, want %d", len(s), test.expectedLen) + } + r := make([]byte, test.expectedLen) + test.hash.Read(r) + if !bytes.Equal(s, r) { + t.Errorf("Mismatch between Sum and Read:\nSum: %s\nRead: %s", hex.EncodeToString(s), hex.EncodeToString(r)) + } + }) + } +} + // TestUnalignedWrite tests that writing data in an arbitrary pattern with // small input buffers. func TestUnalignedWrite(t *testing.T) { From 0da2a6a1bbc8e689a335bea68b5cc0e3e8728854 Mon Sep 17 00:00:00 2001 From: cuishuang Date: Tue, 16 Apr 2024 17:44:44 +0800 Subject: [PATCH 07/57] openpgp: fix function name in comment Change-Id: Ic788ebe311fafa0f5d9750d5f7f25fb70dc0606d Reviewed-on: https://go-review.googlesource.com/c/crypto/+/579175 Run-TryBot: shuang cui Auto-Submit: Ian Lance Taylor TryBot-Result: Gopher Robot Reviewed-by: Ian Lance Taylor LUCI-TryBot-Result: Go LUCI Reviewed-by: Cherry Mui --- openpgp/keys_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openpgp/keys_test.go b/openpgp/keys_test.go index 0eb1a9ef29..9631eb6408 100644 --- a/openpgp/keys_test.go +++ b/openpgp/keys_test.go @@ -132,7 +132,7 @@ func TestRevokedUserID(t *testing.T) { } } -// TestExternallyRevokableKey attempts to load and parse a key with a third party revocation permission. +// TestExternallyRevocableKey attempts to load and parse a key with a third party revocation permission. func TestExternallyRevocableKey(t *testing.T) { kring, err := ReadKeyRing(readerFromHex(subkeyUsageHex)) if err != nil { From ebb717d630028d3e29c90c55d73cb6de90d53c3e Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sat, 23 Mar 2024 12:10:24 +0100 Subject: [PATCH 08/57] ssh: validate key type in SSH_MSG_USERAUTH_PK_OK response According to RFC 4252 Section 7 the algorithm in SSH_MSG_USERAUTH_PK_OK should match that of the request but some servers send the key type instead. OpenSSH checks for the key type, so we do the same. Fixes golang/go#66438 Fixes golang/go#64785 Fixes golang/go#56342 Fixes golang/go#54027 Change-Id: I2f733f0faece097e44ba7a97c868d30a53e21d79 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/573360 Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI Run-TryBot: Nicola Murino Reviewed-by: Roland Shoemaker Reviewed-by: Filippo Valsorda TryBot-Result: Gopher Robot Reviewed-by: Joedian Reid --- ssh/client_auth.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ssh/client_auth.go b/ssh/client_auth.go index 34bf089d0b..9486c59862 100644 --- a/ssh/client_auth.go +++ b/ssh/client_auth.go @@ -404,10 +404,10 @@ func validateKey(key PublicKey, algo string, user string, c packetConn) (bool, e return false, err } - return confirmKeyAck(key, algo, c) + return confirmKeyAck(key, c) } -func confirmKeyAck(key PublicKey, algo string, c packetConn) (bool, error) { +func confirmKeyAck(key PublicKey, c packetConn) (bool, error) { pubKey := key.Marshal() for { @@ -425,7 +425,15 @@ func confirmKeyAck(key PublicKey, algo string, c packetConn) (bool, error) { if err := Unmarshal(packet, &msg); err != nil { return false, err } - if msg.Algo != algo || !bytes.Equal(msg.PubKey, pubKey) { + // According to RFC 4252 Section 7 the algorithm in + // SSH_MSG_USERAUTH_PK_OK should match that of the request but some + // servers send the key type instead. OpenSSH allows any algorithm + // that matches the public key, so we do the same. + // https://github.com/openssh/openssh-portable/blob/86bdd385/sshconnect2.c#L709 + if !contains(algorithmsForKeyFormat(key.Type()), msg.Algo) { + return false, nil + } + if !bytes.Equal(msg.PubKey, pubKey) { return false, nil } return true, nil From 905d78a692675acab06328af80cdfe0b681c8fc7 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Sun, 5 May 2024 13:11:57 +0000 Subject: [PATCH 09/57] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I19d5fc3e26b53fba06b4fbcf3817c44477265210 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/583355 Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Than McIntosh Reviewed-by: Dmitri Shuralyov --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index 4a12d7a426..e5e51cb287 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.19.0 - golang.org/x/term v0.19.0 + golang.org/x/sys v0.20.0 + golang.org/x/term v0.20.0 ) -require golang.org/x/text v0.14.0 // indirect +require golang.org/x/text v0.15.0 // indirect diff --git a/go.sum b/go.sum index fab3a4a74d..ea7e57f729 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.19.0 h1:+ThwsDv+tYfnJFhF4L8jITxu1tdTWRTZpdsWgEgjL6Q= -golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.20.0 h1:VnkxpohqXaOBYJtBmEppKUG6mXpi+4O6purfc2+sMhw= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From 10f366e7a2b3254f25277b2c11f89b3f26fb8df1 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 24 Nov 2023 01:16:03 +0100 Subject: [PATCH 10/57] sha3: simplify XOR functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit name old time/op new time/op delta PermutationFunction-4 398ns ± 0% 399ns ± 1% ~ (p=0.508 n=9+10) Sha3_512_MTU-4 8.34µs ± 1% 8.36µs ± 1% ~ (p=0.101 n=10+10) Sha3_384_MTU-4 6.00µs ± 0% 6.02µs ± 1% +0.47% (p=0.000 n=8+10) Sha3_256_MTU-4 4.78µs ± 0% 4.79µs ± 1% ~ (p=0.324 n=10+10) Sha3_224_MTU-4 4.57µs ± 1% 4.57µs ± 1% ~ (p=0.288 n=10+10) Shake128_MTU-4 3.87µs ± 0% 3.86µs ± 1% -0.22% (p=0.008 n=9+9) Shake256_MTU-4 4.17µs ± 0% 4.17µs ± 0% ~ (p=0.474 n=10+8) Shake256_16x-4 59.4µs ± 0% 59.7µs ± 0% +0.48% (p=0.000 n=9+8) Shake256_1MiB-4 3.19ms ± 1% 3.20ms ± 0% ~ (p=0.105 n=10+10) Sha3_512_1MiB-4 5.97ms ± 0% 6.01ms ± 0% +0.75% (p=0.000 n=10+10) name old speed new speed delta PermutationFunction-4 502MB/s ± 0% 502MB/s ± 0% ~ (p=0.497 n=9+10) Sha3_512_MTU-4 162MB/s ± 1% 161MB/s ± 1% ~ (p=0.101 n=10+10) Sha3_384_MTU-4 225MB/s ± 0% 224MB/s ± 1% -0.47% (p=0.000 n=8+10) Sha3_256_MTU-4 282MB/s ± 0% 282MB/s ± 1% ~ (p=0.325 n=10+10) Sha3_224_MTU-4 296MB/s ± 1% 295MB/s ± 1% ~ (p=0.280 n=10+10) Shake128_MTU-4 349MB/s ± 0% 350MB/s ± 1% +0.22% (p=0.008 n=9+9) Shake256_MTU-4 324MB/s ± 0% 324MB/s ± 0% ~ (p=0.459 n=10+8) Shake256_16x-4 276MB/s ± 0% 274MB/s ± 0% -0.48% (p=0.000 n=9+8) Shake256_1MiB-4 328MB/s ± 1% 327MB/s ± 0% ~ (p=0.105 n=10+10) Sha3_512_1MiB-4 176MB/s ± 0% 174MB/s ± 0% -0.74% (p=0.000 n=10+10) Change-Id: Ib8e571f3c9a0f84096df2f38ca96da197ad5be30 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/544815 Auto-Submit: Filippo Valsorda Reviewed-by: Roland Shoemaker Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI Reviewed-by: Mauri de Souza Meneguzzo --- sha3/sha3.go | 20 +-- sha3/sha3_test.go | 295 ++++++++++++++++++++---------------------- sha3/xor.go | 45 +++++-- sha3/xor_generic.go | 28 ---- sha3/xor_unaligned.go | 66 ---------- 5 files changed, 178 insertions(+), 276 deletions(-) delete mode 100644 sha3/xor_generic.go delete mode 100644 sha3/xor_unaligned.go diff --git a/sha3/sha3.go b/sha3/sha3.go index 4884d172a4..33bd73b0f6 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -40,7 +40,7 @@ type state struct { // Extendable-Output Functions (May 2014)" dsbyte byte - storage storageBuf + storage [maxRate]byte // Specific to SHA-3 and SHAKE. outputLen int // the default output size in bytes @@ -61,15 +61,15 @@ func (d *state) Reset() { d.a[i] = 0 } d.state = spongeAbsorbing - d.buf = d.storage.asBytes()[:0] + d.buf = d.storage[:0] } func (d *state) clone() *state { ret := *d if ret.state == spongeAbsorbing { - ret.buf = ret.storage.asBytes()[:len(ret.buf)] + ret.buf = ret.storage[:len(ret.buf)] } else { - ret.buf = ret.storage.asBytes()[d.rate-cap(d.buf) : d.rate] + ret.buf = ret.storage[d.rate-cap(d.buf) : d.rate] } return &ret @@ -83,13 +83,13 @@ func (d *state) permute() { // If we're absorbing, we need to xor the input into the state // before applying the permutation. xorIn(d, d.buf) - d.buf = d.storage.asBytes()[:0] + d.buf = d.storage[:0] keccakF1600(&d.a) case spongeSqueezing: // If we're squeezing, we need to apply the permutation before // copying more output. keccakF1600(&d.a) - d.buf = d.storage.asBytes()[:d.rate] + d.buf = d.storage[:d.rate] copyOut(d, d.buf) } } @@ -98,7 +98,7 @@ func (d *state) permute() { // the multi-bitrate 10..1 padding rule, and permutes the state. func (d *state) padAndPermute(dsbyte byte) { if d.buf == nil { - d.buf = d.storage.asBytes()[:0] + d.buf = d.storage[:0] } // Pad with this instance's domain-separator bits. We know that there's // at least one byte of space in d.buf because, if it were full, @@ -106,7 +106,7 @@ func (d *state) padAndPermute(dsbyte byte) { // first one bit for the padding. See the comment in the state struct. d.buf = append(d.buf, dsbyte) zerosStart := len(d.buf) - d.buf = d.storage.asBytes()[:d.rate] + d.buf = d.storage[:d.rate] for i := zerosStart; i < d.rate; i++ { d.buf[i] = 0 } @@ -117,7 +117,7 @@ func (d *state) padAndPermute(dsbyte byte) { // Apply the permutation d.permute() d.state = spongeSqueezing - d.buf = d.storage.asBytes()[:d.rate] + d.buf = d.storage[:d.rate] copyOut(d, d.buf) } @@ -128,7 +128,7 @@ func (d *state) Write(p []byte) (written int, err error) { panic("sha3: Write after Read") } if d.buf == nil { - d.buf = d.storage.asBytes()[:0] + d.buf = d.storage[:0] } written = len(p) diff --git a/sha3/sha3_test.go b/sha3/sha3_test.go index afcb722e7a..21e8cbad7b 100644 --- a/sha3/sha3_test.go +++ b/sha3/sha3_test.go @@ -76,86 +76,73 @@ type KeccakKats struct { } } -func testUnalignedAndGeneric(t *testing.T, testf func(impl string)) { - xorInOrig, copyOutOrig := xorIn, copyOut - xorIn, copyOut = xorInGeneric, copyOutGeneric - testf("generic") - if xorImplementationUnaligned != "generic" { - xorIn, copyOut = xorInUnaligned, copyOutUnaligned - testf("unaligned") - } - xorIn, copyOut = xorInOrig, copyOutOrig -} - // TestKeccakKats tests the SHA-3 and Shake implementations against all the // ShortMsgKATs from https://github.com/gvanas/KeccakCodePackage // (The testvectors are stored in keccakKats.json.deflate due to their length.) func TestKeccakKats(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - // Read the KATs. - deflated, err := os.Open(katFilename) - if err != nil { - t.Errorf("error opening %s: %s", katFilename, err) - } - file := flate.NewReader(deflated) - dec := json.NewDecoder(file) - var katSet KeccakKats - err = dec.Decode(&katSet) - if err != nil { - t.Errorf("error decoding KATs: %s", err) - } + // Read the KATs. + deflated, err := os.Open(katFilename) + if err != nil { + t.Errorf("error opening %s: %s", katFilename, err) + } + file := flate.NewReader(deflated) + dec := json.NewDecoder(file) + var katSet KeccakKats + err = dec.Decode(&katSet) + if err != nil { + t.Errorf("error decoding KATs: %s", err) + } - for algo, function := range testDigests { - d := function() - for _, kat := range katSet.Kats[algo] { - d.Reset() - in, err := hex.DecodeString(kat.Message) - if err != nil { - t.Errorf("error decoding KAT: %s", err) - } - d.Write(in[:kat.Length/8]) - got := strings.ToUpper(hex.EncodeToString(d.Sum(nil))) - if got != kat.Digest { - t.Errorf("function=%s, implementation=%s, length=%d\nmessage:\n %s\ngot:\n %s\nwanted:\n %s", - algo, impl, kat.Length, kat.Message, got, kat.Digest) - t.Logf("wanted %+v", kat) - t.FailNow() - } - continue + for algo, function := range testDigests { + d := function() + for _, kat := range katSet.Kats[algo] { + d.Reset() + in, err := hex.DecodeString(kat.Message) + if err != nil { + t.Errorf("error decoding KAT: %s", err) + } + d.Write(in[:kat.Length/8]) + got := strings.ToUpper(hex.EncodeToString(d.Sum(nil))) + if got != kat.Digest { + t.Errorf("function=%s, length=%d\nmessage:\n %s\ngot:\n %s\nwanted:\n %s", + algo, kat.Length, kat.Message, got, kat.Digest) + t.Logf("wanted %+v", kat) + t.FailNow() } + continue } + } - for algo, v := range testShakes { - for _, kat := range katSet.Kats[algo] { - N, err := hex.DecodeString(kat.N) - if err != nil { - t.Errorf("error decoding KAT: %s", err) - } + for algo, v := range testShakes { + for _, kat := range katSet.Kats[algo] { + N, err := hex.DecodeString(kat.N) + if err != nil { + t.Errorf("error decoding KAT: %s", err) + } - S, err := hex.DecodeString(kat.S) - if err != nil { - t.Errorf("error decoding KAT: %s", err) - } - d := v.constructor(N, S) - in, err := hex.DecodeString(kat.Message) - if err != nil { - t.Errorf("error decoding KAT: %s", err) - } + S, err := hex.DecodeString(kat.S) + if err != nil { + t.Errorf("error decoding KAT: %s", err) + } + d := v.constructor(N, S) + in, err := hex.DecodeString(kat.Message) + if err != nil { + t.Errorf("error decoding KAT: %s", err) + } - d.Write(in[:kat.Length/8]) - out := make([]byte, len(kat.Digest)/2) - d.Read(out) - got := strings.ToUpper(hex.EncodeToString(out)) - if got != kat.Digest { - t.Errorf("function=%s, implementation=%s, length=%d N:%s\n S:%s\nmessage:\n %s \ngot:\n %s\nwanted:\n %s", - algo, impl, kat.Length, kat.N, kat.S, kat.Message, got, kat.Digest) - t.Logf("wanted %+v", kat) - t.FailNow() - } - continue + d.Write(in[:kat.Length/8]) + out := make([]byte, len(kat.Digest)/2) + d.Read(out) + got := strings.ToUpper(hex.EncodeToString(out)) + if got != kat.Digest { + t.Errorf("function=%s, length=%d N:%s\n S:%s\nmessage:\n %s \ngot:\n %s\nwanted:\n %s", + algo, kat.Length, kat.N, kat.S, kat.Message, got, kat.Digest) + t.Logf("wanted %+v", kat) + t.FailNow() } + continue } - }) + } } // TestKeccak does a basic test of the non-standardized Keccak hash functions. @@ -219,119 +206,111 @@ func TestShakeSum(t *testing.T) { // TestUnalignedWrite tests that writing data in an arbitrary pattern with // small input buffers. func TestUnalignedWrite(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - buf := sequentialBytes(0x10000) - for alg, df := range testDigests { - d := df() - d.Reset() - d.Write(buf) - want := d.Sum(nil) - d.Reset() - for i := 0; i < len(buf); { - // Cycle through offsets which make a 137 byte sequence. - // Because 137 is prime this sequence should exercise all corner cases. - offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1} - for _, j := range offsets { - if v := len(buf) - i; v < j { - j = v - } - d.Write(buf[i : i+j]) - i += j + buf := sequentialBytes(0x10000) + for alg, df := range testDigests { + d := df() + d.Reset() + d.Write(buf) + want := d.Sum(nil) + d.Reset() + for i := 0; i < len(buf); { + // Cycle through offsets which make a 137 byte sequence. + // Because 137 is prime this sequence should exercise all corner cases. + offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1} + for _, j := range offsets { + if v := len(buf) - i; v < j { + j = v } - } - got := d.Sum(nil) - if !bytes.Equal(got, want) { - t.Errorf("Unaligned writes, implementation=%s, alg=%s\ngot %q, want %q", impl, alg, got, want) + d.Write(buf[i : i+j]) + i += j } } + got := d.Sum(nil) + if !bytes.Equal(got, want) { + t.Errorf("Unaligned writes, alg=%s\ngot %q, want %q", alg, got, want) + } + } - // Same for SHAKE - for alg, df := range testShakes { - want := make([]byte, 16) - got := make([]byte, 16) - d := df.constructor([]byte(df.defAlgoName), []byte(df.defCustomStr)) - - d.Reset() - d.Write(buf) - d.Read(want) - d.Reset() - for i := 0; i < len(buf); { - // Cycle through offsets which make a 137 byte sequence. - // Because 137 is prime this sequence should exercise all corner cases. - offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1} - for _, j := range offsets { - if v := len(buf) - i; v < j { - j = v - } - d.Write(buf[i : i+j]) - i += j + // Same for SHAKE + for alg, df := range testShakes { + want := make([]byte, 16) + got := make([]byte, 16) + d := df.constructor([]byte(df.defAlgoName), []byte(df.defCustomStr)) + + d.Reset() + d.Write(buf) + d.Read(want) + d.Reset() + for i := 0; i < len(buf); { + // Cycle through offsets which make a 137 byte sequence. + // Because 137 is prime this sequence should exercise all corner cases. + offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1} + for _, j := range offsets { + if v := len(buf) - i; v < j { + j = v } - } - d.Read(got) - if !bytes.Equal(got, want) { - t.Errorf("Unaligned writes, implementation=%s, alg=%s\ngot %q, want %q", impl, alg, got, want) + d.Write(buf[i : i+j]) + i += j } } - }) + d.Read(got) + if !bytes.Equal(got, want) { + t.Errorf("Unaligned writes, alg=%s\ngot %q, want %q", alg, got, want) + } + } } // TestAppend checks that appending works when reallocation is necessary. func TestAppend(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - d := New224() + d := New224() - for capacity := 2; capacity <= 66; capacity += 64 { - // The first time around the loop, Sum will have to reallocate. - // The second time, it will not. - buf := make([]byte, 2, capacity) - d.Reset() - d.Write([]byte{0xcc}) - buf = d.Sum(buf) - expected := "0000DF70ADC49B2E76EEE3A6931B93FA41841C3AF2CDF5B32A18B5478C39" - if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected { - t.Errorf("got %s, want %s", got, expected) - } + for capacity := 2; capacity <= 66; capacity += 64 { + // The first time around the loop, Sum will have to reallocate. + // The second time, it will not. + buf := make([]byte, 2, capacity) + d.Reset() + d.Write([]byte{0xcc}) + buf = d.Sum(buf) + expected := "0000DF70ADC49B2E76EEE3A6931B93FA41841C3AF2CDF5B32A18B5478C39" + if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected { + t.Errorf("got %s, want %s", got, expected) } - }) + } } // TestAppendNoRealloc tests that appending works when no reallocation is necessary. func TestAppendNoRealloc(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - buf := make([]byte, 1, 200) - d := New224() - d.Write([]byte{0xcc}) - buf = d.Sum(buf) - expected := "00DF70ADC49B2E76EEE3A6931B93FA41841C3AF2CDF5B32A18B5478C39" - if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected { - t.Errorf("%s: got %s, want %s", impl, got, expected) - } - }) + buf := make([]byte, 1, 200) + d := New224() + d.Write([]byte{0xcc}) + buf = d.Sum(buf) + expected := "00DF70ADC49B2E76EEE3A6931B93FA41841C3AF2CDF5B32A18B5478C39" + if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected { + t.Errorf("got %s, want %s", got, expected) + } } // TestSqueezing checks that squeezing the full output a single time produces // the same output as repeatedly squeezing the instance. func TestSqueezing(t *testing.T) { - testUnalignedAndGeneric(t, func(impl string) { - for algo, v := range testShakes { - d0 := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr)) - d0.Write([]byte(testString)) - ref := make([]byte, 32) - d0.Read(ref) - - d1 := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr)) - d1.Write([]byte(testString)) - var multiple []byte - for range ref { - one := make([]byte, 1) - d1.Read(one) - multiple = append(multiple, one...) - } - if !bytes.Equal(ref, multiple) { - t.Errorf("%s (%s): squeezing %d bytes one at a time failed", algo, impl, len(ref)) - } + for algo, v := range testShakes { + d0 := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr)) + d0.Write([]byte(testString)) + ref := make([]byte, 32) + d0.Read(ref) + + d1 := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr)) + d1.Write([]byte(testString)) + var multiple []byte + for range ref { + one := make([]byte, 1) + d1.Read(one) + multiple = append(multiple, one...) + } + if !bytes.Equal(ref, multiple) { + t.Errorf("%s: squeezing %d bytes one at a time failed", algo, len(ref)) } - }) + } } // sequentialBytes produces a buffer of size consecutive bytes 0x00, 0x01, ..., used for testing. diff --git a/sha3/xor.go b/sha3/xor.go index 7337cca88e..6ada5c9574 100644 --- a/sha3/xor.go +++ b/sha3/xor.go @@ -2,22 +2,39 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!amd64 && !386 && !ppc64le) || purego - package sha3 -// A storageBuf is an aligned array of maxRate bytes. -type storageBuf [maxRate]byte - -func (b *storageBuf) asBytes() *[maxRate]byte { - return (*[maxRate]byte)(b) -} +import ( + "crypto/subtle" + "encoding/binary" + "unsafe" -var ( - xorIn = xorInGeneric - copyOut = copyOutGeneric - xorInUnaligned = xorInGeneric - copyOutUnaligned = copyOutGeneric + "golang.org/x/sys/cpu" ) -const xorImplementationUnaligned = "generic" +// xorIn xors the bytes in buf into the state. +func xorIn(d *state, buf []byte) { + if cpu.IsBigEndian { + for i := 0; len(buf) >= 8; i++ { + a := binary.LittleEndian.Uint64(buf) + d.a[i] ^= a + buf = buf[8:] + } + } else { + ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) + subtle.XORBytes(ab[:], ab[:], buf) + } +} + +// copyOut copies uint64s to a byte buffer. +func copyOut(d *state, b []byte) { + if cpu.IsBigEndian { + for i := 0; len(b) >= 8; i++ { + binary.LittleEndian.PutUint64(b, d.a[i]) + b = b[8:] + } + } else { + ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) + copy(b, ab[:]) + } +} diff --git a/sha3/xor_generic.go b/sha3/xor_generic.go deleted file mode 100644 index 8d94771127..0000000000 --- a/sha3/xor_generic.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package sha3 - -import "encoding/binary" - -// xorInGeneric xors the bytes in buf into the state; it -// makes no non-portable assumptions about memory layout -// or alignment. -func xorInGeneric(d *state, buf []byte) { - n := len(buf) / 8 - - for i := 0; i < n; i++ { - a := binary.LittleEndian.Uint64(buf) - d.a[i] ^= a - buf = buf[8:] - } -} - -// copyOutGeneric copies uint64s to a byte buffer. -func copyOutGeneric(d *state, b []byte) { - for i := 0; len(b) >= 8; i++ { - binary.LittleEndian.PutUint64(b, d.a[i]) - b = b[8:] - } -} diff --git a/sha3/xor_unaligned.go b/sha3/xor_unaligned.go deleted file mode 100644 index 870e2d16e0..0000000000 --- a/sha3/xor_unaligned.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (amd64 || 386 || ppc64le) && !purego - -package sha3 - -import "unsafe" - -// A storageBuf is an aligned array of maxRate bytes. -type storageBuf [maxRate / 8]uint64 - -func (b *storageBuf) asBytes() *[maxRate]byte { - return (*[maxRate]byte)(unsafe.Pointer(b)) -} - -// xorInUnaligned uses unaligned reads and writes to update d.a to contain d.a -// XOR buf. -func xorInUnaligned(d *state, buf []byte) { - n := len(buf) - bw := (*[maxRate / 8]uint64)(unsafe.Pointer(&buf[0]))[: n/8 : n/8] - if n >= 72 { - d.a[0] ^= bw[0] - d.a[1] ^= bw[1] - d.a[2] ^= bw[2] - d.a[3] ^= bw[3] - d.a[4] ^= bw[4] - d.a[5] ^= bw[5] - d.a[6] ^= bw[6] - d.a[7] ^= bw[7] - d.a[8] ^= bw[8] - } - if n >= 104 { - d.a[9] ^= bw[9] - d.a[10] ^= bw[10] - d.a[11] ^= bw[11] - d.a[12] ^= bw[12] - } - if n >= 136 { - d.a[13] ^= bw[13] - d.a[14] ^= bw[14] - d.a[15] ^= bw[15] - d.a[16] ^= bw[16] - } - if n >= 144 { - d.a[17] ^= bw[17] - } - if n >= 168 { - d.a[18] ^= bw[18] - d.a[19] ^= bw[19] - d.a[20] ^= bw[20] - } -} - -func copyOutUnaligned(d *state, buf []byte) { - ab := (*[maxRate]uint8)(unsafe.Pointer(&d.a[0])) - copy(buf, ab[:]) -} - -var ( - xorIn = xorInUnaligned - copyOut = copyOutUnaligned -) - -const xorImplementationUnaligned = "unaligned" From 59b5a86796b9d310b31d416f56d93b5ce30da22b Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 24 Nov 2023 01:20:48 +0100 Subject: [PATCH 11/57] sha3: disable s390x assembly It was integrated in such a way that it made devirtualization impossible, leading to allocations on every platform. It can be reintroduced according to AssemblyPolicy and TargetSpecific. Updates #64897 Change-Id: I3a4edc91185c2928b2c9b80655a2bc8daa6b44e3 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/544816 Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Mauri de Souza Meneguzzo Auto-Submit: Filippo Valsorda Reviewed-by: Cherry Mui --- sha3/hashes.go | 12 ------------ sha3/hashes_generic.go | 27 --------------------------- sha3/sha3_s390x.go | 2 +- sha3/sha3_s390x.s | 2 +- sha3/shake.go | 6 ------ sha3/shake_generic.go | 19 ------------------- 6 files changed, 2 insertions(+), 66 deletions(-) delete mode 100644 sha3/hashes_generic.go delete mode 100644 sha3/shake_generic.go diff --git a/sha3/hashes.go b/sha3/hashes.go index 0d8043fd2a..1e815c9c7a 100644 --- a/sha3/hashes.go +++ b/sha3/hashes.go @@ -16,9 +16,6 @@ import ( // Its generic security strength is 224 bits against preimage attacks, // and 112 bits against collision attacks. func New224() hash.Hash { - if h := new224Asm(); h != nil { - return h - } return &state{rate: 144, outputLen: 28, dsbyte: 0x06} } @@ -26,9 +23,6 @@ func New224() hash.Hash { // Its generic security strength is 256 bits against preimage attacks, // and 128 bits against collision attacks. func New256() hash.Hash { - if h := new256Asm(); h != nil { - return h - } return &state{rate: 136, outputLen: 32, dsbyte: 0x06} } @@ -36,9 +30,6 @@ func New256() hash.Hash { // Its generic security strength is 384 bits against preimage attacks, // and 192 bits against collision attacks. func New384() hash.Hash { - if h := new384Asm(); h != nil { - return h - } return &state{rate: 104, outputLen: 48, dsbyte: 0x06} } @@ -46,9 +37,6 @@ func New384() hash.Hash { // Its generic security strength is 512 bits against preimage attacks, // and 256 bits against collision attacks. func New512() hash.Hash { - if h := new512Asm(); h != nil { - return h - } return &state{rate: 72, outputLen: 64, dsbyte: 0x06} } diff --git a/sha3/hashes_generic.go b/sha3/hashes_generic.go deleted file mode 100644 index fe8c84793c..0000000000 --- a/sha3/hashes_generic.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !gc || purego || !s390x - -package sha3 - -import ( - "hash" -) - -// new224Asm returns an assembly implementation of SHA3-224 if available, -// otherwise it returns nil. -func new224Asm() hash.Hash { return nil } - -// new256Asm returns an assembly implementation of SHA3-256 if available, -// otherwise it returns nil. -func new256Asm() hash.Hash { return nil } - -// new384Asm returns an assembly implementation of SHA3-384 if available, -// otherwise it returns nil. -func new384Asm() hash.Hash { return nil } - -// new512Asm returns an assembly implementation of SHA3-512 if available, -// otherwise it returns nil. -func new512Asm() hash.Hash { return nil } diff --git a/sha3/sha3_s390x.go b/sha3/sha3_s390x.go index b4fbbf8695..26b728b836 100644 --- a/sha3/sha3_s390x.go +++ b/sha3/sha3_s390x.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && ignore package sha3 diff --git a/sha3/sha3_s390x.s b/sha3/sha3_s390x.s index 826b862c77..df51683097 100644 --- a/sha3/sha3_s390x.s +++ b/sha3/sha3_s390x.s @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && ignore #include "textflag.h" diff --git a/sha3/shake.go b/sha3/shake.go index bb69984027..a31bcf898c 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -115,9 +115,6 @@ func (c *state) Clone() ShakeHash { // Its generic security strength is 128 bits against all attacks if at // least 32 bytes of its output are used. func NewShake128() ShakeHash { - if h := newShake128Asm(); h != nil { - return h - } return &state{rate: rate128, outputLen: 32, dsbyte: dsbyteShake} } @@ -125,9 +122,6 @@ func NewShake128() ShakeHash { // Its generic security strength is 256 bits against all attacks if // at least 64 bytes of its output are used. func NewShake256() ShakeHash { - if h := newShake256Asm(); h != nil { - return h - } return &state{rate: rate256, outputLen: 64, dsbyte: dsbyteShake} } diff --git a/sha3/shake_generic.go b/sha3/shake_generic.go deleted file mode 100644 index 8d31cf5be2..0000000000 --- a/sha3/shake_generic.go +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !gc || purego || !s390x - -package sha3 - -// newShake128Asm returns an assembly implementation of SHAKE-128 if available, -// otherwise it returns nil. -func newShake128Asm() ShakeHash { - return nil -} - -// newShake256Asm returns an assembly implementation of SHAKE-256 if available, -// otherwise it returns nil. -func newShake256Asm() ShakeHash { - return nil -} From 477a5b4c327a4fea3cab2fe127f89940289b65e5 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 24 Nov 2023 01:25:35 +0100 Subject: [PATCH 12/57] sha3: make APIs usable with zero allocations The "buf points into storage" pattern is nice, but causes the whole state struct to escape, since escape analysis can't track the pointer once it's assigned to buf. Change-Id: I31c0e83f946d66bedb5a180e96ab5d5e936eb322 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/544817 Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker Reviewed-by: Mauri de Souza Meneguzzo Auto-Submit: Filippo Valsorda --- sha3/allocations_test.go | 53 +++++++++++++++++++++++++++++++++++ sha3/sha3.go | 60 ++++++++++++++++------------------------ 2 files changed, 77 insertions(+), 36 deletions(-) create mode 100644 sha3/allocations_test.go diff --git a/sha3/allocations_test.go b/sha3/allocations_test.go new file mode 100644 index 0000000000..c925099304 --- /dev/null +++ b/sha3/allocations_test.go @@ -0,0 +1,53 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noopt + +package sha3_test + +import ( + "testing" + + "golang.org/x/crypto/sha3" +) + +var sink byte + +func TestAllocations(t *testing.T) { + t.Run("New", func(t *testing.T) { + if allocs := testing.AllocsPerRun(10, func() { + h := sha3.New256() + b := []byte("ABC") + h.Write(b) + out := make([]byte, 0, 32) + out = h.Sum(out) + sink ^= out[0] + }); allocs > 0 { + t.Errorf("expected zero allocations, got %0.1f", allocs) + } + }) + t.Run("NewShake", func(t *testing.T) { + if allocs := testing.AllocsPerRun(10, func() { + h := sha3.NewShake128() + b := []byte("ABC") + h.Write(b) + out := make([]byte, 0, 32) + out = h.Sum(out) + sink ^= out[0] + h.Read(out) + sink ^= out[0] + }); allocs > 0 { + t.Errorf("expected zero allocations, got %0.1f", allocs) + } + }) + t.Run("Sum", func(t *testing.T) { + if allocs := testing.AllocsPerRun(10, func() { + b := []byte("ABC") + out := sha3.Sum256(b) + sink ^= out[0] + }); allocs > 0 { + t.Errorf("expected zero allocations, got %0.1f", allocs) + } + }) +} diff --git a/sha3/sha3.go b/sha3/sha3.go index 33bd73b0f6..afedde5abf 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -23,7 +23,6 @@ const ( type state struct { // Generic sponge components. a [25]uint64 // main state of the hash - buf []byte // points into storage rate int // the number of bytes of state to use // dsbyte contains the "domain separation" bits and the first bit of @@ -40,6 +39,7 @@ type state struct { // Extendable-Output Functions (May 2014)" dsbyte byte + i, n int // storage[i:n] is the buffer, i is only used while squeezing storage [maxRate]byte // Specific to SHA-3 and SHAKE. @@ -54,24 +54,18 @@ func (d *state) BlockSize() int { return d.rate } func (d *state) Size() int { return d.outputLen } // Reset clears the internal state by zeroing the sponge state and -// the byte buffer, and setting Sponge.state to absorbing. +// the buffer indexes, and setting Sponge.state to absorbing. func (d *state) Reset() { // Zero the permutation's state. for i := range d.a { d.a[i] = 0 } d.state = spongeAbsorbing - d.buf = d.storage[:0] + d.i, d.n = 0, 0 } func (d *state) clone() *state { ret := *d - if ret.state == spongeAbsorbing { - ret.buf = ret.storage[:len(ret.buf)] - } else { - ret.buf = ret.storage[d.rate-cap(d.buf) : d.rate] - } - return &ret } @@ -82,43 +76,40 @@ func (d *state) permute() { case spongeAbsorbing: // If we're absorbing, we need to xor the input into the state // before applying the permutation. - xorIn(d, d.buf) - d.buf = d.storage[:0] + xorIn(d, d.storage[:d.rate]) + d.n = 0 keccakF1600(&d.a) case spongeSqueezing: // If we're squeezing, we need to apply the permutation before // copying more output. keccakF1600(&d.a) - d.buf = d.storage[:d.rate] - copyOut(d, d.buf) + d.i = 0 + copyOut(d, d.storage[:d.rate]) } } // pads appends the domain separation bits in dsbyte, applies // the multi-bitrate 10..1 padding rule, and permutes the state. -func (d *state) padAndPermute(dsbyte byte) { - if d.buf == nil { - d.buf = d.storage[:0] - } +func (d *state) padAndPermute() { // Pad with this instance's domain-separator bits. We know that there's // at least one byte of space in d.buf because, if it were full, // permute would have been called to empty it. dsbyte also contains the // first one bit for the padding. See the comment in the state struct. - d.buf = append(d.buf, dsbyte) - zerosStart := len(d.buf) - d.buf = d.storage[:d.rate] - for i := zerosStart; i < d.rate; i++ { - d.buf[i] = 0 + d.storage[d.n] = d.dsbyte + d.n++ + for d.n < d.rate { + d.storage[d.n] = 0 + d.n++ } // This adds the final one bit for the padding. Because of the way that // bits are numbered from the LSB upwards, the final bit is the MSB of // the last byte. - d.buf[d.rate-1] ^= 0x80 + d.storage[d.rate-1] ^= 0x80 // Apply the permutation d.permute() d.state = spongeSqueezing - d.buf = d.storage[:d.rate] - copyOut(d, d.buf) + d.n = d.rate + copyOut(d, d.storage[:d.rate]) } // Write absorbs more data into the hash's state. It panics if any @@ -127,28 +118,25 @@ func (d *state) Write(p []byte) (written int, err error) { if d.state != spongeAbsorbing { panic("sha3: Write after Read") } - if d.buf == nil { - d.buf = d.storage[:0] - } written = len(p) for len(p) > 0 { - if len(d.buf) == 0 && len(p) >= d.rate { + if d.n == 0 && len(p) >= d.rate { // The fast path; absorb a full "rate" bytes of input and apply the permutation. xorIn(d, p[:d.rate]) p = p[d.rate:] keccakF1600(&d.a) } else { // The slow path; buffer the input until we can fill the sponge, and then xor it in. - todo := d.rate - len(d.buf) + todo := d.rate - d.n if todo > len(p) { todo = len(p) } - d.buf = append(d.buf, p[:todo]...) + d.n += copy(d.storage[d.n:], p[:todo]) p = p[todo:] // If the sponge is full, apply the permutation. - if len(d.buf) == d.rate { + if d.n == d.rate { d.permute() } } @@ -161,19 +149,19 @@ func (d *state) Write(p []byte) (written int, err error) { func (d *state) Read(out []byte) (n int, err error) { // If we're still absorbing, pad and apply the permutation. if d.state == spongeAbsorbing { - d.padAndPermute(d.dsbyte) + d.padAndPermute() } n = len(out) // Now, do the squeezing. for len(out) > 0 { - n := copy(out, d.buf) - d.buf = d.buf[n:] + n := copy(out, d.storage[d.i:d.n]) + d.i += n out = out[n:] // Apply the permutation if we've squeezed the sponge dry. - if len(d.buf) == 0 { + if d.i == d.rate { d.permute() } } From 67b13616a59528f2f948f405d79d6e7df0b97d12 Mon Sep 17 00:00:00 2001 From: Mauri de Souza Meneguzzo Date: Sat, 6 Jan 2024 18:52:35 +0000 Subject: [PATCH 13/57] sha3: reenable s390x assembly Fixes golang/go#64897 Change-Id: I0c8c52d73a7d2df0f44fee36d407a87213f59bff Reviewed-on: https://go-review.googlesource.com/c/crypto/+/554435 TryBot-Result: Gopher Robot Reviewed-by: Cherry Mui Reviewed-by: Filippo Valsorda Reviewed-by: Roland Shoemaker Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI --- sha3/allocations_test.go | 14 ++++++++--- sha3/hashes.go | 22 +++++++++++++++--- sha3/hashes_noasm.go | 23 ++++++++++++++++++ sha3/sha3_s390x.go | 50 ++++++++++++++++++++-------------------- sha3/sha3_s390x.s | 2 +- sha3/shake.go | 10 +++++++- sha3/shake_noasm.go | 15 ++++++++++++ 7 files changed, 103 insertions(+), 33 deletions(-) create mode 100644 sha3/hashes_noasm.go create mode 100644 sha3/shake_noasm.go diff --git a/sha3/allocations_test.go b/sha3/allocations_test.go index c925099304..36de5d547e 100644 --- a/sha3/allocations_test.go +++ b/sha3/allocations_test.go @@ -7,6 +7,7 @@ package sha3_test import ( + "runtime" "testing" "golang.org/x/crypto/sha3" @@ -15,6 +16,13 @@ import ( var sink byte func TestAllocations(t *testing.T) { + want := 0.0 + + if runtime.GOARCH == "s390x" { + // On s390x the returned hash.Hash is conditional so it escapes. + want = 3.0 + } + t.Run("New", func(t *testing.T) { if allocs := testing.AllocsPerRun(10, func() { h := sha3.New256() @@ -23,7 +31,7 @@ func TestAllocations(t *testing.T) { out := make([]byte, 0, 32) out = h.Sum(out) sink ^= out[0] - }); allocs > 0 { + }); allocs > want { t.Errorf("expected zero allocations, got %0.1f", allocs) } }) @@ -37,7 +45,7 @@ func TestAllocations(t *testing.T) { sink ^= out[0] h.Read(out) sink ^= out[0] - }); allocs > 0 { + }); allocs > want { t.Errorf("expected zero allocations, got %0.1f", allocs) } }) @@ -46,7 +54,7 @@ func TestAllocations(t *testing.T) { b := []byte("ABC") out := sha3.Sum256(b) sink ^= out[0] - }); allocs > 0 { + }); allocs > want { t.Errorf("expected zero allocations, got %0.1f", allocs) } }) diff --git a/sha3/hashes.go b/sha3/hashes.go index 1e815c9c7a..5eae6cb922 100644 --- a/sha3/hashes.go +++ b/sha3/hashes.go @@ -16,27 +16,43 @@ import ( // Its generic security strength is 224 bits against preimage attacks, // and 112 bits against collision attacks. func New224() hash.Hash { - return &state{rate: 144, outputLen: 28, dsbyte: 0x06} + return new224() } // New256 creates a new SHA3-256 hash. // Its generic security strength is 256 bits against preimage attacks, // and 128 bits against collision attacks. func New256() hash.Hash { - return &state{rate: 136, outputLen: 32, dsbyte: 0x06} + return new256() } // New384 creates a new SHA3-384 hash. // Its generic security strength is 384 bits against preimage attacks, // and 192 bits against collision attacks. func New384() hash.Hash { - return &state{rate: 104, outputLen: 48, dsbyte: 0x06} + return new384() } // New512 creates a new SHA3-512 hash. // Its generic security strength is 512 bits against preimage attacks, // and 256 bits against collision attacks. func New512() hash.Hash { + return new512() +} + +func new224Generic() *state { + return &state{rate: 144, outputLen: 28, dsbyte: 0x06} +} + +func new256Generic() *state { + return &state{rate: 136, outputLen: 32, dsbyte: 0x06} +} + +func new384Generic() *state { + return &state{rate: 104, outputLen: 48, dsbyte: 0x06} +} + +func new512Generic() *state { return &state{rate: 72, outputLen: 64, dsbyte: 0x06} } diff --git a/sha3/hashes_noasm.go b/sha3/hashes_noasm.go new file mode 100644 index 0000000000..9d85fb6214 --- /dev/null +++ b/sha3/hashes_noasm.go @@ -0,0 +1,23 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !gc || purego || !s390x + +package sha3 + +func new224() *state { + return new224Generic() +} + +func new256() *state { + return new256Generic() +} + +func new384() *state { + return new384Generic() +} + +func new512() *state { + return new512Generic() +} diff --git a/sha3/sha3_s390x.go b/sha3/sha3_s390x.go index 26b728b836..00d8034ae6 100644 --- a/sha3/sha3_s390x.go +++ b/sha3/sha3_s390x.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego && ignore +//go:build gc && !purego package sha3 @@ -248,56 +248,56 @@ func (s *asmState) Clone() ShakeHash { return s.clone() } -// new224Asm returns an assembly implementation of SHA3-224 if available, -// otherwise it returns nil. -func new224Asm() hash.Hash { +// new224 returns an assembly implementation of SHA3-224 if available, +// otherwise it returns a generic implementation. +func new224() hash.Hash { if cpu.S390X.HasSHA3 { return newAsmState(sha3_224) } - return nil + return new224Generic() } -// new256Asm returns an assembly implementation of SHA3-256 if available, -// otherwise it returns nil. -func new256Asm() hash.Hash { +// new256 returns an assembly implementation of SHA3-256 if available, +// otherwise it returns a generic implementation. +func new256() hash.Hash { if cpu.S390X.HasSHA3 { return newAsmState(sha3_256) } - return nil + return new256Generic() } -// new384Asm returns an assembly implementation of SHA3-384 if available, -// otherwise it returns nil. -func new384Asm() hash.Hash { +// new384 returns an assembly implementation of SHA3-384 if available, +// otherwise it returns a generic implementation. +func new384() hash.Hash { if cpu.S390X.HasSHA3 { return newAsmState(sha3_384) } - return nil + return new384Generic() } -// new512Asm returns an assembly implementation of SHA3-512 if available, -// otherwise it returns nil. -func new512Asm() hash.Hash { +// new512 returns an assembly implementation of SHA3-512 if available, +// otherwise it returns a generic implementation. +func new512() hash.Hash { if cpu.S390X.HasSHA3 { return newAsmState(sha3_512) } - return nil + return new512Generic() } -// newShake128Asm returns an assembly implementation of SHAKE-128 if available, -// otherwise it returns nil. -func newShake128Asm() ShakeHash { +// newShake128 returns an assembly implementation of SHAKE-128 if available, +// otherwise it returns a generic implementation. +func newShake128() ShakeHash { if cpu.S390X.HasSHA3 { return newAsmState(shake_128) } - return nil + return newShake128Generic() } -// newShake256Asm returns an assembly implementation of SHAKE-256 if available, -// otherwise it returns nil. -func newShake256Asm() ShakeHash { +// newShake256 returns an assembly implementation of SHAKE-256 if available, +// otherwise it returns a generic implementation. +func newShake256() ShakeHash { if cpu.S390X.HasSHA3 { return newAsmState(shake_256) } - return nil + return newShake256Generic() } diff --git a/sha3/sha3_s390x.s b/sha3/sha3_s390x.s index df51683097..826b862c77 100644 --- a/sha3/sha3_s390x.s +++ b/sha3/sha3_s390x.s @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego && ignore +//go:build gc && !purego #include "textflag.h" diff --git a/sha3/shake.go b/sha3/shake.go index a31bcf898c..1ea9275b8b 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -115,13 +115,21 @@ func (c *state) Clone() ShakeHash { // Its generic security strength is 128 bits against all attacks if at // least 32 bytes of its output are used. func NewShake128() ShakeHash { - return &state{rate: rate128, outputLen: 32, dsbyte: dsbyteShake} + return newShake128() } // NewShake256 creates a new SHAKE256 variable-output-length ShakeHash. // Its generic security strength is 256 bits against all attacks if // at least 64 bytes of its output are used. func NewShake256() ShakeHash { + return newShake256() +} + +func newShake128Generic() *state { + return &state{rate: rate128, outputLen: 32, dsbyte: dsbyteShake} +} + +func newShake256Generic() *state { return &state{rate: rate256, outputLen: 64, dsbyte: dsbyteShake} } diff --git a/sha3/shake_noasm.go b/sha3/shake_noasm.go new file mode 100644 index 0000000000..4276ba4ab2 --- /dev/null +++ b/sha3/shake_noasm.go @@ -0,0 +1,15 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !gc || purego || !s390x + +package sha3 + +func newShake128() *state { + return newShake128Generic() +} + +func newShake256() *state { + return newShake256Generic() +} From 44c9b0ff9e71f015c49f686c68a7950fac76623c Mon Sep 17 00:00:00 2001 From: Andrew Lytvynov Date: Thu, 25 Jan 2024 18:32:22 -0700 Subject: [PATCH 14/57] ssh: allow server auth callbacks to send additional banners Add a new BannerError error type that auth callbacks can return to send banner to the client. While the BannerCallback can send the initial banner message, auth callbacks might want to communicate more information to the client to help them diagnose failures. Updates golang/go#64962 Change-Id: I97a26480ff4064b95a0a26042b0a5e19737cfb62 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/558695 LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker Reviewed-by: Nicola Murino Auto-Submit: Nicola Murino Reviewed-by: Dmitri Shuralyov --- ssh/server.go | 30 +++++++++++++++++++ ssh/server_test.go | 74 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/ssh/server.go b/ssh/server.go index e2ae4f891b..3ca9e89e22 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -462,6 +462,24 @@ func (p *PartialSuccessError) Error() string { // It is returned in ServerAuthError.Errors from NewServerConn. var ErrNoAuth = errors.New("ssh: no auth passed yet") +// BannerError is an error that can be returned by authentication handlers in +// ServerConfig to send a banner message to the client. +type BannerError struct { + Err error + Message string +} + +func (b *BannerError) Unwrap() error { + return b.Err +} + +func (b *BannerError) Error() string { + if b.Err == nil { + return b.Message + } + return b.Err.Error() +} + func (s *connection) serverAuthenticate(config *ServerConfig) (*Permissions, error) { sessionID := s.transport.getSessionID() var cache pubKeyCache @@ -734,6 +752,18 @@ userAuthLoop: config.AuthLogCallback(s, userAuthReq.Method, authErr) } + var bannerErr *BannerError + if errors.As(authErr, &bannerErr) { + if bannerErr.Message != "" { + bannerMsg := &userAuthBannerMsg{ + Message: bannerErr.Message, + } + if err := s.transport.writePacket(Marshal(bannerMsg)); err != nil { + return nil, err + } + } + } + if authErr == nil { break userAuthLoop } diff --git a/ssh/server_test.go b/ssh/server_test.go index 5b47b9e0af..9057a9b5f0 100644 --- a/ssh/server_test.go +++ b/ssh/server_test.go @@ -6,8 +6,10 @@ package ssh import ( "errors" + "fmt" "io" "net" + "slices" "strings" "sync/atomic" "testing" @@ -225,6 +227,78 @@ func TestNewServerConnValidationErrors(t *testing.T) { } } +func TestBannerError(t *testing.T) { + serverConfig := &ServerConfig{ + BannerCallback: func(ConnMetadata) string { + return "banner from BannerCallback" + }, + NoClientAuth: true, + NoClientAuthCallback: func(ConnMetadata) (*Permissions, error) { + err := &BannerError{ + Err: errors.New("error from NoClientAuthCallback"), + Message: "banner from NoClientAuthCallback", + } + return nil, fmt.Errorf("wrapped: %w", err) + }, + PasswordCallback: func(conn ConnMetadata, password []byte) (*Permissions, error) { + return &Permissions{}, nil + }, + PublicKeyCallback: func(conn ConnMetadata, key PublicKey) (*Permissions, error) { + return nil, &BannerError{ + Err: errors.New("error from PublicKeyCallback"), + Message: "banner from PublicKeyCallback", + } + }, + KeyboardInteractiveCallback: func(conn ConnMetadata, client KeyboardInteractiveChallenge) (*Permissions, error) { + return nil, &BannerError{ + Err: nil, // make sure that a nil inner error is allowed + Message: "banner from KeyboardInteractiveCallback", + } + }, + } + serverConfig.AddHostKey(testSigners["rsa"]) + + var banners []string + clientConfig := &ClientConfig{ + User: "test", + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"]), + KeyboardInteractive(func(name, instruction string, questions []string, echos []bool) ([]string, error) { + return []string{"letmein"}, nil + }), + Password(clientPassword), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + BannerCallback: func(msg string) error { + banners = append(banners, msg) + return nil + }, + } + + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + go newServer(c1, serverConfig) + c, _, _, err := NewClientConn(c2, "", clientConfig) + if err != nil { + t.Fatalf("client connection failed: %v", err) + } + defer c.Close() + + wantBanners := []string{ + "banner from BannerCallback", + "banner from NoClientAuthCallback", + "banner from PublicKeyCallback", + "banner from KeyboardInteractiveCallback", + } + if !slices.Equal(banners, wantBanners) { + t.Errorf("got banners:\n%q\nwant banners:\n%q", banners, wantBanners) + } +} + type markerConn struct { closed uint32 used uint32 From 349231f7e4e437ea89847c5dfce63eed67949f86 Mon Sep 17 00:00:00 2001 From: Mariano Cano Date: Thu, 7 Sep 2023 18:54:31 -0700 Subject: [PATCH 15/57] ssh: implement CryptoPublicKey on sk keys This commit implements the CryptoPublicKey interface for the skECDSAPublicKey and skEd25519PublicKey types. Fixes golang/go#62518 Change-Id: I2b8ac89196fbb3614bf5c675127bed23f1cf6b26 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/526875 LUCI-TryBot-Result: Go LUCI Reviewed-by: Matthew Dempsky Reviewed-by: Than McIntosh Auto-Submit: Nicola Murino Reviewed-by: Nicola Murino --- ssh/keys.go | 8 ++++++++ ssh/keys_test.go | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/ssh/keys.go b/ssh/keys.go index df4ebdada5..7967665f17 100644 --- a/ssh/keys.go +++ b/ssh/keys.go @@ -904,6 +904,10 @@ func (k *skECDSAPublicKey) Verify(data []byte, sig *Signature) error { return errors.New("ssh: signature did not verify") } +func (k *skECDSAPublicKey) CryptoPublicKey() crypto.PublicKey { + return &k.PublicKey +} + type skEd25519PublicKey struct { // application is a URL-like string, typically "ssh:" for SSH. // see openssh/PROTOCOL.u2f for details. @@ -1000,6 +1004,10 @@ func (k *skEd25519PublicKey) Verify(data []byte, sig *Signature) error { return nil } +func (k *skEd25519PublicKey) CryptoPublicKey() crypto.PublicKey { + return k.PublicKey +} + // NewSignerFromKey takes an *rsa.PrivateKey, *dsa.PrivateKey, // *ecdsa.PrivateKey or any other crypto.Signer and returns a // corresponding Signer instance. ECDSA keys must use P-256, P-384 or diff --git a/ssh/keys_test.go b/ssh/keys_test.go index 36e1857039..7b14429e17 100644 --- a/ssh/keys_test.go +++ b/ssh/keys_test.go @@ -726,3 +726,49 @@ func TestNewSignerWithAlgos(t *testing.T) { t.Error("signer with algos created with restricted algorithms") } } + +func TestCryptoPublicKey(t *testing.T) { + for _, priv := range testSigners { + p1 := priv.PublicKey() + key, ok := p1.(CryptoPublicKey) + if !ok { + continue + } + p2, err := NewPublicKey(key.CryptoPublicKey()) + if err != nil { + t.Fatalf("NewPublicKey(CryptoPublicKey) failed for %s, got: %v", p1.Type(), err) + } + if !reflect.DeepEqual(p1, p2) { + t.Errorf("got %#v in NewPublicKey, want %#v", p2, p1) + } + } + for _, d := range testdata.SKData { + p1, _, _, _, err := ParseAuthorizedKey(d.PubKey) + if err != nil { + t.Fatalf("parseAuthorizedKey returned error: %v", err) + } + k1, ok := p1.(CryptoPublicKey) + if !ok { + t.Fatalf("%T does not implement CryptoPublicKey", p1) + } + + var p2 PublicKey + switch pub := k1.CryptoPublicKey().(type) { + case *ecdsa.PublicKey: + p2 = &skECDSAPublicKey{ + application: "ssh:", + PublicKey: *pub, + } + case ed25519.PublicKey: + p2 = &skEd25519PublicKey{ + application: "ssh:", + PublicKey: pub, + } + default: + t.Fatalf("unexpected type %T from CryptoPublicKey()", pub) + } + if !reflect.DeepEqual(p1, p2) { + t.Errorf("got %#v, want %#v", p2, p1) + } + } +} From 0b431c7de36a66b1b5c54f6219ad1413824cd1fd Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 3 Jun 2024 16:01:22 +0000 Subject: [PATCH 16/57] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: I8a1b9637e83214674e6fe82ebf584e9b90446ca3 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/589875 Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Auto-Submit: Gopher Robot Reviewed-by: Damien Neil --- x509roots/fallback/bundle.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index 460c57b4d8..e56011afa0 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -1526,6 +1526,24 @@ u/8j72gZyxKTJ1wDLW8w0B62GqzeWvfRqqgnpv55gcR5mTNXuhKwqeBCbJPKVt7+ bYQLCIt+jerXmCHG8+c8eS9enNFMFY3h7CI3zJpDC5fcgJCNs2ebb0gIFVbPv/Er fF6adulZkMV8gzURZVE= -----END CERTIFICATE----- +# CN=FIRMAPROFESIONAL CA ROOT-A WEB,O=Firmaprofesional SA,C=ES,2.5.4.97=#130f56415445532d413632363334303638 +# bef256daf26e9c69bdec1602359798f3caf71821a03e018257c53c65617f3d4a +-----BEGIN CERTIFICATE----- +MIICejCCAgCgAwIBAgIQMZch7a+JQn81QYehZ1ZMbTAKBggqhkjOPQQDAzBuMQsw +CQYDVQQGEwJFUzEcMBoGA1UECgwTRmlybWFwcm9mZXNpb25hbCBTQTEYMBYGA1UE +YQwPVkFURVMtQTYyNjM0MDY4MScwJQYDVQQDDB5GSVJNQVBST0ZFU0lPTkFMIENB +IFJPT1QtQSBXRUIwHhcNMjIwNDA2MDkwMTM2WhcNNDcwMzMxMDkwMTM2WjBuMQsw +CQYDVQQGEwJFUzEcMBoGA1UECgwTRmlybWFwcm9mZXNpb25hbCBTQTEYMBYGA1UE +YQwPVkFURVMtQTYyNjM0MDY4MScwJQYDVQQDDB5GSVJNQVBST0ZFU0lPTkFMIENB +IFJPT1QtQSBXRUIwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAARHU+osEaR3xyrq89Zf +e9MEkVz6iMYiuYMQYneEMy3pA4jU4DP37XcsSmDq5G+tbbT4TIqk5B/K6k84Si6C +cyvHZpsKjECcfIr28jlgst7L7Ljkb+qbXbdTkBgyVcUgt5SjYzBhMA8GA1UdEwEB +/wQFMAMBAf8wHwYDVR0jBBgwFoAUk+FDY1w8ndYn81LsF7Kpryz3dvgwHQYDVR0O +BBYEFJPhQ2NcPJ3WJ/NS7Beyqa8s93b4MA4GA1UdDwEB/wQEAwIBBjAKBggqhkjO +PQQDAwNoADBlAjAdfKR7w4l1M+E7qUW/Runpod3JIha3RxEL2Jq68cgLcFBTApFw +hVmpHqTm6iMxoAACMQD94vizrxa5HnPEluPBMBnYfubDl94cT7iJLzPrSA8Z94dG +XSaQpYXFuXqUPoeovQA= +-----END CERTIFICATE----- # CN=GDCA TrustAUTH R5 ROOT,O=GUANG DONG CERTIFICATE AUTHORITY CO.\,LTD.,C=CN # bfff8fd04433487d6a8aa60c1a29767a9fc2bbb05e420f713a13b992891d3893 -----BEGIN CERTIFICATE----- From 332fd656f4f013f66e643818fe8c759538456535 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Tue, 4 Jun 2024 16:16:04 +0000 Subject: [PATCH 17/57] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I105ee0f343768881d4fe3a2bfd1fcbaa7e1fd705 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/590218 Auto-Submit: Gopher Robot Reviewed-by: Than McIntosh LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index e5e51cb287..cc1cf7ebed 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.18 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.20.0 - golang.org/x/term v0.20.0 + golang.org/x/sys v0.21.0 + golang.org/x/term v0.21.0 ) -require golang.org/x/text v0.15.0 // indirect +require golang.org/x/text v0.16.0 // indirect diff --git a/go.sum b/go.sum index ea7e57f729..b694c49e61 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= -golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.20.0 h1:VnkxpohqXaOBYJtBmEppKUG6mXpi+4O6purfc2+sMhw= -golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= -golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= -golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= From d4e7c9cb6cb8bb64ad4a1988cd26328ef6cb9023 Mon Sep 17 00:00:00 2001 From: samiponkanen Date: Fri, 17 May 2024 12:09:53 +0000 Subject: [PATCH 18/57] ssh: fail client auth immediately on receiving disconnect message Fixes golang/go#66991 Change-Id: I60dd8a807578f162fda0e49bcd6fbf289d444396 GitHub-Last-Rev: f88329d35712873d0d7e3b39b9b11e7bfbc28e71 GitHub-Pull-Request: golang/crypto#293 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/581075 Reviewed-by: Cherry Mui Reviewed-by: Roland Shoemaker Auto-Submit: Nicola Murino Reviewed-by: Nicola Murino LUCI-TryBot-Result: Go LUCI --- ssh/client_auth.go | 4 +++ ssh/test/session_test.go | 69 ++++++++++++++++++++++++++++++++++++++ ssh/test/test_unix_test.go | 9 +++-- 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/ssh/client_auth.go b/ssh/client_auth.go index 9486c59862..b93961010d 100644 --- a/ssh/client_auth.go +++ b/ssh/client_auth.go @@ -71,6 +71,10 @@ func (c *connection) clientAuthenticate(config *ClientConfig) error { for auth := AuthMethod(new(noneAuth)); auth != nil; { ok, methods, err := auth.auth(sessionID, config.User, c.transport, config.Rand, extensions) if err != nil { + // On disconnect, return error immediately + if _, ok := err.(*disconnectMsg); ok { + return err + } // We return the error later if there is no other method left to // try. ok = authFailure diff --git a/ssh/test/session_test.go b/ssh/test/session_test.go index e9bfa9ec27..53e6645633 100644 --- a/ssh/test/session_test.go +++ b/ssh/test/session_test.go @@ -468,3 +468,72 @@ func TestClientAuthAlgorithms(t *testing.T) { }) } } + +func TestClientAuthDisconnect(t *testing.T) { + // Use a static key that is not accepted by server. + // This key has been generated with following ssh-keygen command and + // used exclusively in this unit test: + // $ ssh-keygen -t RSA -b 2048 -f /tmp/static_key \ + // -C "Static RSA key for golang.org/x/crypto/ssh unit test" + + const privKeyData = `-----BEGIN OPENSSH PRIVATE KEY----- +b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAABFwAAAAdzc2gtcn +NhAAAAAwEAAQAAAQEAwV1Zg3MqX27nIQQNWd8V09P4q4F1fx7H2xNJdL3Yg3y91GFLJ92+ +0IiGV8n1VMGL/71PPhzyqBpUYSTpWjiU2JZSfA+iTg1GJBcOaEOA6vrXsTtXTHZ//mOT4d +mlvuP4+9NqfCBLGXN7ZJpT+amkD8AVW9YW9QN3ipY61ZWxPaAocVpDd8rVgJTk54KvaPa7 +t4ddOSQDQq61aubIDR1Z3P+XjkB4piWOsbck3HJL+veTALy12C09tAhwUnZUAXS+DjhxOL +xpDVclF/yXYhAvBvsjwyk/OC3+nK9F799hpQZsjxmbP7oN+tGwz06BUcAKi7u7QstENvvk +85SDZy1q1QAAA/A7ylbJO8pWyQAAAAdzc2gtcnNhAAABAQDBXVmDcypfbuchBA1Z3xXT0/ +irgXV/HsfbE0l0vdiDfL3UYUsn3b7QiIZXyfVUwYv/vU8+HPKoGlRhJOlaOJTYllJ8D6JO +DUYkFw5oQ4Dq+texO1dMdn/+Y5Ph2aW+4/j702p8IEsZc3tkmlP5qaQPwBVb1hb1A3eKlj +rVlbE9oChxWkN3ytWAlOTngq9o9ru3h105JANCrrVq5sgNHVnc/5eOQHimJY6xtyTcckv6 +95MAvLXYLT20CHBSdlQBdL4OOHE4vGkNVyUX/JdiEC8G+yPDKT84Lf6cr0Xv32GlBmyPGZ +s/ug360bDPToFRwAqLu7tCy0Q2++TzlINnLWrVAAAAAwEAAQAAAQAIvPDHMiyIxgCksGPF +uyv9F9U4XjVip8/abE9zkAMJWW5++wuT/bRlBOUPRrWIXZEM9ETbtsqswo3Wxah+7CjRIH +qR7SdFlYTP1jPk4yIKXF4OvggBUPySkMpAGJ9hwOMW8Ymcb4gn77JJ4aMoWIcXssje+XiC +8iO+4UWU3SV2i6K7flK1UDCI5JVCyBr3DVf3QhMOgvwJl9TgD7FzWy1hkjuZq/Pzdv+fA2 +OfrUFiSukLNolidNoI9+KWa1yxixE+B2oN4Xan3ZbqGbL6Wc1dB+K9h/bNcu+SKf7fXWRi +/vVG44A61xGDZzen1+eQlqFp7narkKXoaU71+45VXDThAAAAgBPWUdQykEEm0yOS6hPIW+ +hS8z1LXWGTEcag9fMwJXKE7cQFO3LEk+dXMbClHdhD/ydswOZYGSNepxwvmo/a5LiO2ulp +W+5tnsNhcK3skdaf71t+boUEXBNZ6u3WNTkU7tDN8h9tebI+xlNceDGSGjOlNoHQVMKZdA +W9TA4ZqXUPAAAAgQDWU0UZVOSCAOODPz4PYsbFKdCfXNP8O4+t9txyc9E3hsLAsVs+CpVX +Gr219MGLrublzAxojipyzuQb6Tp1l9nsu7VkcBrPL8I1tokz0AyTnmNF3A9KszBal7gGNS +a2qYuf6JO4cub1KzonxUJQHZPZq9YhCxOtDwTd+uyHZiPy9QAAAIEA5vayd+nfVJgCKTdf +z5MFsxBSUj/cAYg7JYPS/0bZ5bEkLosL22wl5Tm/ZftJa8apkyBPhguAWt6jEWLoDiK+kn +Fv0SaEq1HUdXgWmISVnWzv2pxdAtq/apmbxTg3iIJyrAwEDo13iImR3k6rNPx1m3i/jX56 +HLcvWM4Y6bFzbGEAAAA0U3RhdGljIFJTQSBrZXkgZm9yIGdvbGFuZy5vcmcveC9jcnlwdG +8vc3NoIHVuaXQgdGVzdAECAwQFBgc= +-----END OPENSSH PRIVATE KEY-----` + + signer, err := ssh.ParsePrivateKey([]byte(privKeyData)) + if err != nil { + t.Fatalf("failed to create signer from key: %v", err) + } + + // Start server with MaxAuthTries 1 and publickey and password auth + // enabled + server := newServerForConfig(t, "MaxAuthTries", map[string]string{}) + + // Connect to server, expect failure, that PublicKeysCallback is called + // and that PasswordCallback is not called. + publicKeysCallbackCalled := false + config := clientConfig() + config.Auth = []ssh.AuthMethod{ + ssh.PublicKeysCallback(func() ([]ssh.Signer, error) { + publicKeysCallbackCalled = true + return []ssh.Signer{signer}, nil + }), + ssh.PasswordCallback(func() (string, error) { + t.Errorf("unexpected call to PasswordCallback()") + return "notaverygoodpassword", nil + }), + } + client, err := server.TryDial(config) + if err == nil { + t.Errorf("expected TryDial() to fail") + _ = client.Close() + } + if !publicKeysCallbackCalled { + t.Errorf("expected PublicKeysCallback() to be called") + } +} diff --git a/ssh/test/test_unix_test.go b/ssh/test/test_unix_test.go index 9695de7319..12698e49ab 100644 --- a/ssh/test/test_unix_test.go +++ b/ssh/test/test_unix_test.go @@ -58,12 +58,17 @@ UsePAM yes PasswordAuthentication yes ChallengeResponseAuthentication yes AuthenticationMethods {{.AuthMethods}} +` + maxAuthTriesSshdConfigTail = ` +PasswordAuthentication yes +MaxAuthTries 1 ` ) var configTmpl = map[string]*template.Template{ - "default": template.Must(template.New("").Parse(defaultSshdConfig)), - "MultiAuth": template.Must(template.New("").Parse(defaultSshdConfig + multiAuthSshdConfigTail))} + "default": template.Must(template.New("").Parse(defaultSshdConfig)), + "MultiAuth": template.Must(template.New("").Parse(defaultSshdConfig + multiAuthSshdConfigTail)), + "MaxAuthTries": template.Must(template.New("").Parse(defaultSshdConfig + maxAuthTriesSshdConfigTail))} type server struct { t *testing.T From 1c7450041f58a5e714eb084ae0e670d3576ea9fb Mon Sep 17 00:00:00 2001 From: cuishuang Date: Tue, 25 Jun 2024 00:01:42 +0800 Subject: [PATCH 19/57] ssh/test: make struct comment match struct name Change-Id: I9bfd61fe96d2bdaa890379a1a31b7e0f3f2b67ed Reviewed-on: https://go-review.googlesource.com/c/crypto/+/594435 Reviewed-by: Ian Lance Taylor Auto-Submit: Ian Lance Taylor Reviewed-by: Nicola Murino LUCI-TryBot-Result: Go LUCI Run-TryBot: shuang cui Reviewed-by: Joedian Reid TryBot-Result: Gopher Robot --- ssh/test/server_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ssh/test/server_test.go b/ssh/test/server_test.go index e70241bc6e..5c04fba98c 100644 --- a/ssh/test/server_test.go +++ b/ssh/test/server_test.go @@ -14,7 +14,7 @@ type exitStatusMsg struct { Status uint32 } -// goServer is a test Go SSH server that accepts public key and certificate +// goTestServer is a test Go SSH server that accepts public key and certificate // authentication and replies with a 0 exit status to any exec request without // running any commands. type goTestServer struct { From a6a393ffd658b286f64f141b06cbd94e516d3a64 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 24 Jun 2024 13:31:21 +0200 Subject: [PATCH 20/57] all: bump go.mod version and drop compatibility shims Also, remove the legacy import annotations. Fixes golang/go#68147 Change-Id: Ibfcc9322f27224c0ba92ea42cd56912a7d8783fd Reviewed-on: https://go-review.googlesource.com/c/crypto/+/594256 Reviewed-by: Dmitri Shuralyov Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker --- acme/http.go | 21 +- acme/version_go112.go | 27 - bcrypt/bcrypt.go | 2 +- blake2s/blake2s.go | 10 +- blake2s/register.go | 21 - blowfish/cipher.go | 2 +- bn256/bn256.go | 2 +- cast5/cast5.go | 2 +- chacha20poly1305/chacha20poly1305.go | 2 +- cryptobyte/asn1/asn1.go | 2 +- cryptobyte/string.go | 2 +- curve25519/curve25519.go | 39 +- curve25519/curve25519_compat.go | 105 ---- curve25519/curve25519_go120.go | 46 -- curve25519/internal/field/README | 7 - .../internal/field/_asm/fe_amd64_asm.go | 298 ---------- curve25519/internal/field/_asm/go.mod | 16 - curve25519/internal/field/_asm/go.sum | 51 -- curve25519/internal/field/fe.go | 416 ------------- curve25519/internal/field/fe_alias_test.go | 126 ---- curve25519/internal/field/fe_amd64.go | 15 - curve25519/internal/field/fe_amd64.s | 378 ------------ curve25519/internal/field/fe_amd64_noasm.go | 11 - curve25519/internal/field/fe_arm64.go | 15 - curve25519/internal/field/fe_arm64.s | 42 -- curve25519/internal/field/fe_arm64_noasm.go | 11 - curve25519/internal/field/fe_bench_test.go | 36 -- curve25519/internal/field/fe_generic.go | 264 --------- curve25519/internal/field/fe_test.go | 558 ------------------ curve25519/internal/field/sync.checkpoint | 1 - curve25519/internal/field/sync.sh | 19 - ed25519/ed25519.go | 4 +- go.mod | 2 +- hkdf/hkdf.go | 2 +- internal/wycheproof/ecdh_stdlib_test.go | 2 - internal/wycheproof/eddsa_test.go | 2 - md4/md4.go | 2 +- nacl/box/box.go | 2 +- nacl/secretbox/secretbox.go | 2 +- ocsp/ocsp.go | 2 +- ocsp/ocsp_test.go | 2 - openpgp/armor/armor.go | 5 +- openpgp/clearsign/clearsign.go | 2 +- openpgp/elgamal/elgamal.go | 2 +- openpgp/errors/errors.go | 2 +- openpgp/packet/packet.go | 2 +- openpgp/read.go | 2 +- openpgp/s2k/s2k.go | 2 +- otr/otr.go | 2 +- pbkdf2/pbkdf2.go | 2 +- poly1305/poly1305_compat.go | 2 +- ripemd160/ripemd160.go | 2 +- salsa20/salsa/hsalsa20.go | 2 +- salsa20/salsa20.go | 2 +- scrypt/scrypt.go | 2 +- sha3/doc.go | 2 +- sha3/hashes.go | 8 + sha3/register.go | 18 - ssh/agent/client.go | 2 +- ssh/doc.go | 2 +- ssh/test/doc.go | 2 +- ssh/testdata/doc.go | 2 +- twofish/twofish.go | 2 +- xtea/cipher.go | 2 +- xts/xts.go | 2 +- 65 files changed, 110 insertions(+), 2532 deletions(-) delete mode 100644 acme/version_go112.go delete mode 100644 blake2s/register.go delete mode 100644 curve25519/curve25519_compat.go delete mode 100644 curve25519/curve25519_go120.go delete mode 100644 curve25519/internal/field/README delete mode 100644 curve25519/internal/field/_asm/fe_amd64_asm.go delete mode 100644 curve25519/internal/field/_asm/go.mod delete mode 100644 curve25519/internal/field/_asm/go.sum delete mode 100644 curve25519/internal/field/fe.go delete mode 100644 curve25519/internal/field/fe_alias_test.go delete mode 100644 curve25519/internal/field/fe_amd64.go delete mode 100644 curve25519/internal/field/fe_amd64.s delete mode 100644 curve25519/internal/field/fe_amd64_noasm.go delete mode 100644 curve25519/internal/field/fe_arm64.go delete mode 100644 curve25519/internal/field/fe_arm64.s delete mode 100644 curve25519/internal/field/fe_arm64_noasm.go delete mode 100644 curve25519/internal/field/fe_bench_test.go delete mode 100644 curve25519/internal/field/fe_generic.go delete mode 100644 curve25519/internal/field/fe_test.go delete mode 100644 curve25519/internal/field/sync.checkpoint delete mode 100755 curve25519/internal/field/sync.sh delete mode 100644 sha3/register.go diff --git a/acme/http.go b/acme/http.go index 58836e5d30..d92ff232fe 100644 --- a/acme/http.go +++ b/acme/http.go @@ -15,6 +15,7 @@ import ( "io" "math/big" "net/http" + "runtime/debug" "strconv" "strings" "time" @@ -271,9 +272,27 @@ func (c *Client) httpClient() *http.Client { } // packageVersion is the version of the module that contains this package, for -// sending as part of the User-Agent header. It's set in version_go112.go. +// sending as part of the User-Agent header. var packageVersion string +func init() { + // Set packageVersion if the binary was built in modules mode and x/crypto + // was not replaced with a different module. + info, ok := debug.ReadBuildInfo() + if !ok { + return + } + for _, m := range info.Deps { + if m.Path != "golang.org/x/crypto" { + continue + } + if m.Replace == nil { + packageVersion = m.Version + } + break + } +} + // userAgent returns the User-Agent header value. It includes the package name, // the module version (if available), and the c.UserAgent value (if set). func (c *Client) userAgent() string { diff --git a/acme/version_go112.go b/acme/version_go112.go deleted file mode 100644 index cc5fab604b..0000000000 --- a/acme/version_go112.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.12 - -package acme - -import "runtime/debug" - -func init() { - // Set packageVersion if the binary was built in modules mode and x/crypto - // was not replaced with a different module. - info, ok := debug.ReadBuildInfo() - if !ok { - return - } - for _, m := range info.Deps { - if m.Path != "golang.org/x/crypto" { - continue - } - if m.Replace == nil { - packageVersion = m.Version - } - break - } -} diff --git a/bcrypt/bcrypt.go b/bcrypt/bcrypt.go index 5577c0f939..dc9311870a 100644 --- a/bcrypt/bcrypt.go +++ b/bcrypt/bcrypt.go @@ -4,7 +4,7 @@ // Package bcrypt implements Provos and Mazières's bcrypt adaptive hashing // algorithm. See http://www.usenix.org/event/usenix99/provos/provos.pdf -package bcrypt // import "golang.org/x/crypto/bcrypt" +package bcrypt // The code is a port of Provos and Mazières's C implementation. import ( diff --git a/blake2s/blake2s.go b/blake2s/blake2s.go index e3f46aab3a..c25d07d4f4 100644 --- a/blake2s/blake2s.go +++ b/blake2s/blake2s.go @@ -16,9 +16,10 @@ // // BLAKE2X is a construction to compute hash values larger than 32 bytes. It // can produce hash values between 0 and 65535 bytes. -package blake2s // import "golang.org/x/crypto/blake2s" +package blake2s import ( + "crypto" "encoding/binary" "errors" "hash" @@ -55,6 +56,13 @@ func Sum256(data []byte) [Size]byte { // and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash. func New256(key []byte) (hash.Hash, error) { return newDigest(Size, key) } +func init() { + crypto.RegisterHash(crypto.BLAKE2s_256, func() hash.Hash { + h, _ := New256(nil) + return h + }) +} + // New128 returns a new hash.Hash computing the BLAKE2s-128 checksum given a // non-empty key. Note that a 128-bit digest is too small to be secure as a // cryptographic hash and should only be used as a MAC, thus the key argument diff --git a/blake2s/register.go b/blake2s/register.go deleted file mode 100644 index 3156148a42..0000000000 --- a/blake2s/register.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.9 - -package blake2s - -import ( - "crypto" - "hash" -) - -func init() { - newHash256 := func() hash.Hash { - h, _ := New256(nil) - return h - } - - crypto.RegisterHash(crypto.BLAKE2s_256, newHash256) -} diff --git a/blowfish/cipher.go b/blowfish/cipher.go index 213bf204af..0898956807 100644 --- a/blowfish/cipher.go +++ b/blowfish/cipher.go @@ -11,7 +11,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package blowfish // import "golang.org/x/crypto/blowfish" +package blowfish // The code is a port of Bruce Schneier's C implementation. // See https://www.schneier.com/blowfish.html. diff --git a/bn256/bn256.go b/bn256/bn256.go index 5d6d198bcb..6661687551 100644 --- a/bn256/bn256.go +++ b/bn256/bn256.go @@ -23,7 +23,7 @@ // elliptic curve. This package is frozen, and not implemented in constant time. // There is a more complete implementation at github.com/cloudflare/bn256, but // note that it suffers from the same security issues of the underlying curve. -package bn256 // import "golang.org/x/crypto/bn256" +package bn256 import ( "crypto/rand" diff --git a/cast5/cast5.go b/cast5/cast5.go index 425e8eecb0..016e90215c 100644 --- a/cast5/cast5.go +++ b/cast5/cast5.go @@ -11,7 +11,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package cast5 // import "golang.org/x/crypto/cast5" +package cast5 import ( "errors" diff --git a/chacha20poly1305/chacha20poly1305.go b/chacha20poly1305/chacha20poly1305.go index 93da7322bc..8cf5d8112e 100644 --- a/chacha20poly1305/chacha20poly1305.go +++ b/chacha20poly1305/chacha20poly1305.go @@ -5,7 +5,7 @@ // Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD and its // extended nonce variant XChaCha20-Poly1305, as specified in RFC 8439 and // draft-irtf-cfrg-xchacha-01. -package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305" +package chacha20poly1305 import ( "crypto/cipher" diff --git a/cryptobyte/asn1/asn1.go b/cryptobyte/asn1/asn1.go index cda8e3edfd..90ef6a241d 100644 --- a/cryptobyte/asn1/asn1.go +++ b/cryptobyte/asn1/asn1.go @@ -4,7 +4,7 @@ // Package asn1 contains supporting types for parsing and building ASN.1 // messages with the cryptobyte package. -package asn1 // import "golang.org/x/crypto/cryptobyte/asn1" +package asn1 // Tag represents an ASN.1 identifier octet, consisting of a tag number // (indicating a type) and class (such as context-specific or constructed). diff --git a/cryptobyte/string.go b/cryptobyte/string.go index 10692a8a31..4b0f8097f9 100644 --- a/cryptobyte/string.go +++ b/cryptobyte/string.go @@ -15,7 +15,7 @@ // // See the documentation and examples for the Builder and String types to get // started. -package cryptobyte // import "golang.org/x/crypto/cryptobyte" +package cryptobyte // String represents a string of bytes. It provides methods for parsing // fixed-length and length-prefixed values from it. diff --git a/curve25519/curve25519.go b/curve25519/curve25519.go index 00f963ea20..21ca3b2ee4 100644 --- a/curve25519/curve25519.go +++ b/curve25519/curve25519.go @@ -6,9 +6,11 @@ // performs scalar multiplication on the elliptic curve known as Curve25519. // See RFC 7748. // -// Starting in Go 1.20, this package is a wrapper for the X25519 implementation +// This package is a wrapper for the X25519 implementation // in the crypto/ecdh package. -package curve25519 // import "golang.org/x/crypto/curve25519" +package curve25519 + +import "crypto/ecdh" // ScalarMult sets dst to the product scalar * point. // @@ -16,7 +18,13 @@ package curve25519 // import "golang.org/x/crypto/curve25519" // zeroes, irrespective of the scalar. Instead, use the X25519 function, which // will return an error. func ScalarMult(dst, scalar, point *[32]byte) { - scalarMult(dst, scalar, point) + if _, err := x25519(dst, scalar[:], point[:]); err != nil { + // The only error condition for x25519 when the inputs are 32 bytes long + // is if the output would have been the all-zero value. + for i := range dst { + dst[i] = 0 + } + } } // ScalarBaseMult sets dst to the product scalar * base where base is the @@ -25,7 +33,12 @@ func ScalarMult(dst, scalar, point *[32]byte) { // It is recommended to use the X25519 function with Basepoint instead, as // copying into fixed size arrays can lead to unexpected bugs. func ScalarBaseMult(dst, scalar *[32]byte) { - scalarBaseMult(dst, scalar) + curve := ecdh.X25519() + priv, err := curve.NewPrivateKey(scalar[:]) + if err != nil { + panic("curve25519: internal error: scalarBaseMult was not 32 bytes") + } + copy(dst[:], priv.PublicKey().Bytes()) } const ( @@ -57,3 +70,21 @@ func X25519(scalar, point []byte) ([]byte, error) { var dst [32]byte return x25519(&dst, scalar, point) } + +func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) { + curve := ecdh.X25519() + pub, err := curve.NewPublicKey(point) + if err != nil { + return nil, err + } + priv, err := curve.NewPrivateKey(scalar) + if err != nil { + return nil, err + } + out, err := priv.ECDH(pub) + if err != nil { + return nil, err + } + copy(dst[:], out) + return dst[:], nil +} diff --git a/curve25519/curve25519_compat.go b/curve25519/curve25519_compat.go deleted file mode 100644 index ba647e8d77..0000000000 --- a/curve25519/curve25519_compat.go +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.20 - -package curve25519 - -import ( - "crypto/subtle" - "errors" - "strconv" - - "golang.org/x/crypto/curve25519/internal/field" -) - -func scalarMult(dst, scalar, point *[32]byte) { - var e [32]byte - - copy(e[:], scalar[:]) - e[0] &= 248 - e[31] &= 127 - e[31] |= 64 - - var x1, x2, z2, x3, z3, tmp0, tmp1 field.Element - x1.SetBytes(point[:]) - x2.One() - x3.Set(&x1) - z3.One() - - swap := 0 - for pos := 254; pos >= 0; pos-- { - b := e[pos/8] >> uint(pos&7) - b &= 1 - swap ^= int(b) - x2.Swap(&x3, swap) - z2.Swap(&z3, swap) - swap = int(b) - - tmp0.Subtract(&x3, &z3) - tmp1.Subtract(&x2, &z2) - x2.Add(&x2, &z2) - z2.Add(&x3, &z3) - z3.Multiply(&tmp0, &x2) - z2.Multiply(&z2, &tmp1) - tmp0.Square(&tmp1) - tmp1.Square(&x2) - x3.Add(&z3, &z2) - z2.Subtract(&z3, &z2) - x2.Multiply(&tmp1, &tmp0) - tmp1.Subtract(&tmp1, &tmp0) - z2.Square(&z2) - - z3.Mult32(&tmp1, 121666) - x3.Square(&x3) - tmp0.Add(&tmp0, &z3) - z3.Multiply(&x1, &z2) - z2.Multiply(&tmp1, &tmp0) - } - - x2.Swap(&x3, swap) - z2.Swap(&z3, swap) - - z2.Invert(&z2) - x2.Multiply(&x2, &z2) - copy(dst[:], x2.Bytes()) -} - -func scalarBaseMult(dst, scalar *[32]byte) { - checkBasepoint() - scalarMult(dst, scalar, &basePoint) -} - -func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) { - var in [32]byte - if l := len(scalar); l != 32 { - return nil, errors.New("bad scalar length: " + strconv.Itoa(l) + ", expected 32") - } - if l := len(point); l != 32 { - return nil, errors.New("bad point length: " + strconv.Itoa(l) + ", expected 32") - } - copy(in[:], scalar) - if &point[0] == &Basepoint[0] { - scalarBaseMult(dst, &in) - } else { - var base, zero [32]byte - copy(base[:], point) - scalarMult(dst, &in, &base) - if subtle.ConstantTimeCompare(dst[:], zero[:]) == 1 { - return nil, errors.New("bad input point: low order point") - } - } - return dst[:], nil -} - -func checkBasepoint() { - if subtle.ConstantTimeCompare(Basepoint, []byte{ - 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - }) != 1 { - panic("curve25519: global Basepoint value was modified") - } -} diff --git a/curve25519/curve25519_go120.go b/curve25519/curve25519_go120.go deleted file mode 100644 index 627df49727..0000000000 --- a/curve25519/curve25519_go120.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.20 - -package curve25519 - -import "crypto/ecdh" - -func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) { - curve := ecdh.X25519() - pub, err := curve.NewPublicKey(point) - if err != nil { - return nil, err - } - priv, err := curve.NewPrivateKey(scalar) - if err != nil { - return nil, err - } - out, err := priv.ECDH(pub) - if err != nil { - return nil, err - } - copy(dst[:], out) - return dst[:], nil -} - -func scalarMult(dst, scalar, point *[32]byte) { - if _, err := x25519(dst, scalar[:], point[:]); err != nil { - // The only error condition for x25519 when the inputs are 32 bytes long - // is if the output would have been the all-zero value. - for i := range dst { - dst[i] = 0 - } - } -} - -func scalarBaseMult(dst, scalar *[32]byte) { - curve := ecdh.X25519() - priv, err := curve.NewPrivateKey(scalar[:]) - if err != nil { - panic("curve25519: internal error: scalarBaseMult was not 32 bytes") - } - copy(dst[:], priv.PublicKey().Bytes()) -} diff --git a/curve25519/internal/field/README b/curve25519/internal/field/README deleted file mode 100644 index e25bca7dc8..0000000000 --- a/curve25519/internal/field/README +++ /dev/null @@ -1,7 +0,0 @@ -This package is kept in sync with crypto/ed25519/internal/edwards25519/field in -the standard library. - -If there are any changes in the standard library that need to be synced to this -package, run sync.sh. It will not overwrite any local changes made since the -previous sync, so it's ok to land changes in this package first, and then sync -to the standard library later. diff --git a/curve25519/internal/field/_asm/fe_amd64_asm.go b/curve25519/internal/field/_asm/fe_amd64_asm.go deleted file mode 100644 index 1f3652987e..0000000000 --- a/curve25519/internal/field/_asm/fe_amd64_asm.go +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright (c) 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package main - -import ( - "fmt" - - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/gotypes" - . "github.com/mmcloughlin/avo/operand" - . "github.com/mmcloughlin/avo/reg" - - // Ensure "go mod tidy" doesn't remove the golang.org/x/crypto module - // dependency, which is necessary to access the field.Element type. - _ "golang.org/x/crypto/curve25519" -) - -//go:generate go run . -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field - -func main() { - Package("golang.org/x/crypto/curve25519/internal/field") - ConstraintExpr("amd64,gc,!purego") - feMul() - feSquare() - Generate() -} - -type namedComponent struct { - Component - name string -} - -func (c namedComponent) String() string { return c.name } - -type uint128 struct { - name string - hi, lo GPVirtual -} - -func (c uint128) String() string { return c.name } - -func feSquare() { - TEXT("feSquare", NOSPLIT, "func(out, a *Element)") - Doc("feSquare sets out = a * a. It works like feSquareGeneric.") - Pragma("noescape") - - a := Dereference(Param("a")) - l0 := namedComponent{a.Field("l0"), "l0"} - l1 := namedComponent{a.Field("l1"), "l1"} - l2 := namedComponent{a.Field("l2"), "l2"} - l3 := namedComponent{a.Field("l3"), "l3"} - l4 := namedComponent{a.Field("l4"), "l4"} - - // r0 = l0×l0 + 19×2×(l1×l4 + l2×l3) - r0 := uint128{"r0", GP64(), GP64()} - mul64(r0, 1, l0, l0) - addMul64(r0, 38, l1, l4) - addMul64(r0, 38, l2, l3) - - // r1 = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3 - r1 := uint128{"r1", GP64(), GP64()} - mul64(r1, 2, l0, l1) - addMul64(r1, 38, l2, l4) - addMul64(r1, 19, l3, l3) - - // r2 = = 2×l0×l2 + l1×l1 + 19×2×l3×l4 - r2 := uint128{"r2", GP64(), GP64()} - mul64(r2, 2, l0, l2) - addMul64(r2, 1, l1, l1) - addMul64(r2, 38, l3, l4) - - // r3 = = 2×l0×l3 + 2×l1×l2 + 19×l4×l4 - r3 := uint128{"r3", GP64(), GP64()} - mul64(r3, 2, l0, l3) - addMul64(r3, 2, l1, l2) - addMul64(r3, 19, l4, l4) - - // r4 = = 2×l0×l4 + 2×l1×l3 + l2×l2 - r4 := uint128{"r4", GP64(), GP64()} - mul64(r4, 2, l0, l4) - addMul64(r4, 2, l1, l3) - addMul64(r4, 1, l2, l2) - - Comment("First reduction chain") - maskLow51Bits := GP64() - MOVQ(Imm((1<<51)-1), maskLow51Bits) - c0, r0lo := shiftRightBy51(&r0) - c1, r1lo := shiftRightBy51(&r1) - c2, r2lo := shiftRightBy51(&r2) - c3, r3lo := shiftRightBy51(&r3) - c4, r4lo := shiftRightBy51(&r4) - maskAndAdd(r0lo, maskLow51Bits, c4, 19) - maskAndAdd(r1lo, maskLow51Bits, c0, 1) - maskAndAdd(r2lo, maskLow51Bits, c1, 1) - maskAndAdd(r3lo, maskLow51Bits, c2, 1) - maskAndAdd(r4lo, maskLow51Bits, c3, 1) - - Comment("Second reduction chain (carryPropagate)") - // c0 = r0 >> 51 - MOVQ(r0lo, c0) - SHRQ(Imm(51), c0) - // c1 = r1 >> 51 - MOVQ(r1lo, c1) - SHRQ(Imm(51), c1) - // c2 = r2 >> 51 - MOVQ(r2lo, c2) - SHRQ(Imm(51), c2) - // c3 = r3 >> 51 - MOVQ(r3lo, c3) - SHRQ(Imm(51), c3) - // c4 = r4 >> 51 - MOVQ(r4lo, c4) - SHRQ(Imm(51), c4) - maskAndAdd(r0lo, maskLow51Bits, c4, 19) - maskAndAdd(r1lo, maskLow51Bits, c0, 1) - maskAndAdd(r2lo, maskLow51Bits, c1, 1) - maskAndAdd(r3lo, maskLow51Bits, c2, 1) - maskAndAdd(r4lo, maskLow51Bits, c3, 1) - - Comment("Store output") - out := Dereference(Param("out")) - Store(r0lo, out.Field("l0")) - Store(r1lo, out.Field("l1")) - Store(r2lo, out.Field("l2")) - Store(r3lo, out.Field("l3")) - Store(r4lo, out.Field("l4")) - - RET() -} - -func feMul() { - TEXT("feMul", NOSPLIT, "func(out, a, b *Element)") - Doc("feMul sets out = a * b. It works like feMulGeneric.") - Pragma("noescape") - - a := Dereference(Param("a")) - a0 := namedComponent{a.Field("l0"), "a0"} - a1 := namedComponent{a.Field("l1"), "a1"} - a2 := namedComponent{a.Field("l2"), "a2"} - a3 := namedComponent{a.Field("l3"), "a3"} - a4 := namedComponent{a.Field("l4"), "a4"} - - b := Dereference(Param("b")) - b0 := namedComponent{b.Field("l0"), "b0"} - b1 := namedComponent{b.Field("l1"), "b1"} - b2 := namedComponent{b.Field("l2"), "b2"} - b3 := namedComponent{b.Field("l3"), "b3"} - b4 := namedComponent{b.Field("l4"), "b4"} - - // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) - r0 := uint128{"r0", GP64(), GP64()} - mul64(r0, 1, a0, b0) - addMul64(r0, 19, a1, b4) - addMul64(r0, 19, a2, b3) - addMul64(r0, 19, a3, b2) - addMul64(r0, 19, a4, b1) - - // r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2) - r1 := uint128{"r1", GP64(), GP64()} - mul64(r1, 1, a0, b1) - addMul64(r1, 1, a1, b0) - addMul64(r1, 19, a2, b4) - addMul64(r1, 19, a3, b3) - addMul64(r1, 19, a4, b2) - - // r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3) - r2 := uint128{"r2", GP64(), GP64()} - mul64(r2, 1, a0, b2) - addMul64(r2, 1, a1, b1) - addMul64(r2, 1, a2, b0) - addMul64(r2, 19, a3, b4) - addMul64(r2, 19, a4, b3) - - // r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4 - r3 := uint128{"r3", GP64(), GP64()} - mul64(r3, 1, a0, b3) - addMul64(r3, 1, a1, b2) - addMul64(r3, 1, a2, b1) - addMul64(r3, 1, a3, b0) - addMul64(r3, 19, a4, b4) - - // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 - r4 := uint128{"r4", GP64(), GP64()} - mul64(r4, 1, a0, b4) - addMul64(r4, 1, a1, b3) - addMul64(r4, 1, a2, b2) - addMul64(r4, 1, a3, b1) - addMul64(r4, 1, a4, b0) - - Comment("First reduction chain") - maskLow51Bits := GP64() - MOVQ(Imm((1<<51)-1), maskLow51Bits) - c0, r0lo := shiftRightBy51(&r0) - c1, r1lo := shiftRightBy51(&r1) - c2, r2lo := shiftRightBy51(&r2) - c3, r3lo := shiftRightBy51(&r3) - c4, r4lo := shiftRightBy51(&r4) - maskAndAdd(r0lo, maskLow51Bits, c4, 19) - maskAndAdd(r1lo, maskLow51Bits, c0, 1) - maskAndAdd(r2lo, maskLow51Bits, c1, 1) - maskAndAdd(r3lo, maskLow51Bits, c2, 1) - maskAndAdd(r4lo, maskLow51Bits, c3, 1) - - Comment("Second reduction chain (carryPropagate)") - // c0 = r0 >> 51 - MOVQ(r0lo, c0) - SHRQ(Imm(51), c0) - // c1 = r1 >> 51 - MOVQ(r1lo, c1) - SHRQ(Imm(51), c1) - // c2 = r2 >> 51 - MOVQ(r2lo, c2) - SHRQ(Imm(51), c2) - // c3 = r3 >> 51 - MOVQ(r3lo, c3) - SHRQ(Imm(51), c3) - // c4 = r4 >> 51 - MOVQ(r4lo, c4) - SHRQ(Imm(51), c4) - maskAndAdd(r0lo, maskLow51Bits, c4, 19) - maskAndAdd(r1lo, maskLow51Bits, c0, 1) - maskAndAdd(r2lo, maskLow51Bits, c1, 1) - maskAndAdd(r3lo, maskLow51Bits, c2, 1) - maskAndAdd(r4lo, maskLow51Bits, c3, 1) - - Comment("Store output") - out := Dereference(Param("out")) - Store(r0lo, out.Field("l0")) - Store(r1lo, out.Field("l1")) - Store(r2lo, out.Field("l2")) - Store(r3lo, out.Field("l3")) - Store(r4lo, out.Field("l4")) - - RET() -} - -// mul64 sets r to i * aX * bX. -func mul64(r uint128, i int, aX, bX namedComponent) { - switch i { - case 1: - Comment(fmt.Sprintf("%s = %s×%s", r, aX, bX)) - Load(aX, RAX) - case 2: - Comment(fmt.Sprintf("%s = 2×%s×%s", r, aX, bX)) - Load(aX, RAX) - SHLQ(Imm(1), RAX) - default: - panic("unsupported i value") - } - MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX - MOVQ(RAX, r.lo) - MOVQ(RDX, r.hi) -} - -// addMul64 sets r to r + i * aX * bX. -func addMul64(r uint128, i uint64, aX, bX namedComponent) { - switch i { - case 1: - Comment(fmt.Sprintf("%s += %s×%s", r, aX, bX)) - Load(aX, RAX) - default: - Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX)) - IMUL3Q(Imm(i), Load(aX, GP64()), RAX) - } - MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX - ADDQ(RAX, r.lo) - ADCQ(RDX, r.hi) -} - -// shiftRightBy51 returns r >> 51 and r.lo. -// -// After this function is called, the uint128 may not be used anymore. -func shiftRightBy51(r *uint128) (out, lo GPVirtual) { - out = r.hi - lo = r.lo - SHLQ(Imm(64-51), r.lo, r.hi) - r.lo, r.hi = nil, nil // make sure the uint128 is unusable - return -} - -// maskAndAdd sets r = r&mask + c*i. -func maskAndAdd(r, mask, c GPVirtual, i uint64) { - ANDQ(mask, r) - if i != 1 { - IMUL3Q(Imm(i), c, c) - } - ADDQ(c, r) -} - -func mustAddr(c Component) Op { - b, err := c.Resolve() - if err != nil { - panic(err) - } - return b.Addr -} diff --git a/curve25519/internal/field/_asm/go.mod b/curve25519/internal/field/_asm/go.mod deleted file mode 100644 index f6902f4a64..0000000000 --- a/curve25519/internal/field/_asm/go.mod +++ /dev/null @@ -1,16 +0,0 @@ -module asm - -go 1.18 - -require ( - github.com/mmcloughlin/avo v0.5.0 - golang.org/x/crypto v0.1.0 -) - -require ( - golang.org/x/mod v0.8.0 // indirect - golang.org/x/sys v0.14.0 // indirect - golang.org/x/tools v0.6.0 // indirect -) - -replace golang.org/x/crypto v0.1.0 => ../../../.. diff --git a/curve25519/internal/field/_asm/go.sum b/curve25519/internal/field/_asm/go.sum deleted file mode 100644 index 96b7915d4f..0000000000 --- a/curve25519/internal/field/_asm/go.sum +++ /dev/null @@ -1,51 +0,0 @@ -github.com/mmcloughlin/avo v0.5.0 h1:nAco9/aI9Lg2kiuROBY6BhCI/z0t5jEvJfjWbL8qXLU= -github.com/mmcloughlin/avo v0.5.0/go.mod h1:ChHFdoV7ql95Wi7vuq2YT1bwCJqiWdZrQ1im3VujLYM= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -golang.org/x/arch v0.1.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.6.0/go.mod h1:4mET923SAdbXp2ki8ey+zGs1SLqsuM2Y0uvdZR/fUNI= -golang.org/x/mod v0.8.0 h1:LUYupSeNrTNCGzR/hVBk2NHZO4hXcVaW1k4Qx7rjPx8= -golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= -golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q= -golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= -golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.2.0/go.mod h1:y4OqIKeOV/fWJetJ8bXPU1sEVniLMIyDAZWeHdV+NTA= -golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= -golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/curve25519/internal/field/fe.go b/curve25519/internal/field/fe.go deleted file mode 100644 index ca841ad99e..0000000000 --- a/curve25519/internal/field/fe.go +++ /dev/null @@ -1,416 +0,0 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package field implements fast arithmetic modulo 2^255-19. -package field - -import ( - "crypto/subtle" - "encoding/binary" - "math/bits" -) - -// Element represents an element of the field GF(2^255-19). Note that this -// is not a cryptographically secure group, and should only be used to interact -// with edwards25519.Point coordinates. -// -// This type works similarly to math/big.Int, and all arguments and receivers -// are allowed to alias. -// -// The zero value is a valid zero element. -type Element struct { - // An element t represents the integer - // t.l0 + t.l1*2^51 + t.l2*2^102 + t.l3*2^153 + t.l4*2^204 - // - // Between operations, all limbs are expected to be lower than 2^52. - l0 uint64 - l1 uint64 - l2 uint64 - l3 uint64 - l4 uint64 -} - -const maskLow51Bits uint64 = (1 << 51) - 1 - -var feZero = &Element{0, 0, 0, 0, 0} - -// Zero sets v = 0, and returns v. -func (v *Element) Zero() *Element { - *v = *feZero - return v -} - -var feOne = &Element{1, 0, 0, 0, 0} - -// One sets v = 1, and returns v. -func (v *Element) One() *Element { - *v = *feOne - return v -} - -// reduce reduces v modulo 2^255 - 19 and returns it. -func (v *Element) reduce() *Element { - v.carryPropagate() - - // After the light reduction we now have a field element representation - // v < 2^255 + 2^13 * 19, but need v < 2^255 - 19. - - // If v >= 2^255 - 19, then v + 19 >= 2^255, which would overflow 2^255 - 1, - // generating a carry. That is, c will be 0 if v < 2^255 - 19, and 1 otherwise. - c := (v.l0 + 19) >> 51 - c = (v.l1 + c) >> 51 - c = (v.l2 + c) >> 51 - c = (v.l3 + c) >> 51 - c = (v.l4 + c) >> 51 - - // If v < 2^255 - 19 and c = 0, this will be a no-op. Otherwise, it's - // effectively applying the reduction identity to the carry. - v.l0 += 19 * c - - v.l1 += v.l0 >> 51 - v.l0 = v.l0 & maskLow51Bits - v.l2 += v.l1 >> 51 - v.l1 = v.l1 & maskLow51Bits - v.l3 += v.l2 >> 51 - v.l2 = v.l2 & maskLow51Bits - v.l4 += v.l3 >> 51 - v.l3 = v.l3 & maskLow51Bits - // no additional carry - v.l4 = v.l4 & maskLow51Bits - - return v -} - -// Add sets v = a + b, and returns v. -func (v *Element) Add(a, b *Element) *Element { - v.l0 = a.l0 + b.l0 - v.l1 = a.l1 + b.l1 - v.l2 = a.l2 + b.l2 - v.l3 = a.l3 + b.l3 - v.l4 = a.l4 + b.l4 - // Using the generic implementation here is actually faster than the - // assembly. Probably because the body of this function is so simple that - // the compiler can figure out better optimizations by inlining the carry - // propagation. TODO - return v.carryPropagateGeneric() -} - -// Subtract sets v = a - b, and returns v. -func (v *Element) Subtract(a, b *Element) *Element { - // We first add 2 * p, to guarantee the subtraction won't underflow, and - // then subtract b (which can be up to 2^255 + 2^13 * 19). - v.l0 = (a.l0 + 0xFFFFFFFFFFFDA) - b.l0 - v.l1 = (a.l1 + 0xFFFFFFFFFFFFE) - b.l1 - v.l2 = (a.l2 + 0xFFFFFFFFFFFFE) - b.l2 - v.l3 = (a.l3 + 0xFFFFFFFFFFFFE) - b.l3 - v.l4 = (a.l4 + 0xFFFFFFFFFFFFE) - b.l4 - return v.carryPropagate() -} - -// Negate sets v = -a, and returns v. -func (v *Element) Negate(a *Element) *Element { - return v.Subtract(feZero, a) -} - -// Invert sets v = 1/z mod p, and returns v. -// -// If z == 0, Invert returns v = 0. -func (v *Element) Invert(z *Element) *Element { - // Inversion is implemented as exponentiation with exponent p − 2. It uses the - // same sequence of 255 squarings and 11 multiplications as [Curve25519]. - var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t Element - - z2.Square(z) // 2 - t.Square(&z2) // 4 - t.Square(&t) // 8 - z9.Multiply(&t, z) // 9 - z11.Multiply(&z9, &z2) // 11 - t.Square(&z11) // 22 - z2_5_0.Multiply(&t, &z9) // 31 = 2^5 - 2^0 - - t.Square(&z2_5_0) // 2^6 - 2^1 - for i := 0; i < 4; i++ { - t.Square(&t) // 2^10 - 2^5 - } - z2_10_0.Multiply(&t, &z2_5_0) // 2^10 - 2^0 - - t.Square(&z2_10_0) // 2^11 - 2^1 - for i := 0; i < 9; i++ { - t.Square(&t) // 2^20 - 2^10 - } - z2_20_0.Multiply(&t, &z2_10_0) // 2^20 - 2^0 - - t.Square(&z2_20_0) // 2^21 - 2^1 - for i := 0; i < 19; i++ { - t.Square(&t) // 2^40 - 2^20 - } - t.Multiply(&t, &z2_20_0) // 2^40 - 2^0 - - t.Square(&t) // 2^41 - 2^1 - for i := 0; i < 9; i++ { - t.Square(&t) // 2^50 - 2^10 - } - z2_50_0.Multiply(&t, &z2_10_0) // 2^50 - 2^0 - - t.Square(&z2_50_0) // 2^51 - 2^1 - for i := 0; i < 49; i++ { - t.Square(&t) // 2^100 - 2^50 - } - z2_100_0.Multiply(&t, &z2_50_0) // 2^100 - 2^0 - - t.Square(&z2_100_0) // 2^101 - 2^1 - for i := 0; i < 99; i++ { - t.Square(&t) // 2^200 - 2^100 - } - t.Multiply(&t, &z2_100_0) // 2^200 - 2^0 - - t.Square(&t) // 2^201 - 2^1 - for i := 0; i < 49; i++ { - t.Square(&t) // 2^250 - 2^50 - } - t.Multiply(&t, &z2_50_0) // 2^250 - 2^0 - - t.Square(&t) // 2^251 - 2^1 - t.Square(&t) // 2^252 - 2^2 - t.Square(&t) // 2^253 - 2^3 - t.Square(&t) // 2^254 - 2^4 - t.Square(&t) // 2^255 - 2^5 - - return v.Multiply(&t, &z11) // 2^255 - 21 -} - -// Set sets v = a, and returns v. -func (v *Element) Set(a *Element) *Element { - *v = *a - return v -} - -// SetBytes sets v to x, which must be a 32-byte little-endian encoding. -// -// Consistent with RFC 7748, the most significant bit (the high bit of the -// last byte) is ignored, and non-canonical values (2^255-19 through 2^255-1) -// are accepted. Note that this is laxer than specified by RFC 8032. -func (v *Element) SetBytes(x []byte) *Element { - if len(x) != 32 { - panic("edwards25519: invalid field element input size") - } - - // Bits 0:51 (bytes 0:8, bits 0:64, shift 0, mask 51). - v.l0 = binary.LittleEndian.Uint64(x[0:8]) - v.l0 &= maskLow51Bits - // Bits 51:102 (bytes 6:14, bits 48:112, shift 3, mask 51). - v.l1 = binary.LittleEndian.Uint64(x[6:14]) >> 3 - v.l1 &= maskLow51Bits - // Bits 102:153 (bytes 12:20, bits 96:160, shift 6, mask 51). - v.l2 = binary.LittleEndian.Uint64(x[12:20]) >> 6 - v.l2 &= maskLow51Bits - // Bits 153:204 (bytes 19:27, bits 152:216, shift 1, mask 51). - v.l3 = binary.LittleEndian.Uint64(x[19:27]) >> 1 - v.l3 &= maskLow51Bits - // Bits 204:251 (bytes 24:32, bits 192:256, shift 12, mask 51). - // Note: not bytes 25:33, shift 4, to avoid overread. - v.l4 = binary.LittleEndian.Uint64(x[24:32]) >> 12 - v.l4 &= maskLow51Bits - - return v -} - -// Bytes returns the canonical 32-byte little-endian encoding of v. -func (v *Element) Bytes() []byte { - // This function is outlined to make the allocations inline in the caller - // rather than happen on the heap. - var out [32]byte - return v.bytes(&out) -} - -func (v *Element) bytes(out *[32]byte) []byte { - t := *v - t.reduce() - - var buf [8]byte - for i, l := range [5]uint64{t.l0, t.l1, t.l2, t.l3, t.l4} { - bitsOffset := i * 51 - binary.LittleEndian.PutUint64(buf[:], l<= len(out) { - break - } - out[off] |= bb - } - } - - return out[:] -} - -// Equal returns 1 if v and u are equal, and 0 otherwise. -func (v *Element) Equal(u *Element) int { - sa, sv := u.Bytes(), v.Bytes() - return subtle.ConstantTimeCompare(sa, sv) -} - -// mask64Bits returns 0xffffffff if cond is 1, and 0 otherwise. -func mask64Bits(cond int) uint64 { return ^(uint64(cond) - 1) } - -// Select sets v to a if cond == 1, and to b if cond == 0. -func (v *Element) Select(a, b *Element, cond int) *Element { - m := mask64Bits(cond) - v.l0 = (m & a.l0) | (^m & b.l0) - v.l1 = (m & a.l1) | (^m & b.l1) - v.l2 = (m & a.l2) | (^m & b.l2) - v.l3 = (m & a.l3) | (^m & b.l3) - v.l4 = (m & a.l4) | (^m & b.l4) - return v -} - -// Swap swaps v and u if cond == 1 or leaves them unchanged if cond == 0, and returns v. -func (v *Element) Swap(u *Element, cond int) { - m := mask64Bits(cond) - t := m & (v.l0 ^ u.l0) - v.l0 ^= t - u.l0 ^= t - t = m & (v.l1 ^ u.l1) - v.l1 ^= t - u.l1 ^= t - t = m & (v.l2 ^ u.l2) - v.l2 ^= t - u.l2 ^= t - t = m & (v.l3 ^ u.l3) - v.l3 ^= t - u.l3 ^= t - t = m & (v.l4 ^ u.l4) - v.l4 ^= t - u.l4 ^= t -} - -// IsNegative returns 1 if v is negative, and 0 otherwise. -func (v *Element) IsNegative() int { - return int(v.Bytes()[0] & 1) -} - -// Absolute sets v to |u|, and returns v. -func (v *Element) Absolute(u *Element) *Element { - return v.Select(new(Element).Negate(u), u, u.IsNegative()) -} - -// Multiply sets v = x * y, and returns v. -func (v *Element) Multiply(x, y *Element) *Element { - feMul(v, x, y) - return v -} - -// Square sets v = x * x, and returns v. -func (v *Element) Square(x *Element) *Element { - feSquare(v, x) - return v -} - -// Mult32 sets v = x * y, and returns v. -func (v *Element) Mult32(x *Element, y uint32) *Element { - x0lo, x0hi := mul51(x.l0, y) - x1lo, x1hi := mul51(x.l1, y) - x2lo, x2hi := mul51(x.l2, y) - x3lo, x3hi := mul51(x.l3, y) - x4lo, x4hi := mul51(x.l4, y) - v.l0 = x0lo + 19*x4hi // carried over per the reduction identity - v.l1 = x1lo + x0hi - v.l2 = x2lo + x1hi - v.l3 = x3lo + x2hi - v.l4 = x4lo + x3hi - // The hi portions are going to be only 32 bits, plus any previous excess, - // so we can skip the carry propagation. - return v -} - -// mul51 returns lo + hi * 2⁵¹ = a * b. -func mul51(a uint64, b uint32) (lo uint64, hi uint64) { - mh, ml := bits.Mul64(a, uint64(b)) - lo = ml & maskLow51Bits - hi = (mh << 13) | (ml >> 51) - return -} - -// Pow22523 set v = x^((p-5)/8), and returns v. (p-5)/8 is 2^252-3. -func (v *Element) Pow22523(x *Element) *Element { - var t0, t1, t2 Element - - t0.Square(x) // x^2 - t1.Square(&t0) // x^4 - t1.Square(&t1) // x^8 - t1.Multiply(x, &t1) // x^9 - t0.Multiply(&t0, &t1) // x^11 - t0.Square(&t0) // x^22 - t0.Multiply(&t1, &t0) // x^31 - t1.Square(&t0) // x^62 - for i := 1; i < 5; i++ { // x^992 - t1.Square(&t1) - } - t0.Multiply(&t1, &t0) // x^1023 -> 1023 = 2^10 - 1 - t1.Square(&t0) // 2^11 - 2 - for i := 1; i < 10; i++ { // 2^20 - 2^10 - t1.Square(&t1) - } - t1.Multiply(&t1, &t0) // 2^20 - 1 - t2.Square(&t1) // 2^21 - 2 - for i := 1; i < 20; i++ { // 2^40 - 2^20 - t2.Square(&t2) - } - t1.Multiply(&t2, &t1) // 2^40 - 1 - t1.Square(&t1) // 2^41 - 2 - for i := 1; i < 10; i++ { // 2^50 - 2^10 - t1.Square(&t1) - } - t0.Multiply(&t1, &t0) // 2^50 - 1 - t1.Square(&t0) // 2^51 - 2 - for i := 1; i < 50; i++ { // 2^100 - 2^50 - t1.Square(&t1) - } - t1.Multiply(&t1, &t0) // 2^100 - 1 - t2.Square(&t1) // 2^101 - 2 - for i := 1; i < 100; i++ { // 2^200 - 2^100 - t2.Square(&t2) - } - t1.Multiply(&t2, &t1) // 2^200 - 1 - t1.Square(&t1) // 2^201 - 2 - for i := 1; i < 50; i++ { // 2^250 - 2^50 - t1.Square(&t1) - } - t0.Multiply(&t1, &t0) // 2^250 - 1 - t0.Square(&t0) // 2^251 - 2 - t0.Square(&t0) // 2^252 - 4 - return v.Multiply(&t0, x) // 2^252 - 3 -> x^(2^252-3) -} - -// sqrtM1 is 2^((p-1)/4), which squared is equal to -1 by Euler's Criterion. -var sqrtM1 = &Element{1718705420411056, 234908883556509, - 2233514472574048, 2117202627021982, 765476049583133} - -// SqrtRatio sets r to the non-negative square root of the ratio of u and v. -// -// If u/v is square, SqrtRatio returns r and 1. If u/v is not square, SqrtRatio -// sets r according to Section 4.3 of draft-irtf-cfrg-ristretto255-decaf448-00, -// and returns r and 0. -func (r *Element) SqrtRatio(u, v *Element) (rr *Element, wasSquare int) { - var a, b Element - - // r = (u * v3) * (u * v7)^((p-5)/8) - v2 := a.Square(v) - uv3 := b.Multiply(u, b.Multiply(v2, v)) - uv7 := a.Multiply(uv3, a.Square(v2)) - r.Multiply(uv3, r.Pow22523(uv7)) - - check := a.Multiply(v, a.Square(r)) // check = v * r^2 - - uNeg := b.Negate(u) - correctSignSqrt := check.Equal(u) - flippedSignSqrt := check.Equal(uNeg) - flippedSignSqrtI := check.Equal(uNeg.Multiply(uNeg, sqrtM1)) - - rPrime := b.Multiply(r, sqrtM1) // r_prime = SQRT_M1 * r - // r = CT_SELECT(r_prime IF flipped_sign_sqrt | flipped_sign_sqrt_i ELSE r) - r.Select(rPrime, r, flippedSignSqrt|flippedSignSqrtI) - - r.Absolute(r) // Choose the nonnegative square root. - return r, correctSignSqrt | flippedSignSqrt -} diff --git a/curve25519/internal/field/fe_alias_test.go b/curve25519/internal/field/fe_alias_test.go deleted file mode 100644 index 64e57c4f35..0000000000 --- a/curve25519/internal/field/fe_alias_test.go +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package field - -import ( - "testing" - "testing/quick" -) - -func checkAliasingOneArg(f func(v, x *Element) *Element) func(v, x Element) bool { - return func(v, x Element) bool { - x1, v1 := x, x - - // Calculate a reference f(x) without aliasing. - if out := f(&v, &x); out != &v && isInBounds(out) { - return false - } - - // Test aliasing the argument and the receiver. - if out := f(&v1, &v1); out != &v1 || v1 != v { - return false - } - - // Ensure the arguments was not modified. - return x == x1 - } -} - -func checkAliasingTwoArgs(f func(v, x, y *Element) *Element) func(v, x, y Element) bool { - return func(v, x, y Element) bool { - x1, y1, v1 := x, y, Element{} - - // Calculate a reference f(x, y) without aliasing. - if out := f(&v, &x, &y); out != &v && isInBounds(out) { - return false - } - - // Test aliasing the first argument and the receiver. - v1 = x - if out := f(&v1, &v1, &y); out != &v1 || v1 != v { - return false - } - // Test aliasing the second argument and the receiver. - v1 = y - if out := f(&v1, &x, &v1); out != &v1 || v1 != v { - return false - } - - // Calculate a reference f(x, x) without aliasing. - if out := f(&v, &x, &x); out != &v { - return false - } - - // Test aliasing the first argument and the receiver. - v1 = x - if out := f(&v1, &v1, &x); out != &v1 || v1 != v { - return false - } - // Test aliasing the second argument and the receiver. - v1 = x - if out := f(&v1, &x, &v1); out != &v1 || v1 != v { - return false - } - // Test aliasing both arguments and the receiver. - v1 = x - if out := f(&v1, &v1, &v1); out != &v1 || v1 != v { - return false - } - - // Ensure the arguments were not modified. - return x == x1 && y == y1 - } -} - -// TestAliasing checks that receivers and arguments can alias each other without -// leading to incorrect results. That is, it ensures that it's safe to write -// -// v.Invert(v) -// -// or -// -// v.Add(v, v) -// -// without any of the inputs getting clobbered by the output being written. -func TestAliasing(t *testing.T) { - type target struct { - name string - oneArgF func(v, x *Element) *Element - twoArgsF func(v, x, y *Element) *Element - } - for _, tt := range []target{ - {name: "Absolute", oneArgF: (*Element).Absolute}, - {name: "Invert", oneArgF: (*Element).Invert}, - {name: "Negate", oneArgF: (*Element).Negate}, - {name: "Set", oneArgF: (*Element).Set}, - {name: "Square", oneArgF: (*Element).Square}, - {name: "Multiply", twoArgsF: (*Element).Multiply}, - {name: "Add", twoArgsF: (*Element).Add}, - {name: "Subtract", twoArgsF: (*Element).Subtract}, - { - name: "Select0", - twoArgsF: func(v, x, y *Element) *Element { - return (*Element).Select(v, x, y, 0) - }, - }, - { - name: "Select1", - twoArgsF: func(v, x, y *Element) *Element { - return (*Element).Select(v, x, y, 1) - }, - }, - } { - var err error - switch { - case tt.oneArgF != nil: - err = quick.Check(checkAliasingOneArg(tt.oneArgF), &quick.Config{MaxCountScale: 1 << 8}) - case tt.twoArgsF != nil: - err = quick.Check(checkAliasingTwoArgs(tt.twoArgsF), &quick.Config{MaxCountScale: 1 << 8}) - } - if err != nil { - t.Errorf("%v: %v", tt.name, err) - } - } -} diff --git a/curve25519/internal/field/fe_amd64.go b/curve25519/internal/field/fe_amd64.go deleted file mode 100644 index 70c541692c..0000000000 --- a/curve25519/internal/field/fe_amd64.go +++ /dev/null @@ -1,15 +0,0 @@ -// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. - -//go:build amd64 && gc && !purego - -package field - -// feMul sets out = a * b. It works like feMulGeneric. -// -//go:noescape -func feMul(out *Element, a *Element, b *Element) - -// feSquare sets out = a * a. It works like feSquareGeneric. -// -//go:noescape -func feSquare(out *Element, a *Element) diff --git a/curve25519/internal/field/fe_amd64.s b/curve25519/internal/field/fe_amd64.s deleted file mode 100644 index 60817acc41..0000000000 --- a/curve25519/internal/field/fe_amd64.s +++ /dev/null @@ -1,378 +0,0 @@ -// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. - -//go:build amd64 && gc && !purego - -#include "textflag.h" - -// func feMul(out *Element, a *Element, b *Element) -TEXT ·feMul(SB), NOSPLIT, $0-24 - MOVQ a+8(FP), CX - MOVQ b+16(FP), BX - - // r0 = a0×b0 - MOVQ (CX), AX - MULQ (BX) - MOVQ AX, DI - MOVQ DX, SI - - // r0 += 19×a1×b4 - MOVQ 8(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r0 += 19×a2×b3 - MOVQ 16(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r0 += 19×a3×b2 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 16(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r0 += 19×a4×b1 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 8(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r1 = a0×b1 - MOVQ (CX), AX - MULQ 8(BX) - MOVQ AX, R9 - MOVQ DX, R8 - - // r1 += a1×b0 - MOVQ 8(CX), AX - MULQ (BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r1 += 19×a2×b4 - MOVQ 16(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r1 += 19×a3×b3 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r1 += 19×a4×b2 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 16(BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r2 = a0×b2 - MOVQ (CX), AX - MULQ 16(BX) - MOVQ AX, R11 - MOVQ DX, R10 - - // r2 += a1×b1 - MOVQ 8(CX), AX - MULQ 8(BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r2 += a2×b0 - MOVQ 16(CX), AX - MULQ (BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r2 += 19×a3×b4 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r2 += 19×a4×b3 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r3 = a0×b3 - MOVQ (CX), AX - MULQ 24(BX) - MOVQ AX, R13 - MOVQ DX, R12 - - // r3 += a1×b2 - MOVQ 8(CX), AX - MULQ 16(BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r3 += a2×b1 - MOVQ 16(CX), AX - MULQ 8(BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r3 += a3×b0 - MOVQ 24(CX), AX - MULQ (BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r3 += 19×a4×b4 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r4 = a0×b4 - MOVQ (CX), AX - MULQ 32(BX) - MOVQ AX, R15 - MOVQ DX, R14 - - // r4 += a1×b3 - MOVQ 8(CX), AX - MULQ 24(BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // r4 += a2×b2 - MOVQ 16(CX), AX - MULQ 16(BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // r4 += a3×b1 - MOVQ 24(CX), AX - MULQ 8(BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // r4 += a4×b0 - MOVQ 32(CX), AX - MULQ (BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // First reduction chain - MOVQ $0x0007ffffffffffff, AX - SHLQ $0x0d, DI, SI - SHLQ $0x0d, R9, R8 - SHLQ $0x0d, R11, R10 - SHLQ $0x0d, R13, R12 - SHLQ $0x0d, R15, R14 - ANDQ AX, DI - IMUL3Q $0x13, R14, R14 - ADDQ R14, DI - ANDQ AX, R9 - ADDQ SI, R9 - ANDQ AX, R11 - ADDQ R8, R11 - ANDQ AX, R13 - ADDQ R10, R13 - ANDQ AX, R15 - ADDQ R12, R15 - - // Second reduction chain (carryPropagate) - MOVQ DI, SI - SHRQ $0x33, SI - MOVQ R9, R8 - SHRQ $0x33, R8 - MOVQ R11, R10 - SHRQ $0x33, R10 - MOVQ R13, R12 - SHRQ $0x33, R12 - MOVQ R15, R14 - SHRQ $0x33, R14 - ANDQ AX, DI - IMUL3Q $0x13, R14, R14 - ADDQ R14, DI - ANDQ AX, R9 - ADDQ SI, R9 - ANDQ AX, R11 - ADDQ R8, R11 - ANDQ AX, R13 - ADDQ R10, R13 - ANDQ AX, R15 - ADDQ R12, R15 - - // Store output - MOVQ out+0(FP), AX - MOVQ DI, (AX) - MOVQ R9, 8(AX) - MOVQ R11, 16(AX) - MOVQ R13, 24(AX) - MOVQ R15, 32(AX) - RET - -// func feSquare(out *Element, a *Element) -TEXT ·feSquare(SB), NOSPLIT, $0-16 - MOVQ a+8(FP), CX - - // r0 = l0×l0 - MOVQ (CX), AX - MULQ (CX) - MOVQ AX, SI - MOVQ DX, BX - - // r0 += 38×l1×l4 - MOVQ 8(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, SI - ADCQ DX, BX - - // r0 += 38×l2×l3 - MOVQ 16(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 24(CX) - ADDQ AX, SI - ADCQ DX, BX - - // r1 = 2×l0×l1 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 8(CX) - MOVQ AX, R8 - MOVQ DX, DI - - // r1 += 38×l2×l4 - MOVQ 16(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, R8 - ADCQ DX, DI - - // r1 += 19×l3×l3 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(CX) - ADDQ AX, R8 - ADCQ DX, DI - - // r2 = 2×l0×l2 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 16(CX) - MOVQ AX, R10 - MOVQ DX, R9 - - // r2 += l1×l1 - MOVQ 8(CX), AX - MULQ 8(CX) - ADDQ AX, R10 - ADCQ DX, R9 - - // r2 += 38×l3×l4 - MOVQ 24(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, R10 - ADCQ DX, R9 - - // r3 = 2×l0×l3 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 24(CX) - MOVQ AX, R12 - MOVQ DX, R11 - - // r3 += 2×l1×l2 - MOVQ 8(CX), AX - IMUL3Q $0x02, AX, AX - MULQ 16(CX) - ADDQ AX, R12 - ADCQ DX, R11 - - // r3 += 19×l4×l4 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(CX) - ADDQ AX, R12 - ADCQ DX, R11 - - // r4 = 2×l0×l4 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 32(CX) - MOVQ AX, R14 - MOVQ DX, R13 - - // r4 += 2×l1×l3 - MOVQ 8(CX), AX - IMUL3Q $0x02, AX, AX - MULQ 24(CX) - ADDQ AX, R14 - ADCQ DX, R13 - - // r4 += l2×l2 - MOVQ 16(CX), AX - MULQ 16(CX) - ADDQ AX, R14 - ADCQ DX, R13 - - // First reduction chain - MOVQ $0x0007ffffffffffff, AX - SHLQ $0x0d, SI, BX - SHLQ $0x0d, R8, DI - SHLQ $0x0d, R10, R9 - SHLQ $0x0d, R12, R11 - SHLQ $0x0d, R14, R13 - ANDQ AX, SI - IMUL3Q $0x13, R13, R13 - ADDQ R13, SI - ANDQ AX, R8 - ADDQ BX, R8 - ANDQ AX, R10 - ADDQ DI, R10 - ANDQ AX, R12 - ADDQ R9, R12 - ANDQ AX, R14 - ADDQ R11, R14 - - // Second reduction chain (carryPropagate) - MOVQ SI, BX - SHRQ $0x33, BX - MOVQ R8, DI - SHRQ $0x33, DI - MOVQ R10, R9 - SHRQ $0x33, R9 - MOVQ R12, R11 - SHRQ $0x33, R11 - MOVQ R14, R13 - SHRQ $0x33, R13 - ANDQ AX, SI - IMUL3Q $0x13, R13, R13 - ADDQ R13, SI - ANDQ AX, R8 - ADDQ BX, R8 - ANDQ AX, R10 - ADDQ DI, R10 - ANDQ AX, R12 - ADDQ R9, R12 - ANDQ AX, R14 - ADDQ R11, R14 - - // Store output - MOVQ out+0(FP), AX - MOVQ SI, (AX) - MOVQ R8, 8(AX) - MOVQ R10, 16(AX) - MOVQ R12, 24(AX) - MOVQ R14, 32(AX) - RET diff --git a/curve25519/internal/field/fe_amd64_noasm.go b/curve25519/internal/field/fe_amd64_noasm.go deleted file mode 100644 index 9da280d1d8..0000000000 --- a/curve25519/internal/field/fe_amd64_noasm.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !amd64 || !gc || purego - -package field - -func feMul(v, x, y *Element) { feMulGeneric(v, x, y) } - -func feSquare(v, x *Element) { feSquareGeneric(v, x) } diff --git a/curve25519/internal/field/fe_arm64.go b/curve25519/internal/field/fe_arm64.go deleted file mode 100644 index 075fe9b925..0000000000 --- a/curve25519/internal/field/fe_arm64.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build arm64 && gc && !purego - -package field - -//go:noescape -func carryPropagate(v *Element) - -func (v *Element) carryPropagate() *Element { - carryPropagate(v) - return v -} diff --git a/curve25519/internal/field/fe_arm64.s b/curve25519/internal/field/fe_arm64.s deleted file mode 100644 index 3126a43419..0000000000 --- a/curve25519/internal/field/fe_arm64.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build arm64 && gc && !purego - -#include "textflag.h" - -// carryPropagate works exactly like carryPropagateGeneric and uses the -// same AND, ADD, and LSR+MADD instructions emitted by the compiler, but -// avoids loading R0-R4 twice and uses LDP and STP. -// -// See https://golang.org/issues/43145 for the main compiler issue. -// -// func carryPropagate(v *Element) -TEXT ·carryPropagate(SB),NOFRAME|NOSPLIT,$0-8 - MOVD v+0(FP), R20 - - LDP 0(R20), (R0, R1) - LDP 16(R20), (R2, R3) - MOVD 32(R20), R4 - - AND $0x7ffffffffffff, R0, R10 - AND $0x7ffffffffffff, R1, R11 - AND $0x7ffffffffffff, R2, R12 - AND $0x7ffffffffffff, R3, R13 - AND $0x7ffffffffffff, R4, R14 - - ADD R0>>51, R11, R11 - ADD R1>>51, R12, R12 - ADD R2>>51, R13, R13 - ADD R3>>51, R14, R14 - // R4>>51 * 19 + R10 -> R10 - LSR $51, R4, R21 - MOVD $19, R22 - MADD R22, R10, R21, R10 - - STP (R10, R11), 0(R20) - STP (R12, R13), 16(R20) - MOVD R14, 32(R20) - - RET diff --git a/curve25519/internal/field/fe_arm64_noasm.go b/curve25519/internal/field/fe_arm64_noasm.go deleted file mode 100644 index fc029ac12d..0000000000 --- a/curve25519/internal/field/fe_arm64_noasm.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !arm64 || !gc || purego - -package field - -func (v *Element) carryPropagate() *Element { - return v.carryPropagateGeneric() -} diff --git a/curve25519/internal/field/fe_bench_test.go b/curve25519/internal/field/fe_bench_test.go deleted file mode 100644 index 77dc06cf98..0000000000 --- a/curve25519/internal/field/fe_bench_test.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package field - -import "testing" - -func BenchmarkAdd(b *testing.B) { - var x, y Element - x.One() - y.Add(feOne, feOne) - b.ResetTimer() - for i := 0; i < b.N; i++ { - x.Add(&x, &y) - } -} - -func BenchmarkMultiply(b *testing.B) { - var x, y Element - x.One() - y.Add(feOne, feOne) - b.ResetTimer() - for i := 0; i < b.N; i++ { - x.Multiply(&x, &y) - } -} - -func BenchmarkMult32(b *testing.B) { - var x Element - x.One() - b.ResetTimer() - for i := 0; i < b.N; i++ { - x.Mult32(&x, 0xaa42aa42) - } -} diff --git a/curve25519/internal/field/fe_generic.go b/curve25519/internal/field/fe_generic.go deleted file mode 100644 index 2671217da5..0000000000 --- a/curve25519/internal/field/fe_generic.go +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package field - -import "math/bits" - -// uint128 holds a 128-bit number as two 64-bit limbs, for use with the -// bits.Mul64 and bits.Add64 intrinsics. -type uint128 struct { - lo, hi uint64 -} - -// mul64 returns a * b. -func mul64(a, b uint64) uint128 { - hi, lo := bits.Mul64(a, b) - return uint128{lo, hi} -} - -// addMul64 returns v + a * b. -func addMul64(v uint128, a, b uint64) uint128 { - hi, lo := bits.Mul64(a, b) - lo, c := bits.Add64(lo, v.lo, 0) - hi, _ = bits.Add64(hi, v.hi, c) - return uint128{lo, hi} -} - -// shiftRightBy51 returns a >> 51. a is assumed to be at most 115 bits. -func shiftRightBy51(a uint128) uint64 { - return (a.hi << (64 - 51)) | (a.lo >> 51) -} - -func feMulGeneric(v, a, b *Element) { - a0 := a.l0 - a1 := a.l1 - a2 := a.l2 - a3 := a.l3 - a4 := a.l4 - - b0 := b.l0 - b1 := b.l1 - b2 := b.l2 - b3 := b.l3 - b4 := b.l4 - - // Limb multiplication works like pen-and-paper columnar multiplication, but - // with 51-bit limbs instead of digits. - // - // a4 a3 a2 a1 a0 x - // b4 b3 b2 b1 b0 = - // ------------------------ - // a4b0 a3b0 a2b0 a1b0 a0b0 + - // a4b1 a3b1 a2b1 a1b1 a0b1 + - // a4b2 a3b2 a2b2 a1b2 a0b2 + - // a4b3 a3b3 a2b3 a1b3 a0b3 + - // a4b4 a3b4 a2b4 a1b4 a0b4 = - // ---------------------------------------------- - // r8 r7 r6 r5 r4 r3 r2 r1 r0 - // - // We can then use the reduction identity (a * 2²⁵⁵ + b = a * 19 + b) to - // reduce the limbs that would overflow 255 bits. r5 * 2²⁵⁵ becomes 19 * r5, - // r6 * 2³⁰⁶ becomes 19 * r6 * 2⁵¹, etc. - // - // Reduction can be carried out simultaneously to multiplication. For - // example, we do not compute r5: whenever the result of a multiplication - // belongs to r5, like a1b4, we multiply it by 19 and add the result to r0. - // - // a4b0 a3b0 a2b0 a1b0 a0b0 + - // a3b1 a2b1 a1b1 a0b1 19×a4b1 + - // a2b2 a1b2 a0b2 19×a4b2 19×a3b2 + - // a1b3 a0b3 19×a4b3 19×a3b3 19×a2b3 + - // a0b4 19×a4b4 19×a3b4 19×a2b4 19×a1b4 = - // -------------------------------------- - // r4 r3 r2 r1 r0 - // - // Finally we add up the columns into wide, overlapping limbs. - - a1_19 := a1 * 19 - a2_19 := a2 * 19 - a3_19 := a3 * 19 - a4_19 := a4 * 19 - - // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) - r0 := mul64(a0, b0) - r0 = addMul64(r0, a1_19, b4) - r0 = addMul64(r0, a2_19, b3) - r0 = addMul64(r0, a3_19, b2) - r0 = addMul64(r0, a4_19, b1) - - // r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2) - r1 := mul64(a0, b1) - r1 = addMul64(r1, a1, b0) - r1 = addMul64(r1, a2_19, b4) - r1 = addMul64(r1, a3_19, b3) - r1 = addMul64(r1, a4_19, b2) - - // r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3) - r2 := mul64(a0, b2) - r2 = addMul64(r2, a1, b1) - r2 = addMul64(r2, a2, b0) - r2 = addMul64(r2, a3_19, b4) - r2 = addMul64(r2, a4_19, b3) - - // r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4 - r3 := mul64(a0, b3) - r3 = addMul64(r3, a1, b2) - r3 = addMul64(r3, a2, b1) - r3 = addMul64(r3, a3, b0) - r3 = addMul64(r3, a4_19, b4) - - // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 - r4 := mul64(a0, b4) - r4 = addMul64(r4, a1, b3) - r4 = addMul64(r4, a2, b2) - r4 = addMul64(r4, a3, b1) - r4 = addMul64(r4, a4, b0) - - // After the multiplication, we need to reduce (carry) the five coefficients - // to obtain a result with limbs that are at most slightly larger than 2⁵¹, - // to respect the Element invariant. - // - // Overall, the reduction works the same as carryPropagate, except with - // wider inputs: we take the carry for each coefficient by shifting it right - // by 51, and add it to the limb above it. The top carry is multiplied by 19 - // according to the reduction identity and added to the lowest limb. - // - // The largest coefficient (r0) will be at most 111 bits, which guarantees - // that all carries are at most 111 - 51 = 60 bits, which fits in a uint64. - // - // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) - // r0 < 2⁵²×2⁵² + 19×(2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵²) - // r0 < (1 + 19 × 4) × 2⁵² × 2⁵² - // r0 < 2⁷ × 2⁵² × 2⁵² - // r0 < 2¹¹¹ - // - // Moreover, the top coefficient (r4) is at most 107 bits, so c4 is at most - // 56 bits, and c4 * 19 is at most 61 bits, which again fits in a uint64 and - // allows us to easily apply the reduction identity. - // - // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 - // r4 < 5 × 2⁵² × 2⁵² - // r4 < 2¹⁰⁷ - // - - c0 := shiftRightBy51(r0) - c1 := shiftRightBy51(r1) - c2 := shiftRightBy51(r2) - c3 := shiftRightBy51(r3) - c4 := shiftRightBy51(r4) - - rr0 := r0.lo&maskLow51Bits + c4*19 - rr1 := r1.lo&maskLow51Bits + c0 - rr2 := r2.lo&maskLow51Bits + c1 - rr3 := r3.lo&maskLow51Bits + c2 - rr4 := r4.lo&maskLow51Bits + c3 - - // Now all coefficients fit into 64-bit registers but are still too large to - // be passed around as a Element. We therefore do one last carry chain, - // where the carries will be small enough to fit in the wiggle room above 2⁵¹. - *v = Element{rr0, rr1, rr2, rr3, rr4} - v.carryPropagate() -} - -func feSquareGeneric(v, a *Element) { - l0 := a.l0 - l1 := a.l1 - l2 := a.l2 - l3 := a.l3 - l4 := a.l4 - - // Squaring works precisely like multiplication above, but thanks to its - // symmetry we get to group a few terms together. - // - // l4 l3 l2 l1 l0 x - // l4 l3 l2 l1 l0 = - // ------------------------ - // l4l0 l3l0 l2l0 l1l0 l0l0 + - // l4l1 l3l1 l2l1 l1l1 l0l1 + - // l4l2 l3l2 l2l2 l1l2 l0l2 + - // l4l3 l3l3 l2l3 l1l3 l0l3 + - // l4l4 l3l4 l2l4 l1l4 l0l4 = - // ---------------------------------------------- - // r8 r7 r6 r5 r4 r3 r2 r1 r0 - // - // l4l0 l3l0 l2l0 l1l0 l0l0 + - // l3l1 l2l1 l1l1 l0l1 19×l4l1 + - // l2l2 l1l2 l0l2 19×l4l2 19×l3l2 + - // l1l3 l0l3 19×l4l3 19×l3l3 19×l2l3 + - // l0l4 19×l4l4 19×l3l4 19×l2l4 19×l1l4 = - // -------------------------------------- - // r4 r3 r2 r1 r0 - // - // With precomputed 2×, 19×, and 2×19× terms, we can compute each limb with - // only three Mul64 and four Add64, instead of five and eight. - - l0_2 := l0 * 2 - l1_2 := l1 * 2 - - l1_38 := l1 * 38 - l2_38 := l2 * 38 - l3_38 := l3 * 38 - - l3_19 := l3 * 19 - l4_19 := l4 * 19 - - // r0 = l0×l0 + 19×(l1×l4 + l2×l3 + l3×l2 + l4×l1) = l0×l0 + 19×2×(l1×l4 + l2×l3) - r0 := mul64(l0, l0) - r0 = addMul64(r0, l1_38, l4) - r0 = addMul64(r0, l2_38, l3) - - // r1 = l0×l1 + l1×l0 + 19×(l2×l4 + l3×l3 + l4×l2) = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3 - r1 := mul64(l0_2, l1) - r1 = addMul64(r1, l2_38, l4) - r1 = addMul64(r1, l3_19, l3) - - // r2 = l0×l2 + l1×l1 + l2×l0 + 19×(l3×l4 + l4×l3) = 2×l0×l2 + l1×l1 + 19×2×l3×l4 - r2 := mul64(l0_2, l2) - r2 = addMul64(r2, l1, l1) - r2 = addMul64(r2, l3_38, l4) - - // r3 = l0×l3 + l1×l2 + l2×l1 + l3×l0 + 19×l4×l4 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4 - r3 := mul64(l0_2, l3) - r3 = addMul64(r3, l1_2, l2) - r3 = addMul64(r3, l4_19, l4) - - // r4 = l0×l4 + l1×l3 + l2×l2 + l3×l1 + l4×l0 = 2×l0×l4 + 2×l1×l3 + l2×l2 - r4 := mul64(l0_2, l4) - r4 = addMul64(r4, l1_2, l3) - r4 = addMul64(r4, l2, l2) - - c0 := shiftRightBy51(r0) - c1 := shiftRightBy51(r1) - c2 := shiftRightBy51(r2) - c3 := shiftRightBy51(r3) - c4 := shiftRightBy51(r4) - - rr0 := r0.lo&maskLow51Bits + c4*19 - rr1 := r1.lo&maskLow51Bits + c0 - rr2 := r2.lo&maskLow51Bits + c1 - rr3 := r3.lo&maskLow51Bits + c2 - rr4 := r4.lo&maskLow51Bits + c3 - - *v = Element{rr0, rr1, rr2, rr3, rr4} - v.carryPropagate() -} - -// carryPropagateGeneric brings the limbs below 52 bits by applying the reduction -// identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry. TODO inline -func (v *Element) carryPropagateGeneric() *Element { - c0 := v.l0 >> 51 - c1 := v.l1 >> 51 - c2 := v.l2 >> 51 - c3 := v.l3 >> 51 - c4 := v.l4 >> 51 - - v.l0 = v.l0&maskLow51Bits + c4*19 - v.l1 = v.l1&maskLow51Bits + c0 - v.l2 = v.l2&maskLow51Bits + c1 - v.l3 = v.l3&maskLow51Bits + c2 - v.l4 = v.l4&maskLow51Bits + c3 - - return v -} diff --git a/curve25519/internal/field/fe_test.go b/curve25519/internal/field/fe_test.go deleted file mode 100644 index b484459ff2..0000000000 --- a/curve25519/internal/field/fe_test.go +++ /dev/null @@ -1,558 +0,0 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package field - -import ( - "bytes" - "crypto/rand" - "encoding/hex" - "io" - "math/big" - "math/bits" - mathrand "math/rand" - "reflect" - "testing" - "testing/quick" -) - -func (v Element) String() string { - return hex.EncodeToString(v.Bytes()) -} - -// quickCheckConfig1024 will make each quickcheck test run (1024 * -quickchecks) -// times. The default value of -quickchecks is 100. -var quickCheckConfig1024 = &quick.Config{MaxCountScale: 1 << 10} - -func generateFieldElement(rand *mathrand.Rand) Element { - const maskLow52Bits = (1 << 52) - 1 - return Element{ - rand.Uint64() & maskLow52Bits, - rand.Uint64() & maskLow52Bits, - rand.Uint64() & maskLow52Bits, - rand.Uint64() & maskLow52Bits, - rand.Uint64() & maskLow52Bits, - } -} - -// weirdLimbs can be combined to generate a range of edge-case field elements. -// 0 and -1 are intentionally more weighted, as they combine well. -var ( - weirdLimbs51 = []uint64{ - 0, 0, 0, 0, - 1, - 19 - 1, - 19, - 0x2aaaaaaaaaaaa, - 0x5555555555555, - (1 << 51) - 20, - (1 << 51) - 19, - (1 << 51) - 1, (1 << 51) - 1, - (1 << 51) - 1, (1 << 51) - 1, - } - weirdLimbs52 = []uint64{ - 0, 0, 0, 0, 0, 0, - 1, - 19 - 1, - 19, - 0x2aaaaaaaaaaaa, - 0x5555555555555, - (1 << 51) - 20, - (1 << 51) - 19, - (1 << 51) - 1, (1 << 51) - 1, - (1 << 51) - 1, (1 << 51) - 1, - (1 << 51) - 1, (1 << 51) - 1, - 1 << 51, - (1 << 51) + 1, - (1 << 52) - 19, - (1 << 52) - 1, - } -) - -func generateWeirdFieldElement(rand *mathrand.Rand) Element { - return Element{ - weirdLimbs52[rand.Intn(len(weirdLimbs52))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - } -} - -func (Element) Generate(rand *mathrand.Rand, size int) reflect.Value { - if rand.Intn(2) == 0 { - return reflect.ValueOf(generateWeirdFieldElement(rand)) - } - return reflect.ValueOf(generateFieldElement(rand)) -} - -// isInBounds returns whether the element is within the expected bit size bounds -// after a light reduction. -func isInBounds(x *Element) bool { - return bits.Len64(x.l0) <= 52 && - bits.Len64(x.l1) <= 52 && - bits.Len64(x.l2) <= 52 && - bits.Len64(x.l3) <= 52 && - bits.Len64(x.l4) <= 52 -} - -func TestMultiplyDistributesOverAdd(t *testing.T) { - multiplyDistributesOverAdd := func(x, y, z Element) bool { - // Compute t1 = (x+y)*z - t1 := new(Element) - t1.Add(&x, &y) - t1.Multiply(t1, &z) - - // Compute t2 = x*z + y*z - t2 := new(Element) - t3 := new(Element) - t2.Multiply(&x, &z) - t3.Multiply(&y, &z) - t2.Add(t2, t3) - - return t1.Equal(t2) == 1 && isInBounds(t1) && isInBounds(t2) - } - - if err := quick.Check(multiplyDistributesOverAdd, quickCheckConfig1024); err != nil { - t.Error(err) - } -} - -func TestMul64to128(t *testing.T) { - a := uint64(5) - b := uint64(5) - r := mul64(a, b) - if r.lo != 0x19 || r.hi != 0 { - t.Errorf("lo-range wide mult failed, got %d + %d*(2**64)", r.lo, r.hi) - } - - a = uint64(18014398509481983) // 2^54 - 1 - b = uint64(18014398509481983) // 2^54 - 1 - r = mul64(a, b) - if r.lo != 0xff80000000000001 || r.hi != 0xfffffffffff { - t.Errorf("hi-range wide mult failed, got %d + %d*(2**64)", r.lo, r.hi) - } - - a = uint64(1125899906842661) - b = uint64(2097155) - r = mul64(a, b) - r = addMul64(r, a, b) - r = addMul64(r, a, b) - r = addMul64(r, a, b) - r = addMul64(r, a, b) - if r.lo != 16888498990613035 || r.hi != 640 { - t.Errorf("wrong answer: %d + %d*(2**64)", r.lo, r.hi) - } -} - -func TestSetBytesRoundTrip(t *testing.T) { - f1 := func(in [32]byte, fe Element) bool { - fe.SetBytes(in[:]) - - // Mask the most significant bit as it's ignored by SetBytes. (Now - // instead of earlier so we check the masking in SetBytes is working.) - in[len(in)-1] &= (1 << 7) - 1 - - return bytes.Equal(in[:], fe.Bytes()) && isInBounds(&fe) - } - if err := quick.Check(f1, nil); err != nil { - t.Errorf("failed bytes->FE->bytes round-trip: %v", err) - } - - f2 := func(fe, r Element) bool { - r.SetBytes(fe.Bytes()) - - // Intentionally not using Equal not to go through Bytes again. - // Calling reduce because both Generate and SetBytes can produce - // non-canonical representations. - fe.reduce() - r.reduce() - return fe == r - } - if err := quick.Check(f2, nil); err != nil { - t.Errorf("failed FE->bytes->FE round-trip: %v", err) - } - - // Check some fixed vectors from dalek - type feRTTest struct { - fe Element - b []byte - } - var tests = []feRTTest{ - { - fe: Element{358744748052810, 1691584618240980, 977650209285361, 1429865912637724, 560044844278676}, - b: []byte{74, 209, 69, 197, 70, 70, 161, 222, 56, 226, 229, 19, 112, 60, 25, 92, 187, 74, 222, 56, 50, 153, 51, 233, 40, 74, 57, 6, 160, 185, 213, 31}, - }, - { - fe: Element{84926274344903, 473620666599931, 365590438845504, 1028470286882429, 2146499180330972}, - b: []byte{199, 23, 106, 112, 61, 77, 216, 79, 186, 60, 11, 118, 13, 16, 103, 15, 42, 32, 83, 250, 44, 57, 204, 198, 78, 199, 253, 119, 146, 172, 3, 122}, - }, - } - - for _, tt := range tests { - b := tt.fe.Bytes() - if !bytes.Equal(b, tt.b) || new(Element).SetBytes(tt.b).Equal(&tt.fe) != 1 { - t.Errorf("Failed fixed roundtrip: %v", tt) - } - } -} - -func swapEndianness(buf []byte) []byte { - for i := 0; i < len(buf)/2; i++ { - buf[i], buf[len(buf)-i-1] = buf[len(buf)-i-1], buf[i] - } - return buf -} - -func TestBytesBigEquivalence(t *testing.T) { - f1 := func(in [32]byte, fe, fe1 Element) bool { - fe.SetBytes(in[:]) - - in[len(in)-1] &= (1 << 7) - 1 // mask the most significant bit - b := new(big.Int).SetBytes(swapEndianness(in[:])) - fe1.fromBig(b) - - if fe != fe1 { - return false - } - - buf := make([]byte, 32) // pad with zeroes - copy(buf, swapEndianness(fe1.toBig().Bytes())) - - return bytes.Equal(fe.Bytes(), buf) && isInBounds(&fe) && isInBounds(&fe1) - } - if err := quick.Check(f1, nil); err != nil { - t.Error(err) - } -} - -// fromBig sets v = n, and returns v. The bit length of n must not exceed 256. -func (v *Element) fromBig(n *big.Int) *Element { - if n.BitLen() > 32*8 { - panic("edwards25519: invalid field element input size") - } - - buf := make([]byte, 0, 32) - for _, word := range n.Bits() { - for i := 0; i < bits.UintSize; i += 8 { - if len(buf) >= cap(buf) { - break - } - buf = append(buf, byte(word)) - word >>= 8 - } - } - - return v.SetBytes(buf[:32]) -} - -func (v *Element) fromDecimal(s string) *Element { - n, ok := new(big.Int).SetString(s, 10) - if !ok { - panic("not a valid decimal: " + s) - } - return v.fromBig(n) -} - -// toBig returns v as a big.Int. -func (v *Element) toBig() *big.Int { - buf := v.Bytes() - - words := make([]big.Word, 32*8/bits.UintSize) - for n := range words { - for i := 0; i < bits.UintSize; i += 8 { - if len(buf) == 0 { - break - } - words[n] |= big.Word(buf[0]) << big.Word(i) - buf = buf[1:] - } - } - - return new(big.Int).SetBits(words) -} - -func TestDecimalConstants(t *testing.T) { - sqrtM1String := "19681161376707505956807079304988542015446066515923890162744021073123829784752" - if exp := new(Element).fromDecimal(sqrtM1String); sqrtM1.Equal(exp) != 1 { - t.Errorf("sqrtM1 is %v, expected %v", sqrtM1, exp) - } - // d is in the parent package, and we don't want to expose d or fromDecimal. - // dString := "37095705934669439343138083508754565189542113879843219016388785533085940283555" - // if exp := new(Element).fromDecimal(dString); d.Equal(exp) != 1 { - // t.Errorf("d is %v, expected %v", d, exp) - // } -} - -func TestSetBytesRoundTripEdgeCases(t *testing.T) { - // TODO: values close to 0, close to 2^255-19, between 2^255-19 and 2^255-1, - // and between 2^255 and 2^256-1. Test both the documented SetBytes - // behavior, and that Bytes reduces them. -} - -// Tests self-consistency between Multiply and Square. -func TestConsistency(t *testing.T) { - var x Element - var x2, x2sq Element - - x = Element{1, 1, 1, 1, 1} - x2.Multiply(&x, &x) - x2sq.Square(&x) - - if x2 != x2sq { - t.Fatalf("all ones failed\nmul: %x\nsqr: %x\n", x2, x2sq) - } - - var bytes [32]byte - - _, err := io.ReadFull(rand.Reader, bytes[:]) - if err != nil { - t.Fatal(err) - } - x.SetBytes(bytes[:]) - - x2.Multiply(&x, &x) - x2sq.Square(&x) - - if x2 != x2sq { - t.Fatalf("all ones failed\nmul: %x\nsqr: %x\n", x2, x2sq) - } -} - -func TestEqual(t *testing.T) { - x := Element{1, 1, 1, 1, 1} - y := Element{5, 4, 3, 2, 1} - - eq := x.Equal(&x) - if eq != 1 { - t.Errorf("wrong about equality") - } - - eq = x.Equal(&y) - if eq != 0 { - t.Errorf("wrong about inequality") - } -} - -func TestInvert(t *testing.T) { - x := Element{1, 1, 1, 1, 1} - one := Element{1, 0, 0, 0, 0} - var xinv, r Element - - xinv.Invert(&x) - r.Multiply(&x, &xinv) - r.reduce() - - if one != r { - t.Errorf("inversion identity failed, got: %x", r) - } - - var bytes [32]byte - - _, err := io.ReadFull(rand.Reader, bytes[:]) - if err != nil { - t.Fatal(err) - } - x.SetBytes(bytes[:]) - - xinv.Invert(&x) - r.Multiply(&x, &xinv) - r.reduce() - - if one != r { - t.Errorf("random inversion identity failed, got: %x for field element %x", r, x) - } - - zero := Element{} - x.Set(&zero) - if xx := xinv.Invert(&x); xx != &xinv { - t.Errorf("inverting zero did not return the receiver") - } else if xinv.Equal(&zero) != 1 { - t.Errorf("inverting zero did not return zero") - } -} - -func TestSelectSwap(t *testing.T) { - a := Element{358744748052810, 1691584618240980, 977650209285361, 1429865912637724, 560044844278676} - b := Element{84926274344903, 473620666599931, 365590438845504, 1028470286882429, 2146499180330972} - - var c, d Element - - c.Select(&a, &b, 1) - d.Select(&a, &b, 0) - - if c.Equal(&a) != 1 || d.Equal(&b) != 1 { - t.Errorf("Select failed") - } - - c.Swap(&d, 0) - - if c.Equal(&a) != 1 || d.Equal(&b) != 1 { - t.Errorf("Swap failed") - } - - c.Swap(&d, 1) - - if c.Equal(&b) != 1 || d.Equal(&a) != 1 { - t.Errorf("Swap failed") - } -} - -func TestMult32(t *testing.T) { - mult32EquivalentToMul := func(x Element, y uint32) bool { - t1 := new(Element) - for i := 0; i < 100; i++ { - t1.Mult32(&x, y) - } - - ty := new(Element) - ty.l0 = uint64(y) - - t2 := new(Element) - for i := 0; i < 100; i++ { - t2.Multiply(&x, ty) - } - - return t1.Equal(t2) == 1 && isInBounds(t1) && isInBounds(t2) - } - - if err := quick.Check(mult32EquivalentToMul, quickCheckConfig1024); err != nil { - t.Error(err) - } -} - -func TestSqrtRatio(t *testing.T) { - // From draft-irtf-cfrg-ristretto255-decaf448-00, Appendix A.4. - type test struct { - u, v string - wasSquare int - r string - } - var tests = []test{ - // If u is 0, the function is defined to return (0, TRUE), even if v - // is zero. Note that where used in this package, the denominator v - // is never zero. - { - "0000000000000000000000000000000000000000000000000000000000000000", - "0000000000000000000000000000000000000000000000000000000000000000", - 1, "0000000000000000000000000000000000000000000000000000000000000000", - }, - // 0/1 == 0² - { - "0000000000000000000000000000000000000000000000000000000000000000", - "0100000000000000000000000000000000000000000000000000000000000000", - 1, "0000000000000000000000000000000000000000000000000000000000000000", - }, - // If u is non-zero and v is zero, defined to return (0, FALSE). - { - "0100000000000000000000000000000000000000000000000000000000000000", - "0000000000000000000000000000000000000000000000000000000000000000", - 0, "0000000000000000000000000000000000000000000000000000000000000000", - }, - // 2/1 is not square in this field. - { - "0200000000000000000000000000000000000000000000000000000000000000", - "0100000000000000000000000000000000000000000000000000000000000000", - 0, "3c5ff1b5d8e4113b871bd052f9e7bcd0582804c266ffb2d4f4203eb07fdb7c54", - }, - // 4/1 == 2² - { - "0400000000000000000000000000000000000000000000000000000000000000", - "0100000000000000000000000000000000000000000000000000000000000000", - 1, "0200000000000000000000000000000000000000000000000000000000000000", - }, - // 1/4 == (2⁻¹)² == (2^(p-2))² per Euler's theorem - { - "0100000000000000000000000000000000000000000000000000000000000000", - "0400000000000000000000000000000000000000000000000000000000000000", - 1, "f6ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff3f", - }, - } - - for i, tt := range tests { - u := new(Element).SetBytes(decodeHex(tt.u)) - v := new(Element).SetBytes(decodeHex(tt.v)) - want := new(Element).SetBytes(decodeHex(tt.r)) - got, wasSquare := new(Element).SqrtRatio(u, v) - if got.Equal(want) == 0 || wasSquare != tt.wasSquare { - t.Errorf("%d: got (%v, %v), want (%v, %v)", i, got, wasSquare, want, tt.wasSquare) - } - } -} - -func TestCarryPropagate(t *testing.T) { - asmLikeGeneric := func(a [5]uint64) bool { - t1 := &Element{a[0], a[1], a[2], a[3], a[4]} - t2 := &Element{a[0], a[1], a[2], a[3], a[4]} - - t1.carryPropagate() - t2.carryPropagateGeneric() - - if *t1 != *t2 { - t.Logf("got: %#v,\nexpected: %#v", t1, t2) - } - - return *t1 == *t2 && isInBounds(t2) - } - - if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil { - t.Error(err) - } - - if !asmLikeGeneric([5]uint64{0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}) { - t.Errorf("failed for {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}") - } -} - -func TestFeSquare(t *testing.T) { - asmLikeGeneric := func(a Element) bool { - t1 := a - t2 := a - - feSquareGeneric(&t1, &t1) - feSquare(&t2, &t2) - - if t1 != t2 { - t.Logf("got: %#v,\nexpected: %#v", t1, t2) - } - - return t1 == t2 && isInBounds(&t2) - } - - if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil { - t.Error(err) - } -} - -func TestFeMul(t *testing.T) { - asmLikeGeneric := func(a, b Element) bool { - a1 := a - a2 := a - b1 := b - b2 := b - - feMulGeneric(&a1, &a1, &b1) - feMul(&a2, &a2, &b2) - - if a1 != a2 || b1 != b2 { - t.Logf("got: %#v,\nexpected: %#v", a1, a2) - t.Logf("got: %#v,\nexpected: %#v", b1, b2) - } - - return a1 == a2 && isInBounds(&a2) && - b1 == b2 && isInBounds(&b2) - } - - if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil { - t.Error(err) - } -} - -func decodeHex(s string) []byte { - b, err := hex.DecodeString(s) - if err != nil { - panic(err) - } - return b -} diff --git a/curve25519/internal/field/sync.checkpoint b/curve25519/internal/field/sync.checkpoint deleted file mode 100644 index e3685f95ca..0000000000 --- a/curve25519/internal/field/sync.checkpoint +++ /dev/null @@ -1 +0,0 @@ -b0c49ae9f59d233526f8934262c5bbbe14d4358d diff --git a/curve25519/internal/field/sync.sh b/curve25519/internal/field/sync.sh deleted file mode 100755 index 1ba22a8b4c..0000000000 --- a/curve25519/internal/field/sync.sh +++ /dev/null @@ -1,19 +0,0 @@ -#! /bin/bash -set -euo pipefail - -cd "$(git rev-parse --show-toplevel)" - -STD_PATH=src/crypto/ed25519/internal/edwards25519/field -LOCAL_PATH=curve25519/internal/field -LAST_SYNC_REF=$(cat $LOCAL_PATH/sync.checkpoint) - -git fetch https://go.googlesource.com/go master - -if git diff --quiet $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH; then - echo "No changes." -else - NEW_REF=$(git rev-parse FETCH_HEAD | tee $LOCAL_PATH/sync.checkpoint) - echo "Applying changes from $LAST_SYNC_REF to $NEW_REF..." - git diff $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH | \ - git apply -3 --directory=$LOCAL_PATH -fi diff --git a/ed25519/ed25519.go b/ed25519/ed25519.go index a7828345fc..59b3a95a7d 100644 --- a/ed25519/ed25519.go +++ b/ed25519/ed25519.go @@ -11,9 +11,7 @@ // operations with the same key more efficient. This package refers to the RFC // 8032 private key as the “seed”. // -// Beginning with Go 1.13, the functionality of this package was moved to the -// standard library as crypto/ed25519. This package only acts as a compatibility -// wrapper. +// This package is a wrapper around the standard library crypto/ed25519 package. package ed25519 import ( diff --git a/go.mod b/go.mod index cc1cf7ebed..bef246f5ec 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module golang.org/x/crypto -go 1.18 +go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore diff --git a/hkdf/hkdf.go b/hkdf/hkdf.go index f4ded5fee2..3bee66294e 100644 --- a/hkdf/hkdf.go +++ b/hkdf/hkdf.go @@ -8,7 +8,7 @@ // HKDF is a cryptographic key derivation function (KDF) with the goal of // expanding limited input keying material into one or more cryptographically // strong secret keys. -package hkdf // import "golang.org/x/crypto/hkdf" +package hkdf import ( "crypto/hmac" diff --git a/internal/wycheproof/ecdh_stdlib_test.go b/internal/wycheproof/ecdh_stdlib_test.go index f7abd6b2e8..24a83e245b 100644 --- a/internal/wycheproof/ecdh_stdlib_test.go +++ b/internal/wycheproof/ecdh_stdlib_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build go1.20 - package wycheproof import ( diff --git a/internal/wycheproof/eddsa_test.go b/internal/wycheproof/eddsa_test.go index 9b97490955..a74f343f02 100644 --- a/internal/wycheproof/eddsa_test.go +++ b/internal/wycheproof/eddsa_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build go1.13 - package wycheproof import ( diff --git a/md4/md4.go b/md4/md4.go index d1911c2e86..7d9281e025 100644 --- a/md4/md4.go +++ b/md4/md4.go @@ -7,7 +7,7 @@ // Deprecated: MD4 is cryptographically broken and should only be used // where compatibility with legacy systems, not security, is the goal. Instead, // use a secure hash like SHA-256 (from crypto/sha256). -package md4 // import "golang.org/x/crypto/md4" +package md4 import ( "crypto" diff --git a/nacl/box/box.go b/nacl/box/box.go index 7f3b830ee2..357bdc773c 100644 --- a/nacl/box/box.go +++ b/nacl/box/box.go @@ -35,7 +35,7 @@ Anonymous sealing/opening is an extension of NaCl defined by and interoperable with libsodium: https://libsodium.gitbook.io/doc/public-key_cryptography/sealed_boxes. */ -package box // import "golang.org/x/crypto/nacl/box" +package box import ( cryptorand "crypto/rand" diff --git a/nacl/secretbox/secretbox.go b/nacl/secretbox/secretbox.go index f3c3242a04..1fe600ad03 100644 --- a/nacl/secretbox/secretbox.go +++ b/nacl/secretbox/secretbox.go @@ -32,7 +32,7 @@ chunk size. This package is interoperable with NaCl: https://nacl.cr.yp.to/secretbox.html. */ -package secretbox // import "golang.org/x/crypto/nacl/secretbox" +package secretbox import ( "golang.org/x/crypto/internal/alias" diff --git a/ocsp/ocsp.go b/ocsp/ocsp.go index bf2259537d..e6c645e7ce 100644 --- a/ocsp/ocsp.go +++ b/ocsp/ocsp.go @@ -5,7 +5,7 @@ // Package ocsp parses OCSP responses as specified in RFC 2560. OCSP responses // are signed messages attesting to the validity of a certificate for a small // period of time. This is used to manage revocation for X.509 certificates. -package ocsp // import "golang.org/x/crypto/ocsp" +package ocsp import ( "crypto" diff --git a/ocsp/ocsp_test.go b/ocsp/ocsp_test.go index a23d29bd1f..6dc273ec28 100644 --- a/ocsp/ocsp_test.go +++ b/ocsp/ocsp_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build go1.7 - package ocsp import ( diff --git a/openpgp/armor/armor.go b/openpgp/armor/armor.go index 8907183ec0..e664d127cb 100644 --- a/openpgp/armor/armor.go +++ b/openpgp/armor/armor.go @@ -10,14 +10,15 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package armor // import "golang.org/x/crypto/openpgp/armor" +package armor import ( "bufio" "bytes" "encoding/base64" - "golang.org/x/crypto/openpgp/errors" "io" + + "golang.org/x/crypto/openpgp/errors" ) // A Block represents an OpenPGP armored structure. diff --git a/openpgp/clearsign/clearsign.go b/openpgp/clearsign/clearsign.go index 644b2e078b..cea48efdcd 100644 --- a/openpgp/clearsign/clearsign.go +++ b/openpgp/clearsign/clearsign.go @@ -13,7 +13,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package clearsign // import "golang.org/x/crypto/openpgp/clearsign" +package clearsign import ( "bufio" diff --git a/openpgp/elgamal/elgamal.go b/openpgp/elgamal/elgamal.go index 743b35a120..f922bdbcaa 100644 --- a/openpgp/elgamal/elgamal.go +++ b/openpgp/elgamal/elgamal.go @@ -16,7 +16,7 @@ // https://golang.org/issue/44226), and ElGamal in the OpenPGP ecosystem has // compatibility and security issues (see https://eprint.iacr.org/2021/923). // Moreover, this package doesn't protect against side-channel attacks. -package elgamal // import "golang.org/x/crypto/openpgp/elgamal" +package elgamal import ( "crypto/rand" diff --git a/openpgp/errors/errors.go b/openpgp/errors/errors.go index 1d7a0ea05a..a328749471 100644 --- a/openpgp/errors/errors.go +++ b/openpgp/errors/errors.go @@ -9,7 +9,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package errors // import "golang.org/x/crypto/openpgp/errors" +package errors import ( "strconv" diff --git a/openpgp/packet/packet.go b/openpgp/packet/packet.go index 0a19794a8e..a84a1a214e 100644 --- a/openpgp/packet/packet.go +++ b/openpgp/packet/packet.go @@ -10,7 +10,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package packet // import "golang.org/x/crypto/openpgp/packet" +package packet import ( "bufio" diff --git a/openpgp/read.go b/openpgp/read.go index 48a8931468..cff3db9196 100644 --- a/openpgp/read.go +++ b/openpgp/read.go @@ -9,7 +9,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package openpgp // import "golang.org/x/crypto/openpgp" +package openpgp import ( "crypto" diff --git a/openpgp/s2k/s2k.go b/openpgp/s2k/s2k.go index f53244a1c7..fa1a919079 100644 --- a/openpgp/s2k/s2k.go +++ b/openpgp/s2k/s2k.go @@ -10,7 +10,7 @@ // for their specific task. If you are required to interoperate with OpenPGP // systems and need a maintained package, consider a community fork. // See https://golang.org/issue/44226. -package s2k // import "golang.org/x/crypto/openpgp/s2k" +package s2k import ( "crypto" diff --git a/otr/otr.go b/otr/otr.go index 29121e9bb6..6210c1ae61 100644 --- a/otr/otr.go +++ b/otr/otr.go @@ -8,7 +8,7 @@ // The version of OTR implemented by this package has been deprecated // (https://bugs.otr.im/lib/libotr/issues/140). An implementation of OTRv3 is // available at https://github.com/coyim/otr3. -package otr // import "golang.org/x/crypto/otr" +package otr import ( "bytes" diff --git a/pbkdf2/pbkdf2.go b/pbkdf2/pbkdf2.go index 904b57e01d..28cd99c7f3 100644 --- a/pbkdf2/pbkdf2.go +++ b/pbkdf2/pbkdf2.go @@ -16,7 +16,7 @@ Hash Functions SHA-1, SHA-224, SHA-256, SHA-384 and SHA-512 for HMAC. To choose, you can pass the `New` functions from the different SHA packages to pbkdf2.Key. */ -package pbkdf2 // import "golang.org/x/crypto/pbkdf2" +package pbkdf2 import ( "crypto/hmac" diff --git a/poly1305/poly1305_compat.go b/poly1305/poly1305_compat.go index dd975a32c9..cb9207f300 100644 --- a/poly1305/poly1305_compat.go +++ b/poly1305/poly1305_compat.go @@ -21,7 +21,7 @@ // For encryption, use the full ChaCha20-Poly1305 construction implemented by // golang.org/x/crypto/chacha20poly1305. For authentication, use a general // purpose MAC such as HMAC implemented by crypto/hmac. -package poly1305 // import "golang.org/x/crypto/poly1305" +package poly1305 import "golang.org/x/crypto/internal/poly1305" diff --git a/ripemd160/ripemd160.go b/ripemd160/ripemd160.go index cf3eeb158a..b6d33ef074 100644 --- a/ripemd160/ripemd160.go +++ b/ripemd160/ripemd160.go @@ -7,7 +7,7 @@ // Deprecated: RIPEMD-160 is a legacy hash and should not be used for new // applications. Also, this package does not and will not provide an optimized // implementation. Instead, use a modern hash like SHA-256 (from crypto/sha256). -package ripemd160 // import "golang.org/x/crypto/ripemd160" +package ripemd160 // RIPEMD-160 is designed by Hans Dobbertin, Antoon Bosselaers, and Bart // Preneel with specifications available at: diff --git a/salsa20/salsa/hsalsa20.go b/salsa20/salsa/hsalsa20.go index 3fd05b2751..3685b34458 100644 --- a/salsa20/salsa/hsalsa20.go +++ b/salsa20/salsa/hsalsa20.go @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. // Package salsa provides low-level access to functions in the Salsa family. -package salsa // import "golang.org/x/crypto/salsa20/salsa" +package salsa import "math/bits" diff --git a/salsa20/salsa20.go b/salsa20/salsa20.go index 8f4f896c70..e75c9342a8 100644 --- a/salsa20/salsa20.go +++ b/salsa20/salsa20.go @@ -19,7 +19,7 @@ This package also implements XSalsa20: a version of Salsa20 with a 24-byte nonce as specified in https://cr.yp.to/snuffle/xsalsa-20081128.pdf. Simply passing a 24-byte slice as the nonce triggers XSalsa20. */ -package salsa20 // import "golang.org/x/crypto/salsa20" +package salsa20 // TODO(agl): implement XORKeyStream12 and XORKeyStream8 - the reduced round variants of Salsa20. diff --git a/scrypt/scrypt.go b/scrypt/scrypt.go index c971a99fa6..76fa40fb20 100644 --- a/scrypt/scrypt.go +++ b/scrypt/scrypt.go @@ -5,7 +5,7 @@ // Package scrypt implements the scrypt key derivation function as defined in // Colin Percival's paper "Stronger Key Derivation via Sequential Memory-Hard // Functions" (https://www.tarsnap.com/scrypt/scrypt.pdf). -package scrypt // import "golang.org/x/crypto/scrypt" +package scrypt import ( "crypto/sha256" diff --git a/sha3/doc.go b/sha3/doc.go index decd8cf9bf..7e02309070 100644 --- a/sha3/doc.go +++ b/sha3/doc.go @@ -59,4 +59,4 @@ // They produce output of the same length, with the same security strengths // against all attacks. This means, in particular, that SHA3-256 only has // 128-bit collision resistance, because its output length is 32 bytes. -package sha3 // import "golang.org/x/crypto/sha3" +package sha3 diff --git a/sha3/hashes.go b/sha3/hashes.go index 5eae6cb922..c544b29e5f 100644 --- a/sha3/hashes.go +++ b/sha3/hashes.go @@ -9,6 +9,7 @@ package sha3 // bytes. import ( + "crypto" "hash" ) @@ -40,6 +41,13 @@ func New512() hash.Hash { return new512() } +func init() { + crypto.RegisterHash(crypto.SHA3_224, New224) + crypto.RegisterHash(crypto.SHA3_256, New256) + crypto.RegisterHash(crypto.SHA3_384, New384) + crypto.RegisterHash(crypto.SHA3_512, New512) +} + func new224Generic() *state { return &state{rate: 144, outputLen: 28, dsbyte: 0x06} } diff --git a/sha3/register.go b/sha3/register.go deleted file mode 100644 index addfd5049b..0000000000 --- a/sha3/register.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.4 - -package sha3 - -import ( - "crypto" -) - -func init() { - crypto.RegisterHash(crypto.SHA3_224, New224) - crypto.RegisterHash(crypto.SHA3_256, New256) - crypto.RegisterHash(crypto.SHA3_384, New384) - crypto.RegisterHash(crypto.SHA3_512, New512) -} diff --git a/ssh/agent/client.go b/ssh/agent/client.go index fecba8eb38..106708d289 100644 --- a/ssh/agent/client.go +++ b/ssh/agent/client.go @@ -10,7 +10,7 @@ // References: // // [PROTOCOL.agent]: https://tools.ietf.org/html/draft-miller-ssh-agent-00 -package agent // import "golang.org/x/crypto/ssh/agent" +package agent import ( "bytes" diff --git a/ssh/doc.go b/ssh/doc.go index edbe63340d..f5d352fe3a 100644 --- a/ssh/doc.go +++ b/ssh/doc.go @@ -20,4 +20,4 @@ References: This package does not fall under the stability promise of the Go language itself, so its API may be changed when pressing needs arise. */ -package ssh // import "golang.org/x/crypto/ssh" +package ssh diff --git a/ssh/test/doc.go b/ssh/test/doc.go index 198f0ca1e2..444b29966b 100644 --- a/ssh/test/doc.go +++ b/ssh/test/doc.go @@ -4,4 +4,4 @@ // Package test contains integration tests for the // golang.org/x/crypto/ssh package. -package test // import "golang.org/x/crypto/ssh/test" +package test diff --git a/ssh/testdata/doc.go b/ssh/testdata/doc.go index fcae47ca68..ae7bd8b89e 100644 --- a/ssh/testdata/doc.go +++ b/ssh/testdata/doc.go @@ -5,4 +5,4 @@ // This package contains test data shared between the various subpackages of // the golang.org/x/crypto/ssh package. Under no circumstance should // this data be used for production code. -package testdata // import "golang.org/x/crypto/ssh/testdata" +package testdata diff --git a/twofish/twofish.go b/twofish/twofish.go index e4eeae17f4..6d0a3028d3 100644 --- a/twofish/twofish.go +++ b/twofish/twofish.go @@ -9,7 +9,7 @@ // implementation. Instead, use AES (from crypto/aes, if necessary in an AEAD // mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package twofish // import "golang.org/x/crypto/twofish" +package twofish // Twofish is defined in https://www.schneier.com/paper-twofish-paper.pdf [TWOFISH] diff --git a/xtea/cipher.go b/xtea/cipher.go index a4c2fd02b3..7b4f8aaa6b 100644 --- a/xtea/cipher.go +++ b/xtea/cipher.go @@ -12,7 +12,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package xtea // import "golang.org/x/crypto/xtea" +package xtea // For details, see http://www.cix.co.uk/~klockstone/xtea.pdf diff --git a/xts/xts.go b/xts/xts.go index 8c16a83014..d64f536f9d 100644 --- a/xts/xts.go +++ b/xts/xts.go @@ -21,7 +21,7 @@ // // Note that XTS is usually not appropriate for any use besides disk encryption. // Most users should use an AEAD mode like GCM (from crypto/cipher.NewGCM) instead. -package xts // import "golang.org/x/crypto/xts" +package xts import ( "crypto/cipher" From 9fadb0b165bd3b96d2a21e89d60ad458db3aeee0 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Thu, 4 Jul 2024 19:17:23 +0000 Subject: [PATCH 21/57] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I0024d2803498fe98af80f14a4476a504c9aa6b5b Reviewed-on: https://go-review.googlesource.com/c/crypto/+/596835 Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Reviewed-by: Than McIntosh Auto-Submit: Gopher Robot --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index bef246f5ec..7e3b4eca74 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.21.0 - golang.org/x/term v0.21.0 + golang.org/x/sys v0.22.0 + golang.org/x/term v0.22.0 ) require golang.org/x/text v0.16.0 // indirect diff --git a/go.sum b/go.sum index b694c49e61..32e3e58b33 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= +golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= From d66d9c31b4ae80d173d1187a9e40c188788dbdbc Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 8 Jul 2024 16:01:01 +0000 Subject: [PATCH 22/57] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: I552ff9800e32294b25cc04ccc8fca3404ae3b93c Reviewed-on: https://go-review.googlesource.com/c/crypto/+/597095 Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Carlos Amedee Reviewed-by: Roland Shoemaker --- x509roots/fallback/bundle.go | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index e56011afa0..35d7c1c550 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -1578,40 +1578,6 @@ XR4EzzffHqhmsYzmIGrv/EhOdJhCrylvLmrH+33RZjEizIYAfmaDDEL0vTSSwxrq T8p+ck0LcIymSLumoRT2+1hEmRSuqguTaaApJUqlyyvdimYHFngVV3Eb7PVHhPOe MTd61X8kreS8/f3MboPoDKi3QWwH3b08hpcv0g== -----END CERTIFICATE----- -# CN=GLOBALTRUST 2020,O=e-commerce monitoring GmbH,C=AT -# 9a296a5182d1d451a2e37f439b74daafa267523329f90f9a0d2007c334e23c9a ------BEGIN CERTIFICATE----- -MIIFgjCCA2qgAwIBAgILWku9WvtPilv6ZeUwDQYJKoZIhvcNAQELBQAwTTELMAkG -A1UEBhMCQVQxIzAhBgNVBAoTGmUtY29tbWVyY2UgbW9uaXRvcmluZyBHbWJIMRkw -FwYDVQQDExBHTE9CQUxUUlVTVCAyMDIwMB4XDTIwMDIxMDAwMDAwMFoXDTQwMDYx -MDAwMDAwMFowTTELMAkGA1UEBhMCQVQxIzAhBgNVBAoTGmUtY29tbWVyY2UgbW9u -aXRvcmluZyBHbWJIMRkwFwYDVQQDExBHTE9CQUxUUlVTVCAyMDIwMIICIjANBgkq -hkiG9w0BAQEFAAOCAg8AMIICCgKCAgEAri5WrRsc7/aVj6B3GyvTY4+ETUWiD59b -RatZe1E0+eyLinjF3WuvvcTfk0Uev5E4C64OFudBc/jbu9G4UeDLgztzOG53ig9Z -YybNpyrOVPu44sB8R85gfD+yc/LAGbaKkoc1DZAoouQVBGM+uq/ufF7MpotQsjj3 -QWPKzv9pj2gOlTblzLmMCcpL3TGQlsjMH/1WljTbjhzqLL6FLmPdqqmV0/0plRPw -yJiT2S0WR5ARg6I6IqIoV6Lr/sCMKKCmfecqQjuCgGOlYx8ZzHyyZqjC0203b+J+ -BlHZRYQfEs4kUmSFC0iAToexIiIwquuuvuAC4EDosEKAA1GqtH6qRNdDYfOiaxaJ -SaSjpCuKAsR49GiKweR6NrFvG5Ybd0mN1MkGco/PU+PcF4UgStyYJ9ORJitHHmkH -r96i5OTUawuzXnzUJIBHKWk7buis/UDr2O1xcSvy6Fgd60GXIsUf1DnQJ4+H4xj0 -4KlGDfV0OoIu0G4skaMxXDtG6nsEEFZegB31pWXogvziB4xiRfUg3kZwhqG8k9Me -dKZssCz3AwyIDMvUclOGvGBG85hqwvG/Q/lwIHfKN0F5VVJjjVsSn8VoxIidrPIw -q7ejMZdnrY8XD2zHc+0klGvIg5rQmjdJBKuxFshsSUktq6HQjJLyQUp5ISXbY9e2 -nKd+Qmn7OmMCAwEAAaNjMGEwDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMC -AQYwHQYDVR0OBBYEFNwuH9FhN3nkq9XVsxJxaD1qaJwiMB8GA1UdIwQYMBaAFNwu -H9FhN3nkq9XVsxJxaD1qaJwiMA0GCSqGSIb3DQEBCwUAA4ICAQCR8EICaEDuw2jA -VC/f7GLDw56KoDEoqoOOpFaWEhCGVrqXctJUMHytGdUdaG/7FELYjQ7ztdGl4wJC -XtzoRlgHNQIw4Lx0SsFDKv/bGtCwr2zD/cuz9X9tAy5ZVp0tLTWMstZDFyySCstd -6IwPS3BD0IL/qMy/pJTAvoe9iuOTe8aPmxadJ2W8esVCgmxcB9CpwYhgROmYhRZf -+I/KARDOJcP5YBugxZfD0yyIMaK9MOzQ0MAS8cE54+X1+NZK3TTN+2/BT+MAi1bi -kvcoskJ3ciNnxz8RFbLEAwW+uxF7Cr+obuf/WEPPm2eggAe2HcqtbepBEX4tdJP7 -wry+UUTF72glJ4DjyKDUEuzZpTcdN3y0kcra1LGWge9oXHYQSa9+pTeAsRxSvTOB -TI/53WXZFM2KJVj04sWDpQmQ1GwUY7VA3+vA/MRYfg0UFodUJ25W5HCEuGwyEn6C -MUO+1918oa2u1qsgEu8KwxCMSZY13At1XrFP1U80DhEgB3VDRemjEdqso5nCtnkn -4rnvyOL2NSl6dPrFf4IFYqYK6miyeUcGbvJXqBUzxvd4Sj1Ce2t+/vdG6tHrju+I -aFvowdlxfv1k7/9nR4hYJS8+hge9+6jlgqispdNpQ80xiEmEU5LAsTkbOYMBMMTy -qfrQA71yN2BWHzZ8vTmR9W0Nv3vXkg== ------END CERTIFICATE----- # CN=GTS Root R1,O=Google Trust Services LLC,C=US # d947432abde7b7fa90fc2e6b59101b1280e0e1c7e4e40fa3c6887fff57a7f4cf -----BEGIN CERTIFICATE----- From f2bc3a617ab63149578b696bd2db66ae298b3ba1 Mon Sep 17 00:00:00 2001 From: Dmitri Shuralyov Date: Tue, 16 Jul 2024 10:13:29 -0400 Subject: [PATCH 23/57] x509roots/fallback/internal/goissue52287: delete By now Go 1.19 isn't supported, so there's no need to work around go.dev/issue/52287 in this module anymore. For golang/go#57792. For golang/go#52287. Change-Id: I3999cdb9ca419a2ab897c9143a4ec31f59da7d80 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/598495 TryBot-Result: Gopher Robot Run-TryBot: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Auto-Submit: Dmitri Shuralyov Reviewed-by: Roland Shoemaker --- x509roots/fallback/internal/goissue52287/goissue52287.go | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 x509roots/fallback/internal/goissue52287/goissue52287.go diff --git a/x509roots/fallback/internal/goissue52287/goissue52287.go b/x509roots/fallback/internal/goissue52287/goissue52287.go deleted file mode 100644 index d946a527db..0000000000 --- a/x509roots/fallback/internal/goissue52287/goissue52287.go +++ /dev/null @@ -1,8 +0,0 @@ -// Copyright 2023 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package goissue52287 is an empty internal package. -// It exists only to work around go.dev/issue/52287 and -// can be removed after Go 1.19 stops being supported. -package goissue52287 From 80fd97208db0a6f1c2dccfc63ccde57b4e994875 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 16 Jul 2024 11:35:18 -0400 Subject: [PATCH 24/57] LICENSE: update per Google Legal Very minor tweaks: - Remove (c) pseudosymbol. - Remove "All Rights Reserved." - Change "Google Inc." (no longer exists) to "Google LLC". [git-generate] echo ' ,s/\(c\) // ,s/ All rights reserved.// ,s/Google Inc./Google LLC/ w q ' | sam -d LICENSE Change-Id: I6e885650c5701597f57dbf00c2abdcc7b393a703 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/598520 LUCI-TryBot-Result: Go LUCI Reviewed-by: Ian Lance Taylor Auto-Submit: Russ Cox --- LICENSE | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index 6a66aea5ea..2a7cf70da6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. From e983fa27418787af5c51d172b508cd85bc6644d0 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Mon, 24 Jun 2024 18:56:28 -0400 Subject: [PATCH 25/57] sha3: Avo port of keccakf_amd64.s This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: BASE="d66d9c31b4ae80d173d1187a9e40c188788dbdbc" go tool asm -o /dev/null -debug \ <(git cat-file -p "$BASE":sha3/keccakf_amd64.s) \ > /tmp/reference.s go tool asm -o /dev/null -debug \ sha3/keccakf_amd64.s \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: I1c0ea516531355263b83d3b66a37df090e293cea Reviewed-on: https://go-review.googlesource.com/c/crypto/+/594655 Reviewed-by: Cherry Mui Reviewed-by: Filippo Valsorda Reviewed-by: Russell Webb Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Auto-Submit: Filippo Valsorda --- sha3/_asm/go.mod | 15 + sha3/_asm/go.sum | 12 + sha3/_asm/keccakf_amd64_asm.go | 438 +++ sha3/keccakf_amd64.s | 5787 +++++++++++++++++++++++++++++--- 4 files changed, 5873 insertions(+), 379 deletions(-) create mode 100644 sha3/_asm/go.mod create mode 100644 sha3/_asm/go.sum create mode 100644 sha3/_asm/keccakf_amd64_asm.go diff --git a/sha3/_asm/go.mod b/sha3/_asm/go.mod new file mode 100644 index 0000000000..265a88d077 --- /dev/null +++ b/sha3/_asm/go.mod @@ -0,0 +1,15 @@ +module sha3/_asm + +go 1.22 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.25.0 +) + +require ( + golang.org/x/mod v0.19.0 // indirect + golang.org/x/sync v0.7.0 // indirect + golang.org/x/sys v0.22.0 // indirect + golang.org/x/tools v0.23.0 // indirect +) diff --git a/sha3/_asm/go.sum b/sha3/_asm/go.sum new file mode 100644 index 0000000000..a2552b8eb9 --- /dev/null +++ b/sha3/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= +golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= +golang.org/x/mod v0.19.0 h1:fEdghXQSo20giMthA7cd28ZC+jts4amQ3YMXiP5oMQ8= +golang.org/x/mod v0.19.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.23.0 h1:SGsXPZ+2l4JsgaCKkx+FQ9YZ5XEtA1GZYuoDjenLjvg= +golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI= diff --git a/sha3/_asm/keccakf_amd64_asm.go b/sha3/_asm/keccakf_amd64_asm.go new file mode 100644 index 0000000000..78e931f757 --- /dev/null +++ b/sha3/_asm/keccakf_amd64_asm.go @@ -0,0 +1,438 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This code was translated into a form compatible with 6a from the public +// domain sources at https://github.com/gvanas/KeccakCodePackage + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/sha3" +) + +//go:generate go run . -out ../keccakf_amd64.s -pkg sha3 + +// Round Constants for use in the ι step. +var RoundConstants = [24]uint64{ + 0x0000000000000001, + 0x0000000000008082, + 0x800000000000808A, + 0x8000000080008000, + 0x000000000000808B, + 0x0000000080000001, + 0x8000000080008081, + 0x8000000000008009, + 0x000000000000008A, + 0x0000000000000088, + 0x0000000080008009, + 0x000000008000000A, + 0x000000008000808B, + 0x800000000000008B, + 0x8000000000008089, + 0x8000000000008003, + 0x8000000000008002, + 0x8000000000000080, + 0x000000000000800A, + 0x800000008000000A, + 0x8000000080008081, + 0x8000000000008080, + 0x0000000080000001, + 0x8000000080008008, +} + +var ( + // Temporary registers + rT1 GPPhysical = RAX + + // Round vars + rpState = Mem{Base: RDI} + rpStack = Mem{Base: RSP} + + rDa = RBX + rDe = RCX + rDi = RDX + rDo = R8 + rDu = R9 + + rBa = R10 + rBe = R11 + rBi = R12 + rBo = R13 + rBu = R14 + + rCa = RSI + rCe = RBP + rCi = rBi + rCo = rBo + rCu = R15 +) + +const ( + _ba = iota * 8 + _be + _bi + _bo + _bu + _ga + _ge + _gi + _go + _gu + _ka + _ke + _ki + _ko + _ku + _ma + _me + _mi + _mo + _mu + _sa + _se + _si + _so + _su +) + +func main() { + Package("golang.org/x/crypto/sha3") + ConstraintExpr("amd64,!purego,gc") + keccakF1600() + Generate() +} + +func MOVQ_RBI_RCE() { MOVQ(rBi, rCe) } +func XORQ_RT1_RCA() { XORQ(rT1, rCa) } +func XORQ_RT1_RCE() { XORQ(rT1, rCe) } +func XORQ_RBA_RCU() { XORQ(rBa, rCu) } +func XORQ_RBE_RCU() { XORQ(rBe, rCu) } +func XORQ_RDU_RCU() { XORQ(rDu, rCu) } +func XORQ_RDA_RCA() { XORQ(rDa, rCa) } +func XORQ_RDE_RCE() { XORQ(rDe, rCe) } + +type ArgMacro func() + +func mKeccakRound( + iState, oState Mem, + rc U64, + B_RBI_RCE, G_RT1_RCA, G_RT1_RCE, G_RBA_RCU, + K_RT1_RCA, K_RT1_RCE, K_RBA_RCU, M_RT1_RCA, + M_RT1_RCE, M_RBE_RCU, S_RDU_RCU, S_RDA_RCA, + S_RDE_RCE ArgMacro, +) { + Comment("Prepare round") + MOVQ(rCe, rDa) + ROLQ(Imm(1), rDa) + + MOVQ(iState.Offset(_bi), rCi) + XORQ(iState.Offset(_gi), rDi) + XORQ(rCu, rDa) + XORQ(iState.Offset(_ki), rCi) + XORQ(iState.Offset(_mi), rDi) + XORQ(rDi, rCi) + + MOVQ(rCi, rDe) + ROLQ(Imm(1), rDe) + + MOVQ(iState.Offset(_bo), rCo) + XORQ(iState.Offset(_go), rDo) + XORQ(rCa, rDe) + XORQ(iState.Offset(_ko), rCo) + XORQ(iState.Offset(_mo), rDo) + XORQ(rDo, rCo) + + MOVQ(rCo, rDi) + ROLQ(Imm(1), rDi) + + MOVQ(rCu, rDo) + XORQ(rCe, rDi) + ROLQ(Imm(1), rDo) + + MOVQ(rCa, rDu) + XORQ(rCi, rDo) + ROLQ(Imm(1), rDu) + + Comment("Result b") + MOVQ(iState.Offset(_ba), rBa) + MOVQ(iState.Offset(_ge), rBe) + XORQ(rCo, rDu) + MOVQ(iState.Offset(_ki), rBi) + MOVQ(iState.Offset(_mo), rBo) + MOVQ(iState.Offset(_su), rBu) + XORQ(rDe, rBe) + ROLQ(Imm(44), rBe) + XORQ(rDi, rBi) + XORQ(rDa, rBa) + ROLQ(Imm(43), rBi) + + MOVQ(rBe, rCa) + MOVQ(rc, rT1) + ORQ(rBi, rCa) + XORQ(rBa, rT1) + XORQ(rT1, rCa) + MOVQ(rCa, oState.Offset(_ba)) + + XORQ(rDu, rBu) + ROLQ(Imm(14), rBu) + MOVQ(rBa, rCu) + ANDQ(rBe, rCu) + XORQ(rBu, rCu) + MOVQ(rCu, oState.Offset(_bu)) + + XORQ(rDo, rBo) + ROLQ(Imm(21), rBo) + MOVQ(rBo, rT1) + ANDQ(rBu, rT1) + XORQ(rBi, rT1) + MOVQ(rT1, oState.Offset(_bi)) + + NOTQ(rBi) + ORQ(rBa, rBu) + ORQ(rBo, rBi) + XORQ(rBo, rBu) + XORQ(rBe, rBi) + MOVQ(rBu, oState.Offset(_bo)) + MOVQ(rBi, oState.Offset(_be)) + B_RBI_RCE() + + Comment("Result g") + MOVQ(iState.Offset(_gu), rBe) + XORQ(rDu, rBe) + MOVQ(iState.Offset(_ka), rBi) + ROLQ(Imm(20), rBe) + XORQ(rDa, rBi) + ROLQ(Imm(3), rBi) + MOVQ(iState.Offset(_bo), rBa) + MOVQ(rBe, rT1) + ORQ(rBi, rT1) + XORQ(rDo, rBa) + MOVQ(iState.Offset(_me), rBo) + MOVQ(iState.Offset(_si), rBu) + ROLQ(Imm(28), rBa) + XORQ(rBa, rT1) + MOVQ(rT1, oState.Offset(_ga)) + G_RT1_RCA() + + XORQ(rDe, rBo) + ROLQ(Imm(45), rBo) + MOVQ(rBi, rT1) + ANDQ(rBo, rT1) + XORQ(rBe, rT1) + MOVQ(rT1, oState.Offset(_ge)) + G_RT1_RCE() + + XORQ(rDi, rBu) + ROLQ(Imm(61), rBu) + MOVQ(rBu, rT1) + ORQ(rBa, rT1) + XORQ(rBo, rT1) + MOVQ(rT1, oState.Offset(_go)) + + ANDQ(rBe, rBa) + XORQ(rBu, rBa) + MOVQ(rBa, oState.Offset(_gu)) + NOTQ(rBu) + G_RBA_RCU() + + ORQ(rBu, rBo) + XORQ(rBi, rBo) + MOVQ(rBo, oState.Offset(_gi)) + + Comment("Result k") + MOVQ(iState.Offset(_be), rBa) + MOVQ(iState.Offset(_gi), rBe) + MOVQ(iState.Offset(_ko), rBi) + MOVQ(iState.Offset(_mu), rBo) + MOVQ(iState.Offset(_sa), rBu) + XORQ(rDi, rBe) + ROLQ(Imm(6), rBe) + XORQ(rDo, rBi) + ROLQ(Imm(25), rBi) + MOVQ(rBe, rT1) + ORQ(rBi, rT1) + XORQ(rDe, rBa) + ROLQ(Imm(1), rBa) + XORQ(rBa, rT1) + MOVQ(rT1, oState.Offset(_ka)) + K_RT1_RCA() + + XORQ(rDu, rBo) + ROLQ(Imm(8), rBo) + MOVQ(rBi, rT1) + ANDQ(rBo, rT1) + XORQ(rBe, rT1) + MOVQ(rT1, oState.Offset(_ke)) + K_RT1_RCE() + + XORQ(rDa, rBu) + ROLQ(Imm(18), rBu) + NOTQ(rBo) + MOVQ(rBo, rT1) + ANDQ(rBu, rT1) + XORQ(rBi, rT1) + MOVQ(rT1, oState.Offset(_ki)) + + MOVQ(rBu, rT1) + ORQ(rBa, rT1) + XORQ(rBo, rT1) + MOVQ(rT1, oState.Offset(_ko)) + + ANDQ(rBe, rBa) + XORQ(rBu, rBa) + MOVQ(rBa, oState.Offset(_ku)) + K_RBA_RCU() + + Comment("Result m") + MOVQ(iState.Offset(_ga), rBe) + XORQ(rDa, rBe) + MOVQ(iState.Offset(_ke), rBi) + ROLQ(Imm(36), rBe) + XORQ(rDe, rBi) + MOVQ(iState.Offset(_bu), rBa) + ROLQ(Imm(10), rBi) + MOVQ(rBe, rT1) + MOVQ(iState.Offset(_mi), rBo) + ANDQ(rBi, rT1) + XORQ(rDu, rBa) + MOVQ(iState.Offset(_so), rBu) + ROLQ(Imm(27), rBa) + XORQ(rBa, rT1) + MOVQ(rT1, oState.Offset(_ma)) + M_RT1_RCA() + + XORQ(rDi, rBo) + ROLQ(Imm(15), rBo) + MOVQ(rBi, rT1) + ORQ(rBo, rT1) + XORQ(rBe, rT1) + MOVQ(rT1, oState.Offset(_me)) + M_RT1_RCE() + + XORQ(rDo, rBu) + ROLQ(Imm(56), rBu) + NOTQ(rBo) + MOVQ(rBo, rT1) + ORQ(rBu, rT1) + XORQ(rBi, rT1) + MOVQ(rT1, oState.Offset(_mi)) + + ORQ(rBa, rBe) + XORQ(rBu, rBe) + MOVQ(rBe, oState.Offset(_mu)) + + ANDQ(rBa, rBu) + XORQ(rBo, rBu) + MOVQ(rBu, oState.Offset(_mo)) + M_RBE_RCU() + + Comment("Result s") + MOVQ(iState.Offset(_bi), rBa) + MOVQ(iState.Offset(_go), rBe) + MOVQ(iState.Offset(_ku), rBi) + XORQ(rDi, rBa) + MOVQ(iState.Offset(_ma), rBo) + ROLQ(Imm(62), rBa) + XORQ(rDo, rBe) + MOVQ(iState.Offset(_se), rBu) + ROLQ(Imm(55), rBe) + + XORQ(rDu, rBi) + MOVQ(rBa, rDu) + XORQ(rDe, rBu) + ROLQ(Imm(2), rBu) + ANDQ(rBe, rDu) + XORQ(rBu, rDu) + MOVQ(rDu, oState.Offset(_su)) + + ROLQ(Imm(39), rBi) + S_RDU_RCU() + NOTQ(rBe) + XORQ(rDa, rBo) + MOVQ(rBe, rDa) + ANDQ(rBi, rDa) + XORQ(rBa, rDa) + MOVQ(rDa, oState.Offset(_sa)) + S_RDA_RCA() + + ROLQ(Imm(41), rBo) + MOVQ(rBi, rDe) + ORQ(rBo, rDe) + XORQ(rBe, rDe) + MOVQ(rDe, oState.Offset(_se)) + S_RDE_RCE() + + MOVQ(rBo, rDi) + MOVQ(rBu, rDo) + ANDQ(rBu, rDi) + ORQ(rBa, rDo) + XORQ(rBi, rDi) + XORQ(rBo, rDo) + MOVQ(rDi, oState.Offset(_si)) + MOVQ(rDo, oState.Offset(_so)) +} + +// keccakF1600 applies the Keccak permutation to a 1600b-wide +// state represented as a slice of 25 uint64s. +func keccakF1600() { + Implement("keccakF1600") + AllocLocal(200) + + Load(Param("a"), rpState.Base) + + Comment("Convert the user state into an internal state") + NOTQ(rpState.Offset(_be)) + NOTQ(rpState.Offset(_bi)) + NOTQ(rpState.Offset(_go)) + NOTQ(rpState.Offset(_ki)) + NOTQ(rpState.Offset(_mi)) + NOTQ(rpState.Offset(_sa)) + + Comment("Execute the KeccakF permutation") + MOVQ(rpState.Offset(_ba), rCa) + MOVQ(rpState.Offset(_be), rCe) + MOVQ(rpState.Offset(_bu), rCu) + + XORQ(rpState.Offset(_ga), rCa) + XORQ(rpState.Offset(_ge), rCe) + XORQ(rpState.Offset(_gu), rCu) + + XORQ(rpState.Offset(_ka), rCa) + XORQ(rpState.Offset(_ke), rCe) + XORQ(rpState.Offset(_ku), rCu) + + XORQ(rpState.Offset(_ma), rCa) + XORQ(rpState.Offset(_me), rCe) + XORQ(rpState.Offset(_mu), rCu) + + XORQ(rpState.Offset(_sa), rCa) + XORQ(rpState.Offset(_se), rCe) + MOVQ(rpState.Offset(_si), rDi) + MOVQ(rpState.Offset(_so), rDo) + XORQ(rpState.Offset(_su), rCu) + + for i, rc := range RoundConstants[:len(RoundConstants)-1] { + var iState, oState Mem + if i%2 == 0 { + iState, oState = rpState, rpStack + } else { + iState, oState = rpStack, rpState + } + mKeccakRound(iState, oState, U64(rc), MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) + } + mKeccakRound(rpStack, rpState, U64(RoundConstants[len(RoundConstants)-1]), NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP) + + Comment("Revert the internal state to the user state") + NOTQ(rpState.Offset(_be)) + NOTQ(rpState.Offset(_bi)) + NOTQ(rpState.Offset(_go)) + NOTQ(rpState.Offset(_ki)) + NOTQ(rpState.Offset(_mi)) + NOTQ(rpState.Offset(_sa)) + + RET() +} diff --git a/sha3/keccakf_amd64.s b/sha3/keccakf_amd64.s index 1f53938861..99e2f16e97 100644 --- a/sha3/keccakf_amd64.s +++ b/sha3/keccakf_amd64.s @@ -1,390 +1,5419 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run keccakf_amd64_asm.go -out ../keccakf_amd64.s -pkg sha3. DO NOT EDIT. //go:build amd64 && !purego && gc -// This code was translated into a form compatible with 6a from the public -// domain sources at https://github.com/gvanas/KeccakCodePackage - -// Offsets in state -#define _ba (0*8) -#define _be (1*8) -#define _bi (2*8) -#define _bo (3*8) -#define _bu (4*8) -#define _ga (5*8) -#define _ge (6*8) -#define _gi (7*8) -#define _go (8*8) -#define _gu (9*8) -#define _ka (10*8) -#define _ke (11*8) -#define _ki (12*8) -#define _ko (13*8) -#define _ku (14*8) -#define _ma (15*8) -#define _me (16*8) -#define _mi (17*8) -#define _mo (18*8) -#define _mu (19*8) -#define _sa (20*8) -#define _se (21*8) -#define _si (22*8) -#define _so (23*8) -#define _su (24*8) - -// Temporary registers -#define rT1 AX - -// Round vars -#define rpState DI -#define rpStack SP - -#define rDa BX -#define rDe CX -#define rDi DX -#define rDo R8 -#define rDu R9 - -#define rBa R10 -#define rBe R11 -#define rBi R12 -#define rBo R13 -#define rBu R14 - -#define rCa SI -#define rCe BP -#define rCi rBi -#define rCo rBo -#define rCu R15 - -#define MOVQ_RBI_RCE MOVQ rBi, rCe -#define XORQ_RT1_RCA XORQ rT1, rCa -#define XORQ_RT1_RCE XORQ rT1, rCe -#define XORQ_RBA_RCU XORQ rBa, rCu -#define XORQ_RBE_RCU XORQ rBe, rCu -#define XORQ_RDU_RCU XORQ rDu, rCu -#define XORQ_RDA_RCA XORQ rDa, rCa -#define XORQ_RDE_RCE XORQ rDe, rCe - -#define mKeccakRound(iState, oState, rc, B_RBI_RCE, G_RT1_RCA, G_RT1_RCE, G_RBA_RCU, K_RT1_RCA, K_RT1_RCE, K_RBA_RCU, M_RT1_RCA, M_RT1_RCE, M_RBE_RCU, S_RDU_RCU, S_RDA_RCA, S_RDE_RCE) \ - /* Prepare round */ \ - MOVQ rCe, rDa; \ - ROLQ $1, rDa; \ - \ - MOVQ _bi(iState), rCi; \ - XORQ _gi(iState), rDi; \ - XORQ rCu, rDa; \ - XORQ _ki(iState), rCi; \ - XORQ _mi(iState), rDi; \ - XORQ rDi, rCi; \ - \ - MOVQ rCi, rDe; \ - ROLQ $1, rDe; \ - \ - MOVQ _bo(iState), rCo; \ - XORQ _go(iState), rDo; \ - XORQ rCa, rDe; \ - XORQ _ko(iState), rCo; \ - XORQ _mo(iState), rDo; \ - XORQ rDo, rCo; \ - \ - MOVQ rCo, rDi; \ - ROLQ $1, rDi; \ - \ - MOVQ rCu, rDo; \ - XORQ rCe, rDi; \ - ROLQ $1, rDo; \ - \ - MOVQ rCa, rDu; \ - XORQ rCi, rDo; \ - ROLQ $1, rDu; \ - \ - /* Result b */ \ - MOVQ _ba(iState), rBa; \ - MOVQ _ge(iState), rBe; \ - XORQ rCo, rDu; \ - MOVQ _ki(iState), rBi; \ - MOVQ _mo(iState), rBo; \ - MOVQ _su(iState), rBu; \ - XORQ rDe, rBe; \ - ROLQ $44, rBe; \ - XORQ rDi, rBi; \ - XORQ rDa, rBa; \ - ROLQ $43, rBi; \ - \ - MOVQ rBe, rCa; \ - MOVQ rc, rT1; \ - ORQ rBi, rCa; \ - XORQ rBa, rT1; \ - XORQ rT1, rCa; \ - MOVQ rCa, _ba(oState); \ - \ - XORQ rDu, rBu; \ - ROLQ $14, rBu; \ - MOVQ rBa, rCu; \ - ANDQ rBe, rCu; \ - XORQ rBu, rCu; \ - MOVQ rCu, _bu(oState); \ - \ - XORQ rDo, rBo; \ - ROLQ $21, rBo; \ - MOVQ rBo, rT1; \ - ANDQ rBu, rT1; \ - XORQ rBi, rT1; \ - MOVQ rT1, _bi(oState); \ - \ - NOTQ rBi; \ - ORQ rBa, rBu; \ - ORQ rBo, rBi; \ - XORQ rBo, rBu; \ - XORQ rBe, rBi; \ - MOVQ rBu, _bo(oState); \ - MOVQ rBi, _be(oState); \ - B_RBI_RCE; \ - \ - /* Result g */ \ - MOVQ _gu(iState), rBe; \ - XORQ rDu, rBe; \ - MOVQ _ka(iState), rBi; \ - ROLQ $20, rBe; \ - XORQ rDa, rBi; \ - ROLQ $3, rBi; \ - MOVQ _bo(iState), rBa; \ - MOVQ rBe, rT1; \ - ORQ rBi, rT1; \ - XORQ rDo, rBa; \ - MOVQ _me(iState), rBo; \ - MOVQ _si(iState), rBu; \ - ROLQ $28, rBa; \ - XORQ rBa, rT1; \ - MOVQ rT1, _ga(oState); \ - G_RT1_RCA; \ - \ - XORQ rDe, rBo; \ - ROLQ $45, rBo; \ - MOVQ rBi, rT1; \ - ANDQ rBo, rT1; \ - XORQ rBe, rT1; \ - MOVQ rT1, _ge(oState); \ - G_RT1_RCE; \ - \ - XORQ rDi, rBu; \ - ROLQ $61, rBu; \ - MOVQ rBu, rT1; \ - ORQ rBa, rT1; \ - XORQ rBo, rT1; \ - MOVQ rT1, _go(oState); \ - \ - ANDQ rBe, rBa; \ - XORQ rBu, rBa; \ - MOVQ rBa, _gu(oState); \ - NOTQ rBu; \ - G_RBA_RCU; \ - \ - ORQ rBu, rBo; \ - XORQ rBi, rBo; \ - MOVQ rBo, _gi(oState); \ - \ - /* Result k */ \ - MOVQ _be(iState), rBa; \ - MOVQ _gi(iState), rBe; \ - MOVQ _ko(iState), rBi; \ - MOVQ _mu(iState), rBo; \ - MOVQ _sa(iState), rBu; \ - XORQ rDi, rBe; \ - ROLQ $6, rBe; \ - XORQ rDo, rBi; \ - ROLQ $25, rBi; \ - MOVQ rBe, rT1; \ - ORQ rBi, rT1; \ - XORQ rDe, rBa; \ - ROLQ $1, rBa; \ - XORQ rBa, rT1; \ - MOVQ rT1, _ka(oState); \ - K_RT1_RCA; \ - \ - XORQ rDu, rBo; \ - ROLQ $8, rBo; \ - MOVQ rBi, rT1; \ - ANDQ rBo, rT1; \ - XORQ rBe, rT1; \ - MOVQ rT1, _ke(oState); \ - K_RT1_RCE; \ - \ - XORQ rDa, rBu; \ - ROLQ $18, rBu; \ - NOTQ rBo; \ - MOVQ rBo, rT1; \ - ANDQ rBu, rT1; \ - XORQ rBi, rT1; \ - MOVQ rT1, _ki(oState); \ - \ - MOVQ rBu, rT1; \ - ORQ rBa, rT1; \ - XORQ rBo, rT1; \ - MOVQ rT1, _ko(oState); \ - \ - ANDQ rBe, rBa; \ - XORQ rBu, rBa; \ - MOVQ rBa, _ku(oState); \ - K_RBA_RCU; \ - \ - /* Result m */ \ - MOVQ _ga(iState), rBe; \ - XORQ rDa, rBe; \ - MOVQ _ke(iState), rBi; \ - ROLQ $36, rBe; \ - XORQ rDe, rBi; \ - MOVQ _bu(iState), rBa; \ - ROLQ $10, rBi; \ - MOVQ rBe, rT1; \ - MOVQ _mi(iState), rBo; \ - ANDQ rBi, rT1; \ - XORQ rDu, rBa; \ - MOVQ _so(iState), rBu; \ - ROLQ $27, rBa; \ - XORQ rBa, rT1; \ - MOVQ rT1, _ma(oState); \ - M_RT1_RCA; \ - \ - XORQ rDi, rBo; \ - ROLQ $15, rBo; \ - MOVQ rBi, rT1; \ - ORQ rBo, rT1; \ - XORQ rBe, rT1; \ - MOVQ rT1, _me(oState); \ - M_RT1_RCE; \ - \ - XORQ rDo, rBu; \ - ROLQ $56, rBu; \ - NOTQ rBo; \ - MOVQ rBo, rT1; \ - ORQ rBu, rT1; \ - XORQ rBi, rT1; \ - MOVQ rT1, _mi(oState); \ - \ - ORQ rBa, rBe; \ - XORQ rBu, rBe; \ - MOVQ rBe, _mu(oState); \ - \ - ANDQ rBa, rBu; \ - XORQ rBo, rBu; \ - MOVQ rBu, _mo(oState); \ - M_RBE_RCU; \ - \ - /* Result s */ \ - MOVQ _bi(iState), rBa; \ - MOVQ _go(iState), rBe; \ - MOVQ _ku(iState), rBi; \ - XORQ rDi, rBa; \ - MOVQ _ma(iState), rBo; \ - ROLQ $62, rBa; \ - XORQ rDo, rBe; \ - MOVQ _se(iState), rBu; \ - ROLQ $55, rBe; \ - \ - XORQ rDu, rBi; \ - MOVQ rBa, rDu; \ - XORQ rDe, rBu; \ - ROLQ $2, rBu; \ - ANDQ rBe, rDu; \ - XORQ rBu, rDu; \ - MOVQ rDu, _su(oState); \ - \ - ROLQ $39, rBi; \ - S_RDU_RCU; \ - NOTQ rBe; \ - XORQ rDa, rBo; \ - MOVQ rBe, rDa; \ - ANDQ rBi, rDa; \ - XORQ rBa, rDa; \ - MOVQ rDa, _sa(oState); \ - S_RDA_RCA; \ - \ - ROLQ $41, rBo; \ - MOVQ rBi, rDe; \ - ORQ rBo, rDe; \ - XORQ rBe, rDe; \ - MOVQ rDe, _se(oState); \ - S_RDE_RCE; \ - \ - MOVQ rBo, rDi; \ - MOVQ rBu, rDo; \ - ANDQ rBu, rDi; \ - ORQ rBa, rDo; \ - XORQ rBi, rDi; \ - XORQ rBo, rDo; \ - MOVQ rDi, _si(oState); \ - MOVQ rDo, _so(oState) \ - // func keccakF1600(a *[25]uint64) -TEXT ·keccakF1600(SB), 0, $200-8 - MOVQ a+0(FP), rpState +TEXT ·keccakF1600(SB), $200-8 + MOVQ a+0(FP), DI // Convert the user state into an internal state - NOTQ _be(rpState) - NOTQ _bi(rpState) - NOTQ _go(rpState) - NOTQ _ki(rpState) - NOTQ _mi(rpState) - NOTQ _sa(rpState) + NOTQ 8(DI) + NOTQ 16(DI) + NOTQ 64(DI) + NOTQ 96(DI) + NOTQ 136(DI) + NOTQ 160(DI) // Execute the KeccakF permutation - MOVQ _ba(rpState), rCa - MOVQ _be(rpState), rCe - MOVQ _bu(rpState), rCu - - XORQ _ga(rpState), rCa - XORQ _ge(rpState), rCe - XORQ _gu(rpState), rCu - - XORQ _ka(rpState), rCa - XORQ _ke(rpState), rCe - XORQ _ku(rpState), rCu - - XORQ _ma(rpState), rCa - XORQ _me(rpState), rCe - XORQ _mu(rpState), rCu - - XORQ _sa(rpState), rCa - XORQ _se(rpState), rCe - MOVQ _si(rpState), rDi - MOVQ _so(rpState), rDo - XORQ _su(rpState), rCu - - mKeccakRound(rpState, rpStack, $0x0000000000000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x0000000000008082, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x800000000000808a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000080008000, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000000000808b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x0000000080000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000080008081, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000008009, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000000000008a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x0000000000000088, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x0000000080008009, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x000000008000000a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000008000808b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x800000000000008b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000000008089, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000008003, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000000008002, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000000080, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000000000800a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x800000008000000a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000080008081, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000008080, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x0000000080000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000080008008, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP) + MOVQ (DI), SI + MOVQ 8(DI), BP + MOVQ 32(DI), R15 + XORQ 40(DI), SI + XORQ 48(DI), BP + XORQ 72(DI), R15 + XORQ 80(DI), SI + XORQ 88(DI), BP + XORQ 112(DI), R15 + XORQ 120(DI), SI + XORQ 128(DI), BP + XORQ 152(DI), R15 + XORQ 160(DI), SI + XORQ 168(DI), BP + MOVQ 176(DI), DX + MOVQ 184(DI), R8 + XORQ 192(DI), R15 - // Revert the internal state to the user state - NOTQ _be(rpState) - NOTQ _bi(rpState) - NOTQ _go(rpState) - NOTQ _ki(rpState) - NOTQ _mi(rpState) - NOTQ _sa(rpState) + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000000000001, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000000008082, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x800000000000808a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000080008000, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000000000808b, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000080000001, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000080008081, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008009, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000000000008a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000000000088, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000080008009, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000008000000a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000008000808b, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x800000000000008b, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008089, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008003, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008002, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000000080, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x000000000000800a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x800000008000000a, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000080008081, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000000008080, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + MOVQ R12, BP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + XORQ R10, R15 + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + XORQ R11, R15 + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(DI), R12 + XORQ 56(DI), DX + XORQ R15, BX + XORQ 96(DI), R12 + XORQ 136(DI), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(DI), R13 + XORQ 64(DI), R8 + XORQ SI, CX + XORQ 104(DI), R13 + XORQ 144(DI), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (DI), R10 + MOVQ 48(DI), R11 + XORQ R13, R9 + MOVQ 96(DI), R12 + MOVQ 144(DI), R13 + MOVQ 192(DI), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x0000000080000001, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (SP) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(SP) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(SP) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(SP) + MOVQ R12, 8(SP) + MOVQ R12, BP + + // Result g + MOVQ 72(DI), R11 + XORQ R9, R11 + MOVQ 80(DI), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(DI), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(DI), R13 + MOVQ 176(DI), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(SP) + XORQ AX, SI + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(SP) + XORQ AX, BP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(SP) + NOTQ R14 + XORQ R10, R15 + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(SP) + + // Result k + MOVQ 8(DI), R10 + MOVQ 56(DI), R11 + MOVQ 104(DI), R12 + MOVQ 152(DI), R13 + MOVQ 160(DI), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(SP) + XORQ AX, SI + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(SP) + XORQ AX, BP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(SP) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(SP) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(SP) + XORQ R10, R15 + + // Result m + MOVQ 40(DI), R11 + XORQ BX, R11 + MOVQ 88(DI), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(DI), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(DI), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(DI), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(SP) + XORQ AX, SI + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(SP) + XORQ AX, BP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(SP) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(SP) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(SP) + XORQ R11, R15 + + // Result s + MOVQ 16(DI), R10 + MOVQ 64(DI), R11 + MOVQ 112(DI), R12 + XORQ DX, R10 + MOVQ 120(DI), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(DI), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(SP) + ROLQ $0x27, R12 + XORQ R9, R15 + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(SP) + XORQ BX, SI + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(SP) + XORQ CX, BP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(SP) + MOVQ R8, 184(SP) + + // Prepare round + MOVQ BP, BX + ROLQ $0x01, BX + MOVQ 16(SP), R12 + XORQ 56(SP), DX + XORQ R15, BX + XORQ 96(SP), R12 + XORQ 136(SP), DX + XORQ DX, R12 + MOVQ R12, CX + ROLQ $0x01, CX + MOVQ 24(SP), R13 + XORQ 64(SP), R8 + XORQ SI, CX + XORQ 104(SP), R13 + XORQ 144(SP), R8 + XORQ R8, R13 + MOVQ R13, DX + ROLQ $0x01, DX + MOVQ R15, R8 + XORQ BP, DX + ROLQ $0x01, R8 + MOVQ SI, R9 + XORQ R12, R8 + ROLQ $0x01, R9 + + // Result b + MOVQ (SP), R10 + MOVQ 48(SP), R11 + XORQ R13, R9 + MOVQ 96(SP), R12 + MOVQ 144(SP), R13 + MOVQ 192(SP), R14 + XORQ CX, R11 + ROLQ $0x2c, R11 + XORQ DX, R12 + XORQ BX, R10 + ROLQ $0x2b, R12 + MOVQ R11, SI + MOVQ $0x8000000080008008, AX + ORQ R12, SI + XORQ R10, AX + XORQ AX, SI + MOVQ SI, (DI) + XORQ R9, R14 + ROLQ $0x0e, R14 + MOVQ R10, R15 + ANDQ R11, R15 + XORQ R14, R15 + MOVQ R15, 32(DI) + XORQ R8, R13 + ROLQ $0x15, R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 16(DI) + NOTQ R12 + ORQ R10, R14 + ORQ R13, R12 + XORQ R13, R14 + XORQ R11, R12 + MOVQ R14, 24(DI) + MOVQ R12, 8(DI) + NOP + + // Result g + MOVQ 72(SP), R11 + XORQ R9, R11 + MOVQ 80(SP), R12 + ROLQ $0x14, R11 + XORQ BX, R12 + ROLQ $0x03, R12 + MOVQ 24(SP), R10 + MOVQ R11, AX + ORQ R12, AX + XORQ R8, R10 + MOVQ 128(SP), R13 + MOVQ 176(SP), R14 + ROLQ $0x1c, R10 + XORQ R10, AX + MOVQ AX, 40(DI) + NOP + XORQ CX, R13 + ROLQ $0x2d, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 48(DI) + NOP + XORQ DX, R14 + ROLQ $0x3d, R14 + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 64(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 72(DI) + NOTQ R14 + NOP + ORQ R14, R13 + XORQ R12, R13 + MOVQ R13, 56(DI) + + // Result k + MOVQ 8(SP), R10 + MOVQ 56(SP), R11 + MOVQ 104(SP), R12 + MOVQ 152(SP), R13 + MOVQ 160(SP), R14 + XORQ DX, R11 + ROLQ $0x06, R11 + XORQ R8, R12 + ROLQ $0x19, R12 + MOVQ R11, AX + ORQ R12, AX + XORQ CX, R10 + ROLQ $0x01, R10 + XORQ R10, AX + MOVQ AX, 80(DI) + NOP + XORQ R9, R13 + ROLQ $0x08, R13 + MOVQ R12, AX + ANDQ R13, AX + XORQ R11, AX + MOVQ AX, 88(DI) + NOP + XORQ BX, R14 + ROLQ $0x12, R14 + NOTQ R13 + MOVQ R13, AX + ANDQ R14, AX + XORQ R12, AX + MOVQ AX, 96(DI) + MOVQ R14, AX + ORQ R10, AX + XORQ R13, AX + MOVQ AX, 104(DI) + ANDQ R11, R10 + XORQ R14, R10 + MOVQ R10, 112(DI) + NOP + + // Result m + MOVQ 40(SP), R11 + XORQ BX, R11 + MOVQ 88(SP), R12 + ROLQ $0x24, R11 + XORQ CX, R12 + MOVQ 32(SP), R10 + ROLQ $0x0a, R12 + MOVQ R11, AX + MOVQ 136(SP), R13 + ANDQ R12, AX + XORQ R9, R10 + MOVQ 184(SP), R14 + ROLQ $0x1b, R10 + XORQ R10, AX + MOVQ AX, 120(DI) + NOP + XORQ DX, R13 + ROLQ $0x0f, R13 + MOVQ R12, AX + ORQ R13, AX + XORQ R11, AX + MOVQ AX, 128(DI) + NOP + XORQ R8, R14 + ROLQ $0x38, R14 + NOTQ R13 + MOVQ R13, AX + ORQ R14, AX + XORQ R12, AX + MOVQ AX, 136(DI) + ORQ R10, R11 + XORQ R14, R11 + MOVQ R11, 152(DI) + ANDQ R10, R14 + XORQ R13, R14 + MOVQ R14, 144(DI) + NOP + + // Result s + MOVQ 16(SP), R10 + MOVQ 64(SP), R11 + MOVQ 112(SP), R12 + XORQ DX, R10 + MOVQ 120(SP), R13 + ROLQ $0x3e, R10 + XORQ R8, R11 + MOVQ 168(SP), R14 + ROLQ $0x37, R11 + XORQ R9, R12 + MOVQ R10, R9 + XORQ CX, R14 + ROLQ $0x02, R14 + ANDQ R11, R9 + XORQ R14, R9 + MOVQ R9, 192(DI) + ROLQ $0x27, R12 + NOP + NOTQ R11 + XORQ BX, R13 + MOVQ R11, BX + ANDQ R12, BX + XORQ R10, BX + MOVQ BX, 160(DI) + NOP + ROLQ $0x29, R13 + MOVQ R12, CX + ORQ R13, CX + XORQ R11, CX + MOVQ CX, 168(DI) + NOP + MOVQ R13, DX + MOVQ R14, R8 + ANDQ R14, DX + ORQ R10, R8 + XORQ R12, DX + XORQ R13, R8 + MOVQ DX, 176(DI) + MOVQ R8, 184(DI) + + // Revert the internal state to the user state + NOTQ 8(DI) + NOTQ 16(DI) + NOTQ 64(DI) + NOTQ 96(DI) + NOTQ 136(DI) + NOTQ 160(DI) RET From 68797222744d17ebda05804c6a5912bd129b8112 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Tue, 16 Jul 2024 14:05:34 -0400 Subject: [PATCH 26/57] ssh: remove go 1.21+ dependency on slices Fixes golang/go#68469 Change-Id: Ieea3c444b0458d169a6ff224e59b3b815264de89 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/598775 LUCI-TryBot-Result: Go LUCI Reviewed-by: Nicola Murino Auto-Submit: Nicola Murino Reviewed-by: Cherry Mui Reviewed-by: Carlos Amedee --- ssh/server_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ssh/server_test.go b/ssh/server_test.go index 9057a9b5f0..b6d8ab3333 100644 --- a/ssh/server_test.go +++ b/ssh/server_test.go @@ -9,7 +9,7 @@ import ( "fmt" "io" "net" - "slices" + "reflect" "strings" "sync/atomic" "testing" @@ -294,7 +294,7 @@ func TestBannerError(t *testing.T) { "banner from PublicKeyCallback", "banner from KeyboardInteractiveCallback", } - if !slices.Equal(banners, wantBanners) { + if !reflect.DeepEqual(banners, wantBanners) { t.Errorf("got banners:\n%q\nwant banners:\n%q", banners, wantBanners) } } From bb80217080b0e04c6e73e5dcd9f3a9bb11fe23f6 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sun, 21 Jul 2024 11:43:44 +0200 Subject: [PATCH 27/57] ssh: don't use dsa keys in integration tests DSA has been disabled by default since OpenSSH 9.8, so tests fail with newer versions of OpenSSH Change-Id: I57b9abde8845cd05116a637a21cbbb8af740b2e0 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/599955 Reviewed-by: Dmitri Shuralyov Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker --- ssh/agent/client_test.go | 10 +++++----- ssh/test/agent_unix_test.go | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ssh/agent/client_test.go b/ssh/agent/client_test.go index ae03df1a63..f0ffd59592 100644 --- a/ssh/agent/client_test.go +++ b/ssh/agent/client_test.go @@ -165,9 +165,9 @@ func testAgentInterface(t *testing.T, agent ExtendedAgent, key interface{}, cert sig, err := agent.Sign(pubKey, data) if err != nil { t.Logf("sign failed with key type %q", pubKey.Type()) - // In integration tests ssh-dss and ssh-rsa (SHA1 signatures) may be - // disabled for security reasons, we check SHA-2 variants later. - if pubKey.Type() != ssh.KeyAlgoDSA && pubKey.Type() != ssh.KeyAlgoRSA && pubKey.Type() != ssh.CertAlgoRSAv01 { + // In integration tests ssh-rsa (SHA1 signatures) may be disabled for + // security reasons, we check SHA-2 variants later. + if pubKey.Type() != ssh.KeyAlgoRSA && pubKey.Type() != ssh.CertAlgoRSAv01 { t.Fatalf("Sign(%s): %v", pubKey.Type(), err) } } else { @@ -251,7 +251,7 @@ func TestMalformedRequests(t *testing.T) { } func TestAgent(t *testing.T) { - for _, keyType := range []string{"rsa", "dsa", "ecdsa", "ed25519"} { + for _, keyType := range []string{"rsa", "ecdsa", "ed25519"} { testOpenSSHAgent(t, testPrivateKeys[keyType], nil, 0) testKeyringAgent(t, testPrivateKeys[keyType], nil, 0) } @@ -409,7 +409,7 @@ func testLockAgent(agent Agent, t *testing.T) { if err := agent.Add(AddedKey{PrivateKey: testPrivateKeys["rsa"], Comment: "comment 1"}); err != nil { t.Errorf("Add: %v", err) } - if err := agent.Add(AddedKey{PrivateKey: testPrivateKeys["dsa"], Comment: "comment dsa"}); err != nil { + if err := agent.Add(AddedKey{PrivateKey: testPrivateKeys["ecdsa"], Comment: "comment ecdsa"}); err != nil { t.Errorf("Add: %v", err) } if keys, err := agent.List(); err != nil { diff --git a/ssh/test/agent_unix_test.go b/ssh/test/agent_unix_test.go index a9c4893f7d..9257bfe1bc 100644 --- a/ssh/test/agent_unix_test.go +++ b/ssh/test/agent_unix_test.go @@ -20,17 +20,17 @@ func TestAgentForward(t *testing.T) { defer conn.Close() keyring := agent.NewKeyring() - if err := keyring.Add(agent.AddedKey{PrivateKey: testPrivateKeys["dsa"]}); err != nil { + if err := keyring.Add(agent.AddedKey{PrivateKey: testPrivateKeys["ecdsa"]}); err != nil { t.Fatalf("Error adding key: %s", err) } if err := keyring.Add(agent.AddedKey{ - PrivateKey: testPrivateKeys["dsa"], + PrivateKey: testPrivateKeys["ecdsa"], ConfirmBeforeUse: true, LifetimeSecs: 3600, }); err != nil { t.Fatalf("Error adding key with constraints: %s", err) } - pub := testPublicKeys["dsa"] + pub := testPublicKeys["ecdsa"] sess, err := conn.NewSession() if err != nil { From 3375612bf41a8cdb0cdfdc21e6ca2c7ae0f764b5 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Tue, 16 Jul 2024 15:48:14 -0400 Subject: [PATCH 28/57] ssh: add support for unpadded RSA signatures The original SSH RFC 4253 explicitly disallows padding. This applies to ssh-rsa signatures. The updated SSH RFC 8332 which defines the SHA2 RSA signature variants explicitly calls out the existence of signers who produce short signatures and specifies that verifiers may allow this behavior. In practice, PuTTY 0.81 and prior versions, as well as SSH.NET prior to 2024.1.0 always generated short signatures. Furthermore, PuTTY is embedded in other software like WinSCP and FileZilla, which are updated on their own schedules as well. This leads to occasional unexplained login errors, when using RSA keys. OpenSSH server allows these short signatures for all RSA algorithms. Fixes golang/go#68286 Change-Id: Ia60ece21bf9c111c490fac0c066443ed5ff7dd29 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/598534 Reviewed-by: Nicola Murino Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI --- ssh/keys.go | 44 +++++++++++++++++++++++++++++++++++++++++++- ssh/keys_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/ssh/keys.go b/ssh/keys.go index 7967665f17..98e6706d5d 100644 --- a/ssh/keys.go +++ b/ssh/keys.go @@ -488,7 +488,49 @@ func (r *rsaPublicKey) Verify(data []byte, sig *Signature) error { h := hash.New() h.Write(data) digest := h.Sum(nil) - return rsa.VerifyPKCS1v15((*rsa.PublicKey)(r), hash, digest, sig.Blob) + + // Signatures in PKCS1v15 must match the key's modulus in + // length. However with SSH, some signers provide RSA + // signatures which are missing the MSB 0's of the bignum + // represented. With ssh-rsa signatures, this is encouraged by + // the spec (even though e.g. OpenSSH will give the full + // length unconditionally). With rsa-sha2-* signatures, the + // verifier is allowed to support these, even though they are + // out of spec. See RFC 4253 Section 6.6 for ssh-rsa and RFC + // 8332 Section 3 for rsa-sha2-* details. + // + // In practice: + // * OpenSSH always allows "short" signatures: + // https://github.com/openssh/openssh-portable/blob/V_9_8_P1/ssh-rsa.c#L526 + // but always generates padded signatures: + // https://github.com/openssh/openssh-portable/blob/V_9_8_P1/ssh-rsa.c#L439 + // + // * PuTTY versions 0.81 and earlier will generate short + // signatures for all RSA signature variants. Note that + // PuTTY is embedded in other software, such as WinSCP and + // FileZilla. At the time of writing, a patch has been + // applied to PuTTY to generate padded signatures for + // rsa-sha2-*, but not yet released: + // https://git.tartarus.org/?p=simon/putty.git;a=commitdiff;h=a5bcf3d384e1bf15a51a6923c3724cbbee022d8e + // + // * SSH.NET versions 2024.0.0 and earlier will generate short + // signatures for all RSA signature variants, fixed in 2024.1.0: + // https://github.com/sshnet/SSH.NET/releases/tag/2024.1.0 + // + // As a result, we pad these up to the key size by inserting + // leading 0's. + // + // Note that support for short signatures with rsa-sha2-* may + // be removed in the future due to such signatures not being + // allowed by the spec. + blob := sig.Blob + keySize := (*rsa.PublicKey)(r).Size() + if len(blob) < keySize { + padded := make([]byte, keySize) + copy(padded[keySize-len(blob):], blob) + blob = padded + } + return rsa.VerifyPKCS1v15((*rsa.PublicKey)(r), hash, digest, blob) } func (r *rsaPublicKey) CryptoPublicKey() crypto.PublicKey { diff --git a/ssh/keys_test.go b/ssh/keys_test.go index 7b14429e17..7d5b86ff0d 100644 --- a/ssh/keys_test.go +++ b/ssh/keys_test.go @@ -154,6 +154,44 @@ func TestKeySignWithAlgorithmVerify(t *testing.T) { } } +func TestKeySignWithShortSignature(t *testing.T) { + signer := testSigners["rsa"].(AlgorithmSigner) + pub := signer.PublicKey() + // Note: data obtained by empirically trying until a result + // starting with 0 appeared + tests := []struct { + algorithm string + data []byte + }{ + { + algorithm: KeyAlgoRSA, + data: []byte("sign me92"), + }, + { + algorithm: KeyAlgoRSASHA256, + data: []byte("sign me294"), + }, + { + algorithm: KeyAlgoRSASHA512, + data: []byte("sign me60"), + }, + } + + for _, tt := range tests { + sig, err := signer.SignWithAlgorithm(rand.Reader, tt.data, tt.algorithm) + if err != nil { + t.Fatalf("Sign(%T): %v", signer, err) + } + if sig.Blob[0] != 0 { + t.Errorf("%s: Expected signature with a leading 0", tt.algorithm) + } + sig.Blob = sig.Blob[1:] + if err := pub.Verify(tt.data, sig); err != nil { + t.Errorf("publicKey.Verify(%s): %v", tt.algorithm, err) + } + } +} + func TestParseRSAPrivateKey(t *testing.T) { key := testPrivateKeys["rsa"] From 5bcd010f1cdaf2257509bfb7b43eaad62b7928fd Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Tue, 6 Aug 2024 15:29:05 +0000 Subject: [PATCH 29/57] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: Iae75e5fcbcfe3709820dd66638a763f662f8d939 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/603396 Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: David Chase --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index 7e3b4eca74..e206587663 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.22.0 - golang.org/x/term v0.22.0 + golang.org/x/sys v0.23.0 + golang.org/x/term v0.23.0 ) -require golang.org/x/text v0.16.0 // indirect +require golang.org/x/text v0.17.0 // indirect diff --git a/go.sum b/go.sum index 32e3e58b33..c7231381e6 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= -golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= -golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= +golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= +golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= +golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= +golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= From b2d3a6a4b4d36521cd7f653879cf6981e7c5c340 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sun, 4 Aug 2024 16:01:34 +0200 Subject: [PATCH 30/57] ssh/agent: ensure to not add duplicated keys When adding a new key, if we already have a Signer with the same public key, we now replace it with the new one instead of duplicating it. Before this change we had this: $ ssh-add -l 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA) 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA-CERT) $ ssh-add /home/nicola/ssh_certs/id_rsa Identity added: /home/nicola/ssh_certs/id_rsa (nicola@p1) Certificate added: /home/nicola/ssh_certs/id_rsa-cert.pub (myid) $ ssh-add -l 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA) 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA-CERT) 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA) 3072 SHA256:bsBRHC/xgiqBJdSuvSTNpJNLTISP/G356jNMCRYC5Es nicola@p1 (RSA-CERT) Change-Id: Iad1b1a6dc94f68f53f05d7d1172f0017839976fc Reviewed-on: https://go-review.googlesource.com/c/crypto/+/602955 Reviewed-by: Filippo Valsorda Auto-Submit: Nicola Murino Reviewed-by: David Chase Reviewed-by: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- ssh/agent/keyring.go | 9 ++++++++ ssh/agent/keyring_test.go | 46 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/ssh/agent/keyring.go b/ssh/agent/keyring.go index 21bfa870fa..c1b4361087 100644 --- a/ssh/agent/keyring.go +++ b/ssh/agent/keyring.go @@ -175,6 +175,15 @@ func (r *keyring) Add(key AddedKey) error { p.expire = &t } + // If we already have a Signer with the same public key, replace it with the + // new one. + for idx, k := range r.keys { + if bytes.Equal(k.signer.PublicKey().Marshal(), p.signer.PublicKey().Marshal()) { + r.keys[idx] = p + return nil + } + } + r.keys = append(r.keys, p) return nil diff --git a/ssh/agent/keyring_test.go b/ssh/agent/keyring_test.go index e5d50e7e0d..e9c90a3131 100644 --- a/ssh/agent/keyring_test.go +++ b/ssh/agent/keyring_test.go @@ -29,6 +29,10 @@ func validateListedKeys(t *testing.T, a Agent, expectedKeys []string) { t.Fatalf("failed to list keys: %v", err) return } + if len(listedKeys) != len(expectedKeys) { + t.Fatalf("expeted %d key, got %d", len(expectedKeys), len(listedKeys)) + return + } actualKeys := make(map[string]bool) for _, key := range listedKeys { actualKeys[key.Comment] = true @@ -74,3 +78,45 @@ func TestKeyringAddingAndRemoving(t *testing.T) { } validateListedKeys(t, k, []string{}) } + +func TestAddDuplicateKey(t *testing.T) { + keyNames := []string{"rsa", "user"} + + k := NewKeyring() + for _, keyName := range keyNames { + addTestKey(t, k, keyName) + } + validateListedKeys(t, k, keyNames) + // Add the keys again. + for _, keyName := range keyNames { + addTestKey(t, k, keyName) + } + validateListedKeys(t, k, keyNames) + // Add an existing key with an updated comment. + keyName := keyNames[0] + addedKey := AddedKey{ + PrivateKey: testPrivateKeys[keyName], + Comment: "comment updated", + } + err := k.Add(addedKey) + if err != nil { + t.Fatalf("failed to add key %q: %v", keyName, err) + } + // Check the that key is found and the comment was updated. + keys, err := k.List() + if err != nil { + t.Fatalf("failed to list keys: %v", err) + } + if len(keys) != len(keyNames) { + t.Fatalf("expected %d keys, got %d", len(keyNames), len(keys)) + } + isFound := false + for _, key := range keys { + if key.Comment == addedKey.Comment { + isFound = true + } + } + if !isFound { + t.Fatal("key with the updated comment not found") + } +} From bf5f14f5457a10932761a9abe55eb6e2d05e092a Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 2 Sep 2024 16:01:11 +0000 Subject: [PATCH 31/57] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: I95cf0b3e86f1e013d486a0bbd050a8b4bea5d6e9 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/610060 Reviewed-by: Roland Shoemaker Auto-Submit: Gopher Robot LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- x509roots/fallback/bundle.go | 107 +++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index 35d7c1c550..80ce10fe3c 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -2794,6 +2794,79 @@ I50mD1hp/Ed+stCNi5O/KU9DaXR2Z0vPB4zmAve14bRDtUstFJ/53CYNv6ZHdAbY iNE6KTCEztI5gGIbqMdXSbxqVVFnFUq+NQfk1XWYN3kwFNspnWzFacxHVaIw98xc f8LDmBxrThaA63p4ZUWiABqvDA1VZDRIuJK58bRQKfJPIx/abKwfROHdI3hRW8cW -----END CERTIFICATE----- +# CN=SecureSign Root CA12,O=Cybertrust Japan Co.\, Ltd.,C=JP +# 3f034bb5704d44b2d08545a02057de93ebf3905fce721acbc730c06ddaee904e +-----BEGIN CERTIFICATE----- +MIIDcjCCAlqgAwIBAgIUZvnHwa/swlG07VOX5uaCwysckBYwDQYJKoZIhvcNAQEL +BQAwUTELMAkGA1UEBhMCSlAxIzAhBgNVBAoTGkN5YmVydHJ1c3QgSmFwYW4gQ28u +LCBMdGQuMR0wGwYDVQQDExRTZWN1cmVTaWduIFJvb3QgQ0ExMjAeFw0yMDA0MDgw +NTM2NDZaFw00MDA0MDgwNTM2NDZaMFExCzAJBgNVBAYTAkpQMSMwIQYDVQQKExpD +eWJlcnRydXN0IEphcGFuIENvLiwgTHRkLjEdMBsGA1UEAxMUU2VjdXJlU2lnbiBS +b290IENBMTIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC6OcE3emhF +KxS06+QT61d1I02PJC0W6K6OyX2kVzsqdiUzg2zqMoqUm048luT9Ub+ZyZN+v/mt +p7JIKwccJ/VMvHASd6SFVLX9kHrko+RRWAPNEHl57muTH2SOa2SroxPjcf59q5zd +J1M3s6oYwlkm7Fsf0uZlfO+TvdhYXAvA42VvPMfKWeP+bl+sg779XSVOKik71gur +FzJ4pOE+lEa+Ym6b3kaosRbnhW70CEBFEaCeVESE99g2zvVQR9wsMJvuwPWW0v4J +hscGWa5Pro4RmHvzC1KqYiaqId+OJTN5lxZJjfU+1UefNzFJM3IFTQy2VYzxV4+K +h9GtxRESOaCtAgMBAAGjQjBAMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQD +AgEGMB0GA1UdDgQWBBRXNPN0zwRL1SXm8UC2LEzZLemgrTANBgkqhkiG9w0BAQsF +AAOCAQEAPrvbFxbS8hQBICw4g0utvsqFepq2m2um4fylOqyttCg6r9cBg0krY6Ld +mmQOmFxv3Y67ilQiLUoT865AQ9tPkbeGGuwAtEGBpE/6aouIs3YIcipJQMPTw4WJ +mBClnW8Zt7vPemVV2zfrPIpyMpcemik+rY3moxtt9XUa5rBouVui7mlHJzWhhpmA +8zNL4WukJsPvdFlseqJkth5Ew1DgDzk9qTPxpfPSvWKErI4cqc1avTc7bgoitPQV +55FYxTpE05Uo2cBl6XLK0A+9H7MV2anjpEcJnuDLN/v9vZfVvhgaaaI5gdka9at/ +yOPiZwud9AzqVN/Ssq+xIvEg37xEHA== +-----END CERTIFICATE----- +# CN=SecureSign Root CA14,O=Cybertrust Japan Co.\, Ltd.,C=JP +# 4b009c1034494f9ab56bba3ba1d62731fc4d20d8955adcec10a925607261e338 +-----BEGIN CERTIFICATE----- +MIIFcjCCA1qgAwIBAgIUZNtaDCBO6Ncpd8hQJ6JaJ90t8sswDQYJKoZIhvcNAQEM +BQAwUTELMAkGA1UEBhMCSlAxIzAhBgNVBAoTGkN5YmVydHJ1c3QgSmFwYW4gQ28u +LCBMdGQuMR0wGwYDVQQDExRTZWN1cmVTaWduIFJvb3QgQ0ExNDAeFw0yMDA0MDgw +NzA2MTlaFw00NTA0MDgwNzA2MTlaMFExCzAJBgNVBAYTAkpQMSMwIQYDVQQKExpD +eWJlcnRydXN0IEphcGFuIENvLiwgTHRkLjEdMBsGA1UEAxMUU2VjdXJlU2lnbiBS +b290IENBMTQwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDF0nqh1oq/ +FjHQmNE6lPxauG4iwWL3pwon71D2LrGeaBLwbCRjOfHw3xDG3rdSINVSW0KZnvOg +vlIfX8xnbacuUKLBl422+JX1sLrcneC+y9/3OPJH9aaakpUqYllQC6KxNedlsmGy +6pJxaeQp8E+BgQQ8sqVb1MWoWWd7VRxJq3qdwudzTe/NCcLEVxLbAQ4jeQkHO6Lo +/IrPj8BGJJw4J+CDnRugv3gVEOuGTgpa/d/aLIJ+7sr2KeH6caH3iGicnPCNvg9J +kdjqOvn90Ghx2+m1K06Ckm9mH+Dw3EzsytHqunQG+bOEkJTRX45zGRBdAuVwpcAQ +0BB8b8VYSbSwbprafZX1zNoCr7gsfXmPvkPx+SgojQlD+Ajda8iLLCSxjVIHvXib +y8posqTdDEx5YMaZ0ZPxMBoH064iwurO8YQJzOAUbn8/ftKChazcqRZOhaBgy/ac +18izju3Gm5h1DVXoX+WViwKkrkMpKBGk5hIwAUt1ax5mnXkvpXYvHUC0bcl9eQjs +0Wq2XSqypWa9a4X0dFbD9ed1Uigspf9mR6XU/v6eVL9lfgHWMI+lNpyiUBzuOIAB +SMbHdPTGrMNASRZhdCyvjG817XsYAFs2PJxQDcqSMxDxJklt33UkN4Ii1+iW/RVL +ApY+B3KVfqs9TC7XyvDf4Fg/LS8EmjijAQIDAQABo0IwQDAPBgNVHRMBAf8EBTAD +AQH/MA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUBpOjCl4oaTeqYR3r6/wtbyPk +86AwDQYJKoZIhvcNAQEMBQADggIBAJaAcgkGfpzMkwQWu6A6jZJOtxEaCnFxEM0E +rX+lRVAQZk5KQaID2RFPeje5S+LGjzJmdSX7684/AykmjbgWHfYfM25I5uj4V7Ib +ed87hwriZLoAymzvftAj63iP/2SbNDefNWWipAA9EiOWWF3KY4fGoweITedpdopT +zfFP7ELyk+OZpDc8h7hi2/DsHzc/N19DzFGdtfCXwreFamgLRB7lUe6TzktuhsHS +DCRZNhqfLJGP4xjblJUK7ZGqDpncllPjYYPGFrojutzdfhrGe0K22VoF3Jpf1d+4 +2kd92jjbrDnVHmtsKheMYc2xbXIBw8MgAGJoFjHVdqqGuw6qnsb58Nn4DSEC5MUo +FlkRudlpcyqSeLiSV5sI8jrlL5WwWLdrIBRtFO8KvH7YVdiI2i/6GaX7i+B/OfVy +K4XELKzvGUWSTLNhB9xNH27SgRNcmvMSZ4PPmz+Ln52kuaiWA3rF7iDeM9ovnhp6 +dB7h7sxaOgTdsxoEqBRjrLdHEoOabPXm6RUVkRqEGQ6UROcSjiVbgGcZ3GOTEAtl +Lor6CZpO2oYofaphNdgOpygau1LgePhsumywbrmHXumZNTfxPWQrqaA0k89jL9WB +365jJ6UeTo3cKXhZ+PmhIIynJkBugnLNeLLIjzwec+fBH7/PzqUqm9tEZDKgu39c +JRNItX+S +-----END CERTIFICATE----- +# CN=SecureSign Root CA15,O=Cybertrust Japan Co.\, Ltd.,C=JP +# e778f0f095fe843729cd1a0082179e5314a9c291442805e1fb1d8fb6b8886c3a +-----BEGIN CERTIFICATE----- +MIICIzCCAamgAwIBAgIUFhXHw9hJp75pDIqI7fBw+d23PocwCgYIKoZIzj0EAwMw +UTELMAkGA1UEBhMCSlAxIzAhBgNVBAoTGkN5YmVydHJ1c3QgSmFwYW4gQ28uLCBM +dGQuMR0wGwYDVQQDExRTZWN1cmVTaWduIFJvb3QgQ0ExNTAeFw0yMDA0MDgwODMy +NTZaFw00NTA0MDgwODMyNTZaMFExCzAJBgNVBAYTAkpQMSMwIQYDVQQKExpDeWJl +cnRydXN0IEphcGFuIENvLiwgTHRkLjEdMBsGA1UEAxMUU2VjdXJlU2lnbiBSb290 +IENBMTUwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAAQLUHSNZDKZmbPSYAi4Io5GdCx4 +wCtELW1fHcmuS1Iggz24FG1Th2CeX2yF2wYUleDHKP+dX+Sq8bOLbe1PL0vJSpSR +ZHX+AezB2Ot6lHhWGENfa4HL9rzatAy2KZMIaY+jQjBAMA8GA1UdEwEB/wQFMAMB +Af8wDgYDVR0PAQH/BAQDAgEGMB0GA1UdDgQWBBTrQciu/NWeUUj1vYv0hyCTQSvT +9DAKBggqhkjOPQQDAwNoADBlAjEA2S6Jfl5OpBEHvVnCB96rMjhTKkZEBhd6zlHp +4P9mLQlO4E/0BdGF9jVg3PVys0Z9AjBEmEYagoUeYWmJSwdLZrWeqrqgHkHZAXQ6 +bkU6iYAZezKYVWOr62Nuk22rGwlgMU4= +-----END CERTIFICATE----- # CN=SecureSign RootCA11,O=Japan Certification Services\, Inc.,C=JP # bf0feefb9e3a581ad5f9e9db7589985743d261085c4d314f6f5d7259aa421612 -----BEGIN CERTIFICATE----- @@ -3062,6 +3135,40 @@ WL6ukK2YJ5f+AbGwUgC4TeQbIXQbfsDuXmkqJa9c1h3a0nnJ85cp4IaH3gRZD/FZ e9eiPZaGzPImNC1qkp2aGtAw4l1OBLBfiyB+d8E9lYLRRpo7PHi4b6HQDWSieB4p TpPDpFQUWw== -----END CERTIFICATE----- +# CN=TWCA CYBER Root CA,OU=Root CA,O=TAIWAN-CA,C=TW +# 3f63bb2814be174ec8b6439cf08d6d56f0b7c405883a5648a334424d6b3ec558 +-----BEGIN CERTIFICATE----- +MIIFjTCCA3WgAwIBAgIQQAE0jMIAAAAAAAAAATzyxjANBgkqhkiG9w0BAQwFADBQ +MQswCQYDVQQGEwJUVzESMBAGA1UEChMJVEFJV0FOLUNBMRAwDgYDVQQLEwdSb290 +IENBMRswGQYDVQQDExJUV0NBIENZQkVSIFJvb3QgQ0EwHhcNMjIxMTIyMDY1NDI5 +WhcNNDcxMTIyMTU1OTU5WjBQMQswCQYDVQQGEwJUVzESMBAGA1UEChMJVEFJV0FO +LUNBMRAwDgYDVQQLEwdSb290IENBMRswGQYDVQQDExJUV0NBIENZQkVSIFJvb3Qg +Q0EwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDG+Moe2Qkgfh1sTs6P +40czRJzHyWmqOlt47nDSkvgEs1JSHWdyKKHfi12VCv7qze33Kc7wb3+szT3vsxxF +avcokPFhV8UMxKNQXd7UtcsZyoC5dc4pztKFIuwCY8xEMCDa6pFbVuYdHNWdZsc/ +34bKS1PE2Y2yHer43CdTo0fhYcx9tbD47nORxc5zb87uEB8aBs/pJ2DFTxnk684i +JkXXYJndzk834H/nY62wuFm40AZoNWDTNq5xQwTxaWV4fPMf88oon1oglWa0zbfu +j3ikRRjpJi+NmykosaS3Om251Bw4ckVYsV7r8Cibt4LK/c/WMw+f+5eesRycnupf +Xtuq3VTpMCEobY5583WSjCb+3MX2w7DfRFlDo7YDKPYIMKoNM+HvnKkHIuNZW0CP +2oi3aQiotyMuRAlZN1vH4xfyIutuOVLF3lSnmMlLIJXcRolftBL5hSmO68gnFSDA +S9TMfAxsNAwmmyYxpjyn9tnQS6Jk/zuZQXLB4HCX8SS7K8R0IrGsayIyJNN4KsDA +oS/xUgXJP+92ZuJF2A09rZXIx4kmyA+upwMu+8Ff+iDhcK2wZSA3M2Cw1a/XDBzC +kHDXShi8fgGwsOsVHkQGzaRP6AzRwyAQ4VRlnrZR0Bp2a0JaWHY06rc3Ga4udfmW +5cFZ95RXKSWNOkyrTZpB0F8mAwIDAQABo2MwYTAOBgNVHQ8BAf8EBAMCAQYwDwYD +VR0TAQH/BAUwAwEB/zAfBgNVHSMEGDAWgBSdhWEUfMFib5do5E83QOGt4A1WNzAd +BgNVHQ4EFgQUnYVhFHzBYm+XaORPN0DhreANVjcwDQYJKoZIhvcNAQEMBQADggIB +AGSPesRiDrWIzLjHhg6hShbNcAu3p4ULs3a2D6f/CIsLJc+o1IN1KriWiLb73y0t +tGlTITVX1olNc79pj3CjYcya2x6a4CD4bLubIp1dhDGaLIrdaqHXKGnK/nZVekZn +68xDiBaiA9a5F/gZbG0jAn/xX9AKKSM70aoK7akXJlQKTcKlTfjF/biBzysseKNn +TKkHmvPfXvt89YnNdJdhEGoHK4Fa0o635yDRIG4kqIQnoVesqlVYL9zZyvpoBJ7t +RCT5dEA7IzOrg1oYJkK2bVS1FmAwbLGg+LhBoF1JSdJlBTrq/p1hvIbZv97Tujqx +f36SNI7JAG7cmL3c7IAFrQI932XtCwP39xaEBDG6k5TY8hL4iuO/Qq+n1M0RFxbI +Qh0UqEL20kCGoE8jypZFVmAGzbdVAaYBlGX+bgUJurSkquLvWL69J1bY73NxW0Qz +8ppy6rBePm6pUlvscG21h483XjyMnM7k8M4MZ0HMzvaAq07MTFb1wWFZk7Q+ptq4 +NxKfKjLji7gh7MMrZQzvIt6IKTtM1/r+t+FHvpw+PoP7UV31aPcuIYXcv/Fa4nzX +xeSDwWrruoBa3lwtcHb4yOWHh8qgnaHlIhInD0Q9HWzq1MKLL295q39QpsQZp6F6 +t5b5wR9iWqJDB0BeJsas7a5wFsWqynKKTbDPAYsDP27X +-----END CERTIFICATE----- # CN=TWCA Global Root CA,OU=Root CA,O=TAIWAN-CA,C=TW # 59769007f7685d0fcd50872f9f95d5755a5b2b457d81f3692b610a98672f0e1b -----BEGIN CERTIFICATE----- From 38a0b5da75b2e8dab0b735296f86d9605e5f92cf Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Mon, 22 Jul 2024 16:52:53 -0400 Subject: [PATCH 32/57] argon2: Avo port of blamka_amd64.s This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="argon2/blamka_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: I3567eb80ef80dff248225f17470122c0a4e6951e Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600315 Reviewed-by: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Roland Shoemaker --- argon2/_asm/blamka_amd64.go | 287 ++++ argon2/_asm/go.mod | 15 + argon2/_asm/go.sum | 12 + argon2/blamka_amd64.s | 2972 ++++++++++++++++++++++++++++++++--- 4 files changed, 3074 insertions(+), 212 deletions(-) create mode 100644 argon2/_asm/blamka_amd64.go create mode 100644 argon2/_asm/go.mod create mode 100644 argon2/_asm/go.sum diff --git a/argon2/_asm/blamka_amd64.go b/argon2/_asm/blamka_amd64.go new file mode 100644 index 0000000000..17a1e7629a --- /dev/null +++ b/argon2/_asm/blamka_amd64.go @@ -0,0 +1,287 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/argon2" +) + +//go:generate go run . -out ../blamka_amd64.s -pkg argon2 + +func main() { + Package("golang.org/x/crypto/argon2") + ConstraintExpr("amd64,gc,!purego") + + blamkaSSE4() + mixBlocksSSE2() + xorBlocksSSE2() + Generate() +} + +func blamkaSSE4() { + Implement("blamkaSSE4") + Attributes(NOSPLIT) + AllocLocal(0) + + Load(Param("b"), RAX) + + c40 := c40_DATA() + c48 := c48_DATA() + + MOVOU(c40, X10) + MOVOU(c48, X11) + + BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11) + + BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11) + RET() +} + +func mixBlocksSSE2() { + Implement("mixBlocksSSE2") + Attributes(NOSPLIT) + AllocLocal(0) + + Load(Param("out"), RDX) + Load(Param("a"), RAX) + Load(Param("b"), RBX) + Load(Param("c"), RCX) + MOVQ(U32(128), RDI) + + Label("loop") + MOVOU(Mem{Base: AX}.Offset(0), X0) + MOVOU(Mem{Base: BX}.Offset(0), X1) + MOVOU(Mem{Base: CX}.Offset(0), X2) + PXOR(X1, X0) + PXOR(X2, X0) + MOVOU(X0, Mem{Base: DX}.Offset(0)) + ADDQ(Imm(16), RAX) + ADDQ(Imm(16), RBX) + ADDQ(Imm(16), RCX) + ADDQ(Imm(16), RDX) + SUBQ(Imm(2), RDI) + JA(LabelRef("loop")) + RET() +} + +func xorBlocksSSE2() { + Implement("xorBlocksSSE2") + Attributes(NOSPLIT) + AllocLocal(0) + + Load(Param("out"), RDX) + Load(Param("a"), RAX) + Load(Param("b"), RBX) + Load(Param("c"), RCX) + MOVQ(U32(128), RDI) + + Label("loop") + MOVOU(Mem{Base: AX}.Offset(0), X0) + MOVOU(Mem{Base: BX}.Offset(0), X1) + MOVOU(Mem{Base: CX}.Offset(0), X2) + MOVOU(Mem{Base: DX}.Offset(0), X3) + PXOR(X1, X0) + PXOR(X2, X0) + PXOR(X3, X0) + MOVOU(X0, Mem{Base: DX}.Offset(0)) + ADDQ(Imm(16), RAX) + ADDQ(Imm(16), RBX) + ADDQ(Imm(16), RCX) + ADDQ(Imm(16), RDX) + SUBQ(Imm(2), RDI) + JA(LabelRef("loop")) + RET() +} + +func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v6, t1) + PUNPCKLQDQ(v6, t2) + PUNPCKHQDQ(v7, v6) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(v7, t2) + MOVO(t1, v7) + MOVO(v2, t1) + PUNPCKHQDQ(t2, v7) + PUNPCKLQDQ(v3, t2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v3) +} + +func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v2, t1) + PUNPCKLQDQ(v2, t2) + PUNPCKHQDQ(v3, v2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(v3, t2) + MOVO(t1, v3) + MOVO(v6, t1) + PUNPCKHQDQ(t2, v3) + PUNPCKLQDQ(v7, t2) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v7) +} + +func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48 VecPhysical) { + MOVO(v0, t0) + PMULULQ(v2, t0) + PADDQ(v2, v0) + PADDQ(t0, v0) + PADDQ(t0, v0) + PXOR(v0, v6) + PSHUFD(Imm(0xB1), v6, v6) + MOVO(v4, t0) + PMULULQ(v6, t0) + PADDQ(v6, v4) + PADDQ(t0, v4) + PADDQ(t0, v4) + PXOR(v4, v2) + PSHUFB(c40, v2) + MOVO(v0, t0) + PMULULQ(v2, t0) + PADDQ(v2, v0) + PADDQ(t0, v0) + PADDQ(t0, v0) + PXOR(v0, v6) + PSHUFB(c48, v6) + MOVO(v4, t0) + PMULULQ(v6, t0) + PADDQ(v6, v4) + PADDQ(t0, v4) + PADDQ(t0, v4) + PXOR(v4, v2) + MOVO(v2, t0) + PADDQ(v2, t0) + PSRLQ(Imm(63), v2) + PXOR(t0, v2) + MOVO(v1, t0) + PMULULQ(v3, t0) + PADDQ(v3, v1) + PADDQ(t0, v1) + PADDQ(t0, v1) + PXOR(v1, v7) + PSHUFD(Imm(0xB1), v7, v7) + MOVO(v5, t0) + PMULULQ(v7, t0) + PADDQ(v7, v5) + PADDQ(t0, v5) + PADDQ(t0, v5) + PXOR(v5, v3) + PSHUFB(c40, v3) + MOVO(v1, t0) + PMULULQ(v3, t0) + PADDQ(v3, v1) + PADDQ(t0, v1) + PADDQ(t0, v1) + PXOR(v1, v7) + PSHUFB(c48, v7) + MOVO(v5, t0) + PMULULQ(v7, t0) + PADDQ(v7, v5) + PADDQ(t0, v5) + PADDQ(t0, v5) + PXOR(v5, v3) + MOVO(v3, t0) + PADDQ(v3, t0) + PSRLQ(Imm(63), v3) + PXOR(t0, v3) +} + +func LOAD_MSG_0(block GPPhysical, off int) { + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} + for i, r := range registers { + MOVOU(Mem{Base: block}.Offset(8*(off+(i*2))), r) + } +} + +func STORE_MSG_0(block GPPhysical, off int) { + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} + for i, r := range registers { + MOVOU(r, Mem{Base: block}.Offset(8*(off+(i*2)))) + } +} + +func LOAD_MSG_1(block GPPhysical, off int) { + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} + for i, r := range registers { + MOVOU(Mem{Base: block}.Offset(8*off+i*16*8), r) + } +} + +func STORE_MSG_1(block GPPhysical, off int) { + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} + for i, r := range registers { + MOVOU(r, Mem{Base: block}.Offset(8*off+i*16*8)) + } +} + +func BLAMKA_ROUND_0(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) { + LOAD_MSG_0(block, off) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48) + SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1) + STORE_MSG_0(block, off) +} + +func BLAMKA_ROUND_1(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) { + LOAD_MSG_1(block, off) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48) + SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1) + STORE_MSG_1(block, off) +} + +// ##------------------DATA SECTION-------------------## + +var c40_DATA_ptr, c48_DATA_ptr *Mem + +func c40_DATA() Mem { + if c40_DATA_ptr != nil { + return *c40_DATA_ptr + } + + c40_DATA := GLOBL("·c40", NOPTR|RODATA) + c40_DATA_ptr = &c40_DATA + DATA(0x00, U64(0x0201000706050403)) + DATA(0x08, U64(0x0a09080f0e0d0c0b)) + return c40_DATA +} +func c48_DATA() Mem { + if c48_DATA_ptr != nil { + return *c48_DATA_ptr + } + + c48_DATA := GLOBL("·c48", NOPTR|RODATA) + c48_DATA_ptr = &c48_DATA + DATA(0x00, U64(0x0100070605040302)) + DATA(0x08, U64(0x09080f0e0d0c0b0a)) + return c48_DATA +} diff --git a/argon2/_asm/go.mod b/argon2/_asm/go.mod new file mode 100644 index 0000000000..41a536dd77 --- /dev/null +++ b/argon2/_asm/go.mod @@ -0,0 +1,15 @@ +module argon2/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/argon2/_asm/go.sum b/argon2/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/argon2/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/argon2/blamka_amd64.s b/argon2/blamka_amd64.s index 6713accac0..c3895478ed 100644 --- a/argon2/blamka_amd64.s +++ b/argon2/blamka_amd64.s @@ -1,243 +1,2791 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blamka_amd64.go -out ../blamka_amd64.s -pkg argon2. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \ - MOVO v0, t0; \ - PMULULQ v2, t0; \ - PADDQ v2, v0; \ - PADDQ t0, v0; \ - PADDQ t0, v0; \ - PXOR v0, v6; \ - PSHUFD $0xB1, v6, v6; \ - MOVO v4, t0; \ - PMULULQ v6, t0; \ - PADDQ v6, v4; \ - PADDQ t0, v4; \ - PADDQ t0, v4; \ - PXOR v4, v2; \ - PSHUFB c40, v2; \ - MOVO v0, t0; \ - PMULULQ v2, t0; \ - PADDQ v2, v0; \ - PADDQ t0, v0; \ - PADDQ t0, v0; \ - PXOR v0, v6; \ - PSHUFB c48, v6; \ - MOVO v4, t0; \ - PMULULQ v6, t0; \ - PADDQ v6, v4; \ - PADDQ t0, v4; \ - PADDQ t0, v4; \ - PXOR v4, v2; \ - MOVO v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVO v1, t0; \ - PMULULQ v3, t0; \ - PADDQ v3, v1; \ - PADDQ t0, v1; \ - PADDQ t0, v1; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v7, v7; \ - MOVO v5, t0; \ - PMULULQ v7, t0; \ - PADDQ v7, v5; \ - PADDQ t0, v5; \ - PADDQ t0, v5; \ - PXOR v5, v3; \ - PSHUFB c40, v3; \ - MOVO v1, t0; \ - PMULULQ v3, t0; \ - PADDQ v3, v1; \ - PADDQ t0, v1; \ - PADDQ t0, v1; \ - PXOR v1, v7; \ - PSHUFB c48, v7; \ - MOVO v5, t0; \ - PMULULQ v7, t0; \ - PADDQ v7, v5; \ - PADDQ t0, v5; \ - PADDQ t0, v5; \ - PXOR v5, v3; \ - MOVO v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG_0(block, off) \ - MOVOU 8*(off+0)(block), X0; \ - MOVOU 8*(off+2)(block), X1; \ - MOVOU 8*(off+4)(block), X2; \ - MOVOU 8*(off+6)(block), X3; \ - MOVOU 8*(off+8)(block), X4; \ - MOVOU 8*(off+10)(block), X5; \ - MOVOU 8*(off+12)(block), X6; \ - MOVOU 8*(off+14)(block), X7 - -#define STORE_MSG_0(block, off) \ - MOVOU X0, 8*(off+0)(block); \ - MOVOU X1, 8*(off+2)(block); \ - MOVOU X2, 8*(off+4)(block); \ - MOVOU X3, 8*(off+6)(block); \ - MOVOU X4, 8*(off+8)(block); \ - MOVOU X5, 8*(off+10)(block); \ - MOVOU X6, 8*(off+12)(block); \ - MOVOU X7, 8*(off+14)(block) - -#define LOAD_MSG_1(block, off) \ - MOVOU 8*off+0*8(block), X0; \ - MOVOU 8*off+16*8(block), X1; \ - MOVOU 8*off+32*8(block), X2; \ - MOVOU 8*off+48*8(block), X3; \ - MOVOU 8*off+64*8(block), X4; \ - MOVOU 8*off+80*8(block), X5; \ - MOVOU 8*off+96*8(block), X6; \ - MOVOU 8*off+112*8(block), X7 - -#define STORE_MSG_1(block, off) \ - MOVOU X0, 8*off+0*8(block); \ - MOVOU X1, 8*off+16*8(block); \ - MOVOU X2, 8*off+32*8(block); \ - MOVOU X3, 8*off+48*8(block); \ - MOVOU X4, 8*off+64*8(block); \ - MOVOU X5, 8*off+80*8(block); \ - MOVOU X6, 8*off+96*8(block); \ - MOVOU X7, 8*off+112*8(block) - -#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \ - LOAD_MSG_0(block, off); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ - STORE_MSG_0(block, off) - -#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \ - LOAD_MSG_1(block, off); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ - STORE_MSG_1(block, off) - // func blamkaSSE4(b *block) -TEXT ·blamkaSSE4(SB), 4, $0-8 - MOVQ b+0(FP), AX - - MOVOU ·c40<>(SB), X10 - MOVOU ·c48<>(SB), X11 +// Requires: SSE2, SSSE3 +TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8 + MOVQ b+0(FP), AX + MOVOU ·c40<>+0(SB), X10 + MOVOU ·c48<>+0(SB), X11 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + MOVOU X6, 96(AX) + MOVOU X7, 112(AX) + MOVOU 128(AX), X0 + MOVOU 144(AX), X1 + MOVOU 160(AX), X2 + MOVOU 176(AX), X3 + MOVOU 192(AX), X4 + MOVOU 208(AX), X5 + MOVOU 224(AX), X6 + MOVOU 240(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 128(AX) + MOVOU X1, 144(AX) + MOVOU X2, 160(AX) + MOVOU X3, 176(AX) + MOVOU X4, 192(AX) + MOVOU X5, 208(AX) + MOVOU X6, 224(AX) + MOVOU X7, 240(AX) + MOVOU 256(AX), X0 + MOVOU 272(AX), X1 + MOVOU 288(AX), X2 + MOVOU 304(AX), X3 + MOVOU 320(AX), X4 + MOVOU 336(AX), X5 + MOVOU 352(AX), X6 + MOVOU 368(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 256(AX) + MOVOU X1, 272(AX) + MOVOU X2, 288(AX) + MOVOU X3, 304(AX) + MOVOU X4, 320(AX) + MOVOU X5, 336(AX) + MOVOU X6, 352(AX) + MOVOU X7, 368(AX) + MOVOU 384(AX), X0 + MOVOU 400(AX), X1 + MOVOU 416(AX), X2 + MOVOU 432(AX), X3 + MOVOU 448(AX), X4 + MOVOU 464(AX), X5 + MOVOU 480(AX), X6 + MOVOU 496(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 384(AX) + MOVOU X1, 400(AX) + MOVOU X2, 416(AX) + MOVOU X3, 432(AX) + MOVOU X4, 448(AX) + MOVOU X5, 464(AX) + MOVOU X6, 480(AX) + MOVOU X7, 496(AX) + MOVOU 512(AX), X0 + MOVOU 528(AX), X1 + MOVOU 544(AX), X2 + MOVOU 560(AX), X3 + MOVOU 576(AX), X4 + MOVOU 592(AX), X5 + MOVOU 608(AX), X6 + MOVOU 624(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 512(AX) + MOVOU X1, 528(AX) + MOVOU X2, 544(AX) + MOVOU X3, 560(AX) + MOVOU X4, 576(AX) + MOVOU X5, 592(AX) + MOVOU X6, 608(AX) + MOVOU X7, 624(AX) + MOVOU 640(AX), X0 + MOVOU 656(AX), X1 + MOVOU 672(AX), X2 + MOVOU 688(AX), X3 + MOVOU 704(AX), X4 + MOVOU 720(AX), X5 + MOVOU 736(AX), X6 + MOVOU 752(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 640(AX) + MOVOU X1, 656(AX) + MOVOU X2, 672(AX) + MOVOU X3, 688(AX) + MOVOU X4, 704(AX) + MOVOU X5, 720(AX) + MOVOU X6, 736(AX) + MOVOU X7, 752(AX) + MOVOU 768(AX), X0 + MOVOU 784(AX), X1 + MOVOU 800(AX), X2 + MOVOU 816(AX), X3 + MOVOU 832(AX), X4 + MOVOU 848(AX), X5 + MOVOU 864(AX), X6 + MOVOU 880(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 768(AX) + MOVOU X1, 784(AX) + MOVOU X2, 800(AX) + MOVOU X3, 816(AX) + MOVOU X4, 832(AX) + MOVOU X5, 848(AX) + MOVOU X6, 864(AX) + MOVOU X7, 880(AX) + MOVOU 896(AX), X0 + MOVOU 912(AX), X1 + MOVOU 928(AX), X2 + MOVOU 944(AX), X3 + MOVOU 960(AX), X4 + MOVOU 976(AX), X5 + MOVOU 992(AX), X6 + MOVOU 1008(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 896(AX) + MOVOU X1, 912(AX) + MOVOU X2, 928(AX) + MOVOU X3, 944(AX) + MOVOU X4, 960(AX) + MOVOU X5, 976(AX) + MOVOU X6, 992(AX) + MOVOU X7, 1008(AX) + MOVOU (AX), X0 + MOVOU 128(AX), X1 + MOVOU 256(AX), X2 + MOVOU 384(AX), X3 + MOVOU 512(AX), X4 + MOVOU 640(AX), X5 + MOVOU 768(AX), X6 + MOVOU 896(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, (AX) + MOVOU X1, 128(AX) + MOVOU X2, 256(AX) + MOVOU X3, 384(AX) + MOVOU X4, 512(AX) + MOVOU X5, 640(AX) + MOVOU X6, 768(AX) + MOVOU X7, 896(AX) + MOVOU 16(AX), X0 + MOVOU 144(AX), X1 + MOVOU 272(AX), X2 + MOVOU 400(AX), X3 + MOVOU 528(AX), X4 + MOVOU 656(AX), X5 + MOVOU 784(AX), X6 + MOVOU 912(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 16(AX) + MOVOU X1, 144(AX) + MOVOU X2, 272(AX) + MOVOU X3, 400(AX) + MOVOU X4, 528(AX) + MOVOU X5, 656(AX) + MOVOU X6, 784(AX) + MOVOU X7, 912(AX) + MOVOU 32(AX), X0 + MOVOU 160(AX), X1 + MOVOU 288(AX), X2 + MOVOU 416(AX), X3 + MOVOU 544(AX), X4 + MOVOU 672(AX), X5 + MOVOU 800(AX), X6 + MOVOU 928(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 32(AX) + MOVOU X1, 160(AX) + MOVOU X2, 288(AX) + MOVOU X3, 416(AX) + MOVOU X4, 544(AX) + MOVOU X5, 672(AX) + MOVOU X6, 800(AX) + MOVOU X7, 928(AX) + MOVOU 48(AX), X0 + MOVOU 176(AX), X1 + MOVOU 304(AX), X2 + MOVOU 432(AX), X3 + MOVOU 560(AX), X4 + MOVOU 688(AX), X5 + MOVOU 816(AX), X6 + MOVOU 944(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 48(AX) + MOVOU X1, 176(AX) + MOVOU X2, 304(AX) + MOVOU X3, 432(AX) + MOVOU X4, 560(AX) + MOVOU X5, 688(AX) + MOVOU X6, 816(AX) + MOVOU X7, 944(AX) + MOVOU 64(AX), X0 + MOVOU 192(AX), X1 + MOVOU 320(AX), X2 + MOVOU 448(AX), X3 + MOVOU 576(AX), X4 + MOVOU 704(AX), X5 + MOVOU 832(AX), X6 + MOVOU 960(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 64(AX) + MOVOU X1, 192(AX) + MOVOU X2, 320(AX) + MOVOU X3, 448(AX) + MOVOU X4, 576(AX) + MOVOU X5, 704(AX) + MOVOU X6, 832(AX) + MOVOU X7, 960(AX) + MOVOU 80(AX), X0 + MOVOU 208(AX), X1 + MOVOU 336(AX), X2 + MOVOU 464(AX), X3 + MOVOU 592(AX), X4 + MOVOU 720(AX), X5 + MOVOU 848(AX), X6 + MOVOU 976(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 80(AX) + MOVOU X1, 208(AX) + MOVOU X2, 336(AX) + MOVOU X3, 464(AX) + MOVOU X4, 592(AX) + MOVOU X5, 720(AX) + MOVOU X6, 848(AX) + MOVOU X7, 976(AX) + MOVOU 96(AX), X0 + MOVOU 224(AX), X1 + MOVOU 352(AX), X2 + MOVOU 480(AX), X3 + MOVOU 608(AX), X4 + MOVOU 736(AX), X5 + MOVOU 864(AX), X6 + MOVOU 992(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 96(AX) + MOVOU X1, 224(AX) + MOVOU X2, 352(AX) + MOVOU X3, 480(AX) + MOVOU X4, 608(AX) + MOVOU X5, 736(AX) + MOVOU X6, 864(AX) + MOVOU X7, 992(AX) + MOVOU 112(AX), X0 + MOVOU 240(AX), X1 + MOVOU 368(AX), X2 + MOVOU 496(AX), X3 + MOVOU 624(AX), X4 + MOVOU 752(AX), X5 + MOVOU 880(AX), X6 + MOVOU 1008(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 112(AX) + MOVOU X1, 240(AX) + MOVOU X2, 368(AX) + MOVOU X3, 496(AX) + MOVOU X4, 624(AX) + MOVOU X5, 752(AX) + MOVOU X6, 880(AX) + MOVOU X7, 1008(AX) + RET - BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11) +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 - BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11) - RET +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 -// func mixBlocksSSE2(out, a, b, c *block) -TEXT ·mixBlocksSSE2(SB), 4, $0-32 +// func mixBlocksSSE2(out *block, a *block, b *block, c *block) +// Requires: SSE2 +TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX MOVQ c+24(FP), CX - MOVQ $128, DI + MOVQ $0x00000080, DI loop: - MOVOU 0(AX), X0 - MOVOU 0(BX), X1 - MOVOU 0(CX), X2 + MOVOU (AX), X0 + MOVOU (BX), X1 + MOVOU (CX), X2 PXOR X1, X0 PXOR X2, X0 - MOVOU X0, 0(DX) - ADDQ $16, AX - ADDQ $16, BX - ADDQ $16, CX - ADDQ $16, DX - SUBQ $2, DI + MOVOU X0, (DX) + ADDQ $0x10, AX + ADDQ $0x10, BX + ADDQ $0x10, CX + ADDQ $0x10, DX + SUBQ $0x02, DI JA loop RET -// func xorBlocksSSE2(out, a, b, c *block) -TEXT ·xorBlocksSSE2(SB), 4, $0-32 +// func xorBlocksSSE2(out *block, a *block, b *block, c *block) +// Requires: SSE2 +TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX MOVQ c+24(FP), CX - MOVQ $128, DI + MOVQ $0x00000080, DI loop: - MOVOU 0(AX), X0 - MOVOU 0(BX), X1 - MOVOU 0(CX), X2 - MOVOU 0(DX), X3 + MOVOU (AX), X0 + MOVOU (BX), X1 + MOVOU (CX), X2 + MOVOU (DX), X3 PXOR X1, X0 PXOR X2, X0 PXOR X3, X0 - MOVOU X0, 0(DX) - ADDQ $16, AX - ADDQ $16, BX - ADDQ $16, CX - ADDQ $16, DX - SUBQ $2, DI + MOVOU X0, (DX) + ADDQ $0x10, AX + ADDQ $0x10, BX + ADDQ $0x10, CX + ADDQ $0x10, DX + SUBQ $0x02, DI JA loop RET From 38ed1bc0ecdc7fbeadbc7337dc44667688eac800 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Thu, 25 Jul 2024 22:25:00 -0400 Subject: [PATCH 33/57] blake2s: port blake2s_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="blake2s/blake2s_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: Ica8bf9f0b42dc93714aa54e783fa74ed19e6b9f4 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/601216 Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Filippo Valsorda --- blake2s/_asm/blake2s_amd64_asm.go | 525 ++++++ blake2s/_asm/go.mod | 15 + blake2s/_asm/go.sum | 12 + blake2s/blake2s_amd64.s | 2571 ++++++++++++++++++++++++----- 4 files changed, 2708 insertions(+), 415 deletions(-) create mode 100644 blake2s/_asm/blake2s_amd64_asm.go create mode 100644 blake2s/_asm/go.mod create mode 100644 blake2s/_asm/go.sum diff --git a/blake2s/_asm/blake2s_amd64_asm.go b/blake2s/_asm/blake2s_amd64_asm.go new file mode 100644 index 0000000000..48ddd6118a --- /dev/null +++ b/blake2s/_asm/blake2s_amd64_asm.go @@ -0,0 +1,525 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/blake2s" +) + +//go:generate go run . -out ../blake2s_amd64.s -pkg blake2s + +func main() { + Package("golang.org/x/crypto/blake2s") + ConstraintExpr("amd64,gc,!purego") + hashBlocksSSE2() + hashBlocksSSSE3() + hashBlocksSSE4() + Generate() +} + +func ROTL_SSE2(n uint64, t, v VecPhysical) { + MOVO(v, t) + PSLLL(Imm(n), t) + PSRLL(Imm(32-n), v) + PXOR(t, v) +} + +func ROTL_SSSE3(c, v VecPhysical) { + PSHUFB(c, v) +} + +func ROUND_SSE2(v0, v1, v2, v3 VecPhysical, m0, m1, m2, m3 Mem, t VecPhysical) { + PADDL(m0, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSE2(16, t, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(20, t, v1) + PADDL(m1, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSE2(24, t, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(25, t, v1) + PSHUFL(Imm(0x39), v1, v1) + PSHUFL(Imm(0x4E), v2, v2) + PSHUFL(Imm(0x93), v3, v3) + PADDL(m2, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSE2(16, t, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(20, t, v1) + PADDL(m3, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSE2(24, t, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(25, t, v1) + PSHUFL(Imm(0x39), v3, v3) + PSHUFL(Imm(0x4E), v2, v2) + PSHUFL(Imm(0x93), v1, v1) +} + +func ROUND_SSSE3(v0, v1, v2, v3 VecPhysical, m0, m1, m2, m3 Op, t, c16, c8 VecPhysical) { + PADDL(m0, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSSE3(c16, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(20, t, v1) + PADDL(m1, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSSE3(c8, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(25, t, v1) + PSHUFL(Imm(0x39), v1, v1) + PSHUFL(Imm(0x4E), v2, v2) + PSHUFL(Imm(0x93), v3, v3) + PADDL(m2, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSSE3(c16, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(20, t, v1) + PADDL(m3, v0) + PADDL(v1, v0) + PXOR(v0, v3) + ROTL_SSSE3(c8, v3) + PADDL(v3, v2) + PXOR(v2, v1) + ROTL_SSE2(25, t, v1) + PSHUFL(Imm(0x39), v3, v3) + PSHUFL(Imm(0x4E), v2, v2) + PSHUFL(Imm(0x93), v1, v1) +} + +func LOAD_MSG_SSE4(m0, m1, m2, m3 VecPhysical, src GPPhysical, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 int) { + // Hack to get Avo to emit a MOVL instruction with a VecPhysical as the destination + Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{Mem{Base: src}.Offset(i0 * 4), m0}}) + PINSRD(Imm(1), Mem{Base: src}.Offset(i1*4), m0) + PINSRD(Imm(2), Mem{Base: src}.Offset(i2*4), m0) + PINSRD(Imm(3), Mem{Base: src}.Offset(i3*4), m0) + Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{Mem{Base: src}.Offset(i4 * 4), m1}}) + PINSRD(Imm(1), Mem{Base: src}.Offset(i5*4), m1) + PINSRD(Imm(2), Mem{Base: src}.Offset(i6*4), m1) + PINSRD(Imm(3), Mem{Base: src}.Offset(i7*4), m1) + Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{Mem{Base: src}.Offset(i8 * 4), m2}}) + PINSRD(Imm(1), Mem{Base: src}.Offset(i9*4), m2) + PINSRD(Imm(2), Mem{Base: src}.Offset(i10*4), m2) + PINSRD(Imm(3), Mem{Base: src}.Offset(i11*4), m2) + Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{Mem{Base: src}.Offset(i12 * 4), m3}}) + PINSRD(Imm(1), Mem{Base: src}.Offset(i13*4), m3) + PINSRD(Imm(2), Mem{Base: src}.Offset(i14*4), m3) + PINSRD(Imm(3), Mem{Base: src}.Offset(i15*4), m3) +} + +func PRECOMPUTE_MSG(dst GPPhysical, off int, src, R8, R9, R10, R11, R12, R13, R14, R15 GPPhysical) { + MOVQ(Mem{Base: src}.Offset(0*4), R8) + MOVQ(Mem{Base: src}.Offset(2*4), R9) + MOVQ(Mem{Base: src}.Offset(4*4), R10) + MOVQ(Mem{Base: src}.Offset(6*4), R11) + MOVQ(Mem{Base: src}.Offset(8*4), R12) + MOVQ(Mem{Base: src}.Offset(10*4), R13) + MOVQ(Mem{Base: src}.Offset(12*4), R14) + MOVQ(Mem{Base: src}.Offset(14*4), R15) + + MOVL(R8L, Mem{Base: dst}.Offset(0*4+off+0)) + MOVL(R8L, Mem{Base: dst}.Offset(9*4+off+64)) + MOVL(R8L, Mem{Base: dst}.Offset(5*4+off+128)) + MOVL(R8L, Mem{Base: dst}.Offset(14*4+off+192)) + MOVL(R8L, Mem{Base: dst}.Offset(4*4+off+256)) + MOVL(R8L, Mem{Base: dst}.Offset(2*4+off+320)) + MOVL(R8L, Mem{Base: dst}.Offset(8*4+off+384)) + MOVL(R8L, Mem{Base: dst}.Offset(12*4+off+448)) + MOVL(R8L, Mem{Base: dst}.Offset(3*4+off+512)) + MOVL(R8L, Mem{Base: dst}.Offset(15*4+off+576)) + SHRQ(Imm(32), R8) + MOVL(R8L, Mem{Base: dst}.Offset(4*4+off+0)) + MOVL(R8L, Mem{Base: dst}.Offset(8*4+off+64)) + MOVL(R8L, Mem{Base: dst}.Offset(14*4+off+128)) + MOVL(R8L, Mem{Base: dst}.Offset(5*4+off+192)) + MOVL(R8L, Mem{Base: dst}.Offset(12*4+off+256)) + MOVL(R8L, Mem{Base: dst}.Offset(11*4+off+320)) + MOVL(R8L, Mem{Base: dst}.Offset(1*4+off+384)) + MOVL(R8L, Mem{Base: dst}.Offset(6*4+off+448)) + MOVL(R8L, Mem{Base: dst}.Offset(10*4+off+512)) + MOVL(R8L, Mem{Base: dst}.Offset(3*4+off+576)) + + MOVL(R9L, Mem{Base: dst}.Offset(1*4+off+0)) + MOVL(R9L, Mem{Base: dst}.Offset(13*4+off+64)) + MOVL(R9L, Mem{Base: dst}.Offset(6*4+off+128)) + MOVL(R9L, Mem{Base: dst}.Offset(8*4+off+192)) + MOVL(R9L, Mem{Base: dst}.Offset(2*4+off+256)) + MOVL(R9L, Mem{Base: dst}.Offset(0*4+off+320)) + MOVL(R9L, Mem{Base: dst}.Offset(14*4+off+384)) + MOVL(R9L, Mem{Base: dst}.Offset(11*4+off+448)) + MOVL(R9L, Mem{Base: dst}.Offset(12*4+off+512)) + MOVL(R9L, Mem{Base: dst}.Offset(4*4+off+576)) + SHRQ(Imm(32), R9) + MOVL(R9L, Mem{Base: dst}.Offset(5*4+off+0)) + MOVL(R9L, Mem{Base: dst}.Offset(15*4+off+64)) + MOVL(R9L, Mem{Base: dst}.Offset(9*4+off+128)) + MOVL(R9L, Mem{Base: dst}.Offset(1*4+off+192)) + MOVL(R9L, Mem{Base: dst}.Offset(11*4+off+256)) + MOVL(R9L, Mem{Base: dst}.Offset(7*4+off+320)) + MOVL(R9L, Mem{Base: dst}.Offset(13*4+off+384)) + MOVL(R9L, Mem{Base: dst}.Offset(3*4+off+448)) + MOVL(R9L, Mem{Base: dst}.Offset(6*4+off+512)) + MOVL(R9L, Mem{Base: dst}.Offset(10*4+off+576)) + + MOVL(R10L, Mem{Base: dst}.Offset(2*4+off+0)) + MOVL(R10L, Mem{Base: dst}.Offset(1*4+off+64)) + MOVL(R10L, Mem{Base: dst}.Offset(15*4+off+128)) + MOVL(R10L, Mem{Base: dst}.Offset(10*4+off+192)) + MOVL(R10L, Mem{Base: dst}.Offset(6*4+off+256)) + MOVL(R10L, Mem{Base: dst}.Offset(8*4+off+320)) + MOVL(R10L, Mem{Base: dst}.Offset(3*4+off+384)) + MOVL(R10L, Mem{Base: dst}.Offset(13*4+off+448)) + MOVL(R10L, Mem{Base: dst}.Offset(14*4+off+512)) + MOVL(R10L, Mem{Base: dst}.Offset(5*4+off+576)) + SHRQ(Imm(32), R10) + MOVL(R10L, Mem{Base: dst}.Offset(6*4+off+0)) + MOVL(R10L, Mem{Base: dst}.Offset(11*4+off+64)) + MOVL(R10L, Mem{Base: dst}.Offset(2*4+off+128)) + MOVL(R10L, Mem{Base: dst}.Offset(9*4+off+192)) + MOVL(R10L, Mem{Base: dst}.Offset(1*4+off+256)) + MOVL(R10L, Mem{Base: dst}.Offset(13*4+off+320)) + MOVL(R10L, Mem{Base: dst}.Offset(4*4+off+384)) + MOVL(R10L, Mem{Base: dst}.Offset(8*4+off+448)) + MOVL(R10L, Mem{Base: dst}.Offset(15*4+off+512)) + MOVL(R10L, Mem{Base: dst}.Offset(7*4+off+576)) + + MOVL(R11L, Mem{Base: dst}.Offset(3*4+off+0)) + MOVL(R11L, Mem{Base: dst}.Offset(7*4+off+64)) + MOVL(R11L, Mem{Base: dst}.Offset(13*4+off+128)) + MOVL(R11L, Mem{Base: dst}.Offset(12*4+off+192)) + MOVL(R11L, Mem{Base: dst}.Offset(10*4+off+256)) + MOVL(R11L, Mem{Base: dst}.Offset(1*4+off+320)) + MOVL(R11L, Mem{Base: dst}.Offset(9*4+off+384)) + MOVL(R11L, Mem{Base: dst}.Offset(14*4+off+448)) + MOVL(R11L, Mem{Base: dst}.Offset(0*4+off+512)) + MOVL(R11L, Mem{Base: dst}.Offset(6*4+off+576)) + SHRQ(Imm(32), R11) + MOVL(R11L, Mem{Base: dst}.Offset(7*4+off+0)) + MOVL(R11L, Mem{Base: dst}.Offset(14*4+off+64)) + MOVL(R11L, Mem{Base: dst}.Offset(10*4+off+128)) + MOVL(R11L, Mem{Base: dst}.Offset(0*4+off+192)) + MOVL(R11L, Mem{Base: dst}.Offset(5*4+off+256)) + MOVL(R11L, Mem{Base: dst}.Offset(9*4+off+320)) + MOVL(R11L, Mem{Base: dst}.Offset(12*4+off+384)) + MOVL(R11L, Mem{Base: dst}.Offset(1*4+off+448)) + MOVL(R11L, Mem{Base: dst}.Offset(13*4+off+512)) + MOVL(R11L, Mem{Base: dst}.Offset(2*4+off+576)) + + MOVL(R12L, Mem{Base: dst}.Offset(8*4+off+0)) + MOVL(R12L, Mem{Base: dst}.Offset(5*4+off+64)) + MOVL(R12L, Mem{Base: dst}.Offset(4*4+off+128)) + MOVL(R12L, Mem{Base: dst}.Offset(15*4+off+192)) + MOVL(R12L, Mem{Base: dst}.Offset(14*4+off+256)) + MOVL(R12L, Mem{Base: dst}.Offset(3*4+off+320)) + MOVL(R12L, Mem{Base: dst}.Offset(11*4+off+384)) + MOVL(R12L, Mem{Base: dst}.Offset(10*4+off+448)) + MOVL(R12L, Mem{Base: dst}.Offset(7*4+off+512)) + MOVL(R12L, Mem{Base: dst}.Offset(1*4+off+576)) + SHRQ(Imm(32), R12) + MOVL(R12L, Mem{Base: dst}.Offset(12*4+off+0)) + MOVL(R12L, Mem{Base: dst}.Offset(2*4+off+64)) + MOVL(R12L, Mem{Base: dst}.Offset(11*4+off+128)) + MOVL(R12L, Mem{Base: dst}.Offset(4*4+off+192)) + MOVL(R12L, Mem{Base: dst}.Offset(0*4+off+256)) + MOVL(R12L, Mem{Base: dst}.Offset(15*4+off+320)) + MOVL(R12L, Mem{Base: dst}.Offset(10*4+off+384)) + MOVL(R12L, Mem{Base: dst}.Offset(7*4+off+448)) + MOVL(R12L, Mem{Base: dst}.Offset(5*4+off+512)) + MOVL(R12L, Mem{Base: dst}.Offset(9*4+off+576)) + + MOVL(R13L, Mem{Base: dst}.Offset(9*4+off+0)) + MOVL(R13L, Mem{Base: dst}.Offset(4*4+off+64)) + MOVL(R13L, Mem{Base: dst}.Offset(8*4+off+128)) + MOVL(R13L, Mem{Base: dst}.Offset(13*4+off+192)) + MOVL(R13L, Mem{Base: dst}.Offset(3*4+off+256)) + MOVL(R13L, Mem{Base: dst}.Offset(5*4+off+320)) + MOVL(R13L, Mem{Base: dst}.Offset(7*4+off+384)) + MOVL(R13L, Mem{Base: dst}.Offset(15*4+off+448)) + MOVL(R13L, Mem{Base: dst}.Offset(11*4+off+512)) + MOVL(R13L, Mem{Base: dst}.Offset(0*4+off+576)) + SHRQ(Imm(32), R13) + MOVL(R13L, Mem{Base: dst}.Offset(13*4+off+0)) + MOVL(R13L, Mem{Base: dst}.Offset(10*4+off+64)) + MOVL(R13L, Mem{Base: dst}.Offset(0*4+off+128)) + MOVL(R13L, Mem{Base: dst}.Offset(3*4+off+192)) + MOVL(R13L, Mem{Base: dst}.Offset(9*4+off+256)) + MOVL(R13L, Mem{Base: dst}.Offset(6*4+off+320)) + MOVL(R13L, Mem{Base: dst}.Offset(15*4+off+384)) + MOVL(R13L, Mem{Base: dst}.Offset(4*4+off+448)) + MOVL(R13L, Mem{Base: dst}.Offset(2*4+off+512)) + MOVL(R13L, Mem{Base: dst}.Offset(12*4+off+576)) + + MOVL(R14L, Mem{Base: dst}.Offset(10*4+off+0)) + MOVL(R14L, Mem{Base: dst}.Offset(12*4+off+64)) + MOVL(R14L, Mem{Base: dst}.Offset(1*4+off+128)) + MOVL(R14L, Mem{Base: dst}.Offset(6*4+off+192)) + MOVL(R14L, Mem{Base: dst}.Offset(13*4+off+256)) + MOVL(R14L, Mem{Base: dst}.Offset(4*4+off+320)) + MOVL(R14L, Mem{Base: dst}.Offset(0*4+off+384)) + MOVL(R14L, Mem{Base: dst}.Offset(2*4+off+448)) + MOVL(R14L, Mem{Base: dst}.Offset(8*4+off+512)) + MOVL(R14L, Mem{Base: dst}.Offset(14*4+off+576)) + SHRQ(Imm(32), R14) + MOVL(R14L, Mem{Base: dst}.Offset(14*4+off+0)) + MOVL(R14L, Mem{Base: dst}.Offset(3*4+off+64)) + MOVL(R14L, Mem{Base: dst}.Offset(7*4+off+128)) + MOVL(R14L, Mem{Base: dst}.Offset(2*4+off+192)) + MOVL(R14L, Mem{Base: dst}.Offset(15*4+off+256)) + MOVL(R14L, Mem{Base: dst}.Offset(12*4+off+320)) + MOVL(R14L, Mem{Base: dst}.Offset(6*4+off+384)) + MOVL(R14L, Mem{Base: dst}.Offset(0*4+off+448)) + MOVL(R14L, Mem{Base: dst}.Offset(9*4+off+512)) + MOVL(R14L, Mem{Base: dst}.Offset(11*4+off+576)) + + MOVL(R15L, Mem{Base: dst}.Offset(11*4+off+0)) + MOVL(R15L, Mem{Base: dst}.Offset(0*4+off+64)) + MOVL(R15L, Mem{Base: dst}.Offset(12*4+off+128)) + MOVL(R15L, Mem{Base: dst}.Offset(7*4+off+192)) + MOVL(R15L, Mem{Base: dst}.Offset(8*4+off+256)) + MOVL(R15L, Mem{Base: dst}.Offset(14*4+off+320)) + MOVL(R15L, Mem{Base: dst}.Offset(2*4+off+384)) + MOVL(R15L, Mem{Base: dst}.Offset(5*4+off+448)) + MOVL(R15L, Mem{Base: dst}.Offset(1*4+off+512)) + MOVL(R15L, Mem{Base: dst}.Offset(13*4+off+576)) + SHRQ(Imm(32), R15) + MOVL(R15L, Mem{Base: dst}.Offset(15*4+off+0)) + MOVL(R15L, Mem{Base: dst}.Offset(6*4+off+64)) + MOVL(R15L, Mem{Base: dst}.Offset(3*4+off+128)) + MOVL(R15L, Mem{Base: dst}.Offset(11*4+off+192)) + MOVL(R15L, Mem{Base: dst}.Offset(7*4+off+256)) + MOVL(R15L, Mem{Base: dst}.Offset(10*4+off+320)) + MOVL(R15L, Mem{Base: dst}.Offset(5*4+off+384)) + MOVL(R15L, Mem{Base: dst}.Offset(9*4+off+448)) + MOVL(R15L, Mem{Base: dst}.Offset(4*4+off+512)) + MOVL(R15L, Mem{Base: dst}.Offset(8*4+off+576)) +} + +func BLAKE2s_SSE2() { + PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15) + for i := 0; i < 10; i++ { + ROUND_SSE2(X4, X5, X6, X7, Mem{Base: BP}.Offset(16+64*i), Mem{Base: BP}.Offset(32+64*i), Mem{Base: BP}.Offset(48+64*i), Mem{Base: BP}.Offset(64+64*i), X8) + } +} + +func BLAKE2s_SSSE3() { + PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15) + for i := 0; i < 10; i++ { + ROUND_SSSE3(X4, X5, X6, X7, Mem{Base: BP}.Offset(16+64*i), Mem{Base: BP}.Offset(32+64*i), Mem{Base: BP}.Offset(48+64*i), Mem{Base: BP}.Offset(64+64*i), X8, X13, X14) + } +} + +func BLAKE2s_SSE4() { + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0) + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) +} + +func HASH_BLOCKS(h, c, flag, blocks_base, blocks_len Mem, BLAKE2s_FUNC func()) { + MOVQ(h, RAX) + MOVQ(c, RBX) + MOVL(flag, ECX) + MOVQ(blocks_base, RSI) + MOVQ(blocks_len, RDX) + + MOVQ(RSP, RBP) + ADDQ(Imm(15), RBP) + ANDQ(I32(^15), RBP) + + MOVQ(Mem{Base: BX}.Offset(0), R9) + MOVQ(R9, Mem{Base: BP}.Offset(0)) + MOVQ(RCX, Mem{Base: BP}.Offset(8)) + + MOVOU(Mem{Base: AX}.Offset(0), X0) + MOVOU(Mem{Base: AX}.Offset(16), X1) + + iv0 := iv0_DATA() + iv1 := iv1_DATA() + MOVOU(iv0, X2) + MOVOU(iv1, X3) + + counter := counter_DATA() + rol16 := rol16_DATA() + rol8 := rol8_DATA() + MOVOU(counter, X12) + MOVOU(rol16, X13) + MOVOU(rol8, X14) + MOVO(Mem{Base: BP}.Offset(0), X15) + + Label("loop") + MOVO(X0, X4) + MOVO(X1, X5) + MOVO(X2, X6) + MOVO(X3, X7) + + PADDQ(X12, X15) + PXOR(X15, X7) + + BLAKE2s_FUNC() + + PXOR(X4, X0) + PXOR(X5, X1) + PXOR(X6, X0) + PXOR(X7, X1) + + LEAQ(Mem{Base: SI}.Offset(64), RSI) + SUBQ(Imm(64), RDX) + JNE(LabelRef("loop")) + + MOVO(X15, Mem{Base: BP}.Offset(0)) + MOVQ(Mem{Base: BP}.Offset(0), R9) + MOVQ(R9, Mem{Base: BX}.Offset(0)) + + MOVOU(X0, Mem{Base: AX}.Offset(0)) + MOVOU(X1, Mem{Base: AX}.Offset(16)) +} + +func hashBlocksSSE2() { + Implement("hashBlocksSSE2") + Attributes(0) + AllocLocal(672) // frame = 656 + 16 byte alignment + + h := NewParamAddr("h", 0) + c := NewParamAddr("c", 8) + flag := NewParamAddr("flag", 16) + blocks_base := NewParamAddr("blocks_base", 24) + blocks_len := NewParamAddr("blocks_len", 32) + + HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_SSE2) + RET() +} + +func hashBlocksSSSE3() { + Implement("hashBlocksSSSE3") + Attributes(0) + AllocLocal(672) // frame = 656 + 16 byte alignment + + h := NewParamAddr("h", 0) + c := NewParamAddr("c", 8) + flag := NewParamAddr("flag", 16) + blocks_base := NewParamAddr("blocks_base", 24) + blocks_len := NewParamAddr("blocks_len", 32) + + HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_SSSE3) + RET() +} + +func hashBlocksSSE4() { + Implement("hashBlocksSSE4") + Attributes(0) + AllocLocal(32) // frame = 16 + 16 byte alignment + + h := NewParamAddr("h", 0) + c := NewParamAddr("c", 8) + flag := NewParamAddr("flag", 16) + blocks_base := NewParamAddr("blocks_base", 24) + blocks_len := NewParamAddr("blocks_len", 32) + + HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_SSE4) + RET() +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +var iv0_DATA_ptr, iv1_DATA_ptr, rol16_DATA_ptr, rol8_DATA_ptr, counter_DATA_ptr *Mem + +func iv0_DATA() Mem { + if iv0_DATA_ptr != nil { + return *iv0_DATA_ptr + } + + iv0_DATA := GLOBL("iv0", NOPTR|RODATA) + iv0_DATA_ptr = &iv0_DATA + DATA(0x00, U32(0x6a09e667)) + DATA(0x04, U32(0xbb67ae85)) + DATA(0x08, U32(0x3c6ef372)) + DATA(0x0c, U32(0xa54ff53a)) + return iv0_DATA +} + +func iv1_DATA() Mem { + if iv1_DATA_ptr != nil { + return *iv1_DATA_ptr + } + + iv1_DATA := GLOBL("iv1", NOPTR|RODATA) + iv1_DATA_ptr = &iv1_DATA + DATA(0x00, U32(0x510e527f)) + DATA(0x04, U32(0x9b05688c)) + DATA(0x08, U32(0x1f83d9ab)) + DATA(0x0c, U32(0x5be0cd19)) + return iv1_DATA +} + +func rol16_DATA() Mem { + if rol16_DATA_ptr != nil { + return *rol16_DATA_ptr + } + + rol16_DATA := GLOBL("rol16", NOPTR|RODATA) + rol16_DATA_ptr = &rol16_DATA + DATA(0x00, U64(0x0504070601000302)) + DATA(0x08, U64(0x0D0C0F0E09080B0A)) + return rol16_DATA +} + +func rol8_DATA() Mem { + if rol8_DATA_ptr != nil { + return *rol8_DATA_ptr + } + + rol8_DATA := GLOBL("rol8", NOPTR|RODATA) + rol8_DATA_ptr = &rol8_DATA + DATA(0x00, U64(0x0407060500030201)) + DATA(0x08, U64(0x0C0F0E0D080B0A09)) + return rol8_DATA +} + +func counter_DATA() Mem { + if counter_DATA_ptr != nil { + return *counter_DATA_ptr + } + + counter_DATA := GLOBL("counter", NOPTR|RODATA) + counter_DATA_ptr = &counter_DATA + DATA(0x00, U64(0x0000000000000040)) + DATA(0x08, U64(0x0000000000000000)) + return counter_DATA +} diff --git a/blake2s/_asm/go.mod b/blake2s/_asm/go.mod new file mode 100644 index 0000000000..9bb23e0eb1 --- /dev/null +++ b/blake2s/_asm/go.mod @@ -0,0 +1,15 @@ +module blake2s/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/blake2s/_asm/go.sum b/blake2s/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/blake2s/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/blake2s/blake2s_amd64.s b/blake2s/blake2s_amd64.s index fe4b818a33..57d510fc08 100644 --- a/blake2s/blake2s_amd64.s +++ b/blake2s/blake2s_amd64.s @@ -1,432 +1,2173 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2s_amd64_asm.go -out ../blake2s_amd64.s -pkg blake2s. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA iv0<>+0x00(SB)/4, $0x6a09e667 -DATA iv0<>+0x04(SB)/4, $0xbb67ae85 -DATA iv0<>+0x08(SB)/4, $0x3c6ef372 -DATA iv0<>+0x0c(SB)/4, $0xa54ff53a -GLOBL iv0<>(SB), (NOPTR+RODATA), $16 - -DATA iv1<>+0x00(SB)/4, $0x510e527f -DATA iv1<>+0x04(SB)/4, $0x9b05688c -DATA iv1<>+0x08(SB)/4, $0x1f83d9ab -DATA iv1<>+0x0c(SB)/4, $0x5be0cd19 -GLOBL iv1<>(SB), (NOPTR+RODATA), $16 - -DATA rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -GLOBL rol16<>(SB), (NOPTR+RODATA), $16 - -DATA rol8<>+0x00(SB)/8, $0x0407060500030201 -DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 -GLOBL rol8<>(SB), (NOPTR+RODATA), $16 - -DATA counter<>+0x00(SB)/8, $0x40 -DATA counter<>+0x08(SB)/8, $0x0 -GLOBL counter<>(SB), (NOPTR+RODATA), $16 - -#define ROTL_SSE2(n, t, v) \ - MOVO v, t; \ - PSLLL $n, t; \ - PSRLL $(32-n), v; \ - PXOR t, v - -#define ROTL_SSSE3(c, v) \ - PSHUFB c, v - -#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \ - PADDL m0, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(16, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m1, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(24, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v1, v1; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v3, v3; \ - PADDL m2, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(16, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m3, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(24, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v3, v3; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v1, v1 - -#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \ - PADDL m0, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c16, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m1, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c8, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v1, v1; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v3, v3; \ - PADDL m2, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c16, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m3, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c8, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v3, v3; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v1, v1 - - -#define LOAD_MSG_SSE4(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15) \ - MOVL i0*4(src), m0; \ - PINSRD $1, i1*4(src), m0; \ - PINSRD $2, i2*4(src), m0; \ - PINSRD $3, i3*4(src), m0; \ - MOVL i4*4(src), m1; \ - PINSRD $1, i5*4(src), m1; \ - PINSRD $2, i6*4(src), m1; \ - PINSRD $3, i7*4(src), m1; \ - MOVL i8*4(src), m2; \ - PINSRD $1, i9*4(src), m2; \ - PINSRD $2, i10*4(src), m2; \ - PINSRD $3, i11*4(src), m2; \ - MOVL i12*4(src), m3; \ - PINSRD $1, i13*4(src), m3; \ - PINSRD $2, i14*4(src), m3; \ - PINSRD $3, i15*4(src), m3 +// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) +// Requires: SSE2 +TEXT ·hashBlocksSSE2(SB), $672-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVL flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DX + MOVQ SP, BP + ADDQ $0x0f, BP + ANDQ $-16, BP + MOVQ (BX), R9 + MOVQ R9, (BP) + MOVQ CX, 8(BP) + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU iv0<>+0(SB), X2 + MOVOU iv1<>+0(SB), X3 + MOVOU counter<>+0(SB), X12 + MOVOU rol16<>+0(SB), X13 + MOVOU rol8<>+0(SB), X14 + MOVO (BP), X15 -#define PRECOMPUTE_MSG(dst, off, src, R8, R9, R10, R11, R12, R13, R14, R15) \ - MOVQ 0*4(src), R8; \ - MOVQ 2*4(src), R9; \ - MOVQ 4*4(src), R10; \ - MOVQ 6*4(src), R11; \ - MOVQ 8*4(src), R12; \ - MOVQ 10*4(src), R13; \ - MOVQ 12*4(src), R14; \ - MOVQ 14*4(src), R15; \ - \ - MOVL R8, 0*4+off+0(dst); \ - MOVL R8, 9*4+off+64(dst); \ - MOVL R8, 5*4+off+128(dst); \ - MOVL R8, 14*4+off+192(dst); \ - MOVL R8, 4*4+off+256(dst); \ - MOVL R8, 2*4+off+320(dst); \ - MOVL R8, 8*4+off+384(dst); \ - MOVL R8, 12*4+off+448(dst); \ - MOVL R8, 3*4+off+512(dst); \ - MOVL R8, 15*4+off+576(dst); \ - SHRQ $32, R8; \ - MOVL R8, 4*4+off+0(dst); \ - MOVL R8, 8*4+off+64(dst); \ - MOVL R8, 14*4+off+128(dst); \ - MOVL R8, 5*4+off+192(dst); \ - MOVL R8, 12*4+off+256(dst); \ - MOVL R8, 11*4+off+320(dst); \ - MOVL R8, 1*4+off+384(dst); \ - MOVL R8, 6*4+off+448(dst); \ - MOVL R8, 10*4+off+512(dst); \ - MOVL R8, 3*4+off+576(dst); \ - \ - MOVL R9, 1*4+off+0(dst); \ - MOVL R9, 13*4+off+64(dst); \ - MOVL R9, 6*4+off+128(dst); \ - MOVL R9, 8*4+off+192(dst); \ - MOVL R9, 2*4+off+256(dst); \ - MOVL R9, 0*4+off+320(dst); \ - MOVL R9, 14*4+off+384(dst); \ - MOVL R9, 11*4+off+448(dst); \ - MOVL R9, 12*4+off+512(dst); \ - MOVL R9, 4*4+off+576(dst); \ - SHRQ $32, R9; \ - MOVL R9, 5*4+off+0(dst); \ - MOVL R9, 15*4+off+64(dst); \ - MOVL R9, 9*4+off+128(dst); \ - MOVL R9, 1*4+off+192(dst); \ - MOVL R9, 11*4+off+256(dst); \ - MOVL R9, 7*4+off+320(dst); \ - MOVL R9, 13*4+off+384(dst); \ - MOVL R9, 3*4+off+448(dst); \ - MOVL R9, 6*4+off+512(dst); \ - MOVL R9, 10*4+off+576(dst); \ - \ - MOVL R10, 2*4+off+0(dst); \ - MOVL R10, 1*4+off+64(dst); \ - MOVL R10, 15*4+off+128(dst); \ - MOVL R10, 10*4+off+192(dst); \ - MOVL R10, 6*4+off+256(dst); \ - MOVL R10, 8*4+off+320(dst); \ - MOVL R10, 3*4+off+384(dst); \ - MOVL R10, 13*4+off+448(dst); \ - MOVL R10, 14*4+off+512(dst); \ - MOVL R10, 5*4+off+576(dst); \ - SHRQ $32, R10; \ - MOVL R10, 6*4+off+0(dst); \ - MOVL R10, 11*4+off+64(dst); \ - MOVL R10, 2*4+off+128(dst); \ - MOVL R10, 9*4+off+192(dst); \ - MOVL R10, 1*4+off+256(dst); \ - MOVL R10, 13*4+off+320(dst); \ - MOVL R10, 4*4+off+384(dst); \ - MOVL R10, 8*4+off+448(dst); \ - MOVL R10, 15*4+off+512(dst); \ - MOVL R10, 7*4+off+576(dst); \ - \ - MOVL R11, 3*4+off+0(dst); \ - MOVL R11, 7*4+off+64(dst); \ - MOVL R11, 13*4+off+128(dst); \ - MOVL R11, 12*4+off+192(dst); \ - MOVL R11, 10*4+off+256(dst); \ - MOVL R11, 1*4+off+320(dst); \ - MOVL R11, 9*4+off+384(dst); \ - MOVL R11, 14*4+off+448(dst); \ - MOVL R11, 0*4+off+512(dst); \ - MOVL R11, 6*4+off+576(dst); \ - SHRQ $32, R11; \ - MOVL R11, 7*4+off+0(dst); \ - MOVL R11, 14*4+off+64(dst); \ - MOVL R11, 10*4+off+128(dst); \ - MOVL R11, 0*4+off+192(dst); \ - MOVL R11, 5*4+off+256(dst); \ - MOVL R11, 9*4+off+320(dst); \ - MOVL R11, 12*4+off+384(dst); \ - MOVL R11, 1*4+off+448(dst); \ - MOVL R11, 13*4+off+512(dst); \ - MOVL R11, 2*4+off+576(dst); \ - \ - MOVL R12, 8*4+off+0(dst); \ - MOVL R12, 5*4+off+64(dst); \ - MOVL R12, 4*4+off+128(dst); \ - MOVL R12, 15*4+off+192(dst); \ - MOVL R12, 14*4+off+256(dst); \ - MOVL R12, 3*4+off+320(dst); \ - MOVL R12, 11*4+off+384(dst); \ - MOVL R12, 10*4+off+448(dst); \ - MOVL R12, 7*4+off+512(dst); \ - MOVL R12, 1*4+off+576(dst); \ - SHRQ $32, R12; \ - MOVL R12, 12*4+off+0(dst); \ - MOVL R12, 2*4+off+64(dst); \ - MOVL R12, 11*4+off+128(dst); \ - MOVL R12, 4*4+off+192(dst); \ - MOVL R12, 0*4+off+256(dst); \ - MOVL R12, 15*4+off+320(dst); \ - MOVL R12, 10*4+off+384(dst); \ - MOVL R12, 7*4+off+448(dst); \ - MOVL R12, 5*4+off+512(dst); \ - MOVL R12, 9*4+off+576(dst); \ - \ - MOVL R13, 9*4+off+0(dst); \ - MOVL R13, 4*4+off+64(dst); \ - MOVL R13, 8*4+off+128(dst); \ - MOVL R13, 13*4+off+192(dst); \ - MOVL R13, 3*4+off+256(dst); \ - MOVL R13, 5*4+off+320(dst); \ - MOVL R13, 7*4+off+384(dst); \ - MOVL R13, 15*4+off+448(dst); \ - MOVL R13, 11*4+off+512(dst); \ - MOVL R13, 0*4+off+576(dst); \ - SHRQ $32, R13; \ - MOVL R13, 13*4+off+0(dst); \ - MOVL R13, 10*4+off+64(dst); \ - MOVL R13, 0*4+off+128(dst); \ - MOVL R13, 3*4+off+192(dst); \ - MOVL R13, 9*4+off+256(dst); \ - MOVL R13, 6*4+off+320(dst); \ - MOVL R13, 15*4+off+384(dst); \ - MOVL R13, 4*4+off+448(dst); \ - MOVL R13, 2*4+off+512(dst); \ - MOVL R13, 12*4+off+576(dst); \ - \ - MOVL R14, 10*4+off+0(dst); \ - MOVL R14, 12*4+off+64(dst); \ - MOVL R14, 1*4+off+128(dst); \ - MOVL R14, 6*4+off+192(dst); \ - MOVL R14, 13*4+off+256(dst); \ - MOVL R14, 4*4+off+320(dst); \ - MOVL R14, 0*4+off+384(dst); \ - MOVL R14, 2*4+off+448(dst); \ - MOVL R14, 8*4+off+512(dst); \ - MOVL R14, 14*4+off+576(dst); \ - SHRQ $32, R14; \ - MOVL R14, 14*4+off+0(dst); \ - MOVL R14, 3*4+off+64(dst); \ - MOVL R14, 7*4+off+128(dst); \ - MOVL R14, 2*4+off+192(dst); \ - MOVL R14, 15*4+off+256(dst); \ - MOVL R14, 12*4+off+320(dst); \ - MOVL R14, 6*4+off+384(dst); \ - MOVL R14, 0*4+off+448(dst); \ - MOVL R14, 9*4+off+512(dst); \ - MOVL R14, 11*4+off+576(dst); \ - \ - MOVL R15, 11*4+off+0(dst); \ - MOVL R15, 0*4+off+64(dst); \ - MOVL R15, 12*4+off+128(dst); \ - MOVL R15, 7*4+off+192(dst); \ - MOVL R15, 8*4+off+256(dst); \ - MOVL R15, 14*4+off+320(dst); \ - MOVL R15, 2*4+off+384(dst); \ - MOVL R15, 5*4+off+448(dst); \ - MOVL R15, 1*4+off+512(dst); \ - MOVL R15, 13*4+off+576(dst); \ - SHRQ $32, R15; \ - MOVL R15, 15*4+off+0(dst); \ - MOVL R15, 6*4+off+64(dst); \ - MOVL R15, 3*4+off+128(dst); \ - MOVL R15, 11*4+off+192(dst); \ - MOVL R15, 7*4+off+256(dst); \ - MOVL R15, 10*4+off+320(dst); \ - MOVL R15, 5*4+off+384(dst); \ - MOVL R15, 9*4+off+448(dst); \ - MOVL R15, 4*4+off+512(dst); \ - MOVL R15, 8*4+off+576(dst) +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X12, X15 + PXOR X15, X7 + MOVQ (SI), R8 + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + MOVQ 32(SI), R12 + MOVQ 40(SI), R13 + MOVQ 48(SI), R14 + MOVQ 56(SI), R15 + MOVL R8, 16(BP) + MOVL R8, 116(BP) + MOVL R8, 164(BP) + MOVL R8, 264(BP) + MOVL R8, 288(BP) + MOVL R8, 344(BP) + MOVL R8, 432(BP) + MOVL R8, 512(BP) + MOVL R8, 540(BP) + MOVL R8, 652(BP) + SHRQ $0x20, R8 + MOVL R8, 32(BP) + MOVL R8, 112(BP) + MOVL R8, 200(BP) + MOVL R8, 228(BP) + MOVL R8, 320(BP) + MOVL R8, 380(BP) + MOVL R8, 404(BP) + MOVL R8, 488(BP) + MOVL R8, 568(BP) + MOVL R8, 604(BP) + MOVL R9, 20(BP) + MOVL R9, 132(BP) + MOVL R9, 168(BP) + MOVL R9, 240(BP) + MOVL R9, 280(BP) + MOVL R9, 336(BP) + MOVL R9, 456(BP) + MOVL R9, 508(BP) + MOVL R9, 576(BP) + MOVL R9, 608(BP) + SHRQ $0x20, R9 + MOVL R9, 36(BP) + MOVL R9, 140(BP) + MOVL R9, 180(BP) + MOVL R9, 212(BP) + MOVL R9, 316(BP) + MOVL R9, 364(BP) + MOVL R9, 452(BP) + MOVL R9, 476(BP) + MOVL R9, 552(BP) + MOVL R9, 632(BP) + MOVL R10, 24(BP) + MOVL R10, 84(BP) + MOVL R10, 204(BP) + MOVL R10, 248(BP) + MOVL R10, 296(BP) + MOVL R10, 368(BP) + MOVL R10, 412(BP) + MOVL R10, 516(BP) + MOVL R10, 584(BP) + MOVL R10, 612(BP) + SHRQ $0x20, R10 + MOVL R10, 40(BP) + MOVL R10, 124(BP) + MOVL R10, 152(BP) + MOVL R10, 244(BP) + MOVL R10, 276(BP) + MOVL R10, 388(BP) + MOVL R10, 416(BP) + MOVL R10, 496(BP) + MOVL R10, 588(BP) + MOVL R10, 620(BP) + MOVL R11, 28(BP) + MOVL R11, 108(BP) + MOVL R11, 196(BP) + MOVL R11, 256(BP) + MOVL R11, 312(BP) + MOVL R11, 340(BP) + MOVL R11, 436(BP) + MOVL R11, 520(BP) + MOVL R11, 528(BP) + MOVL R11, 616(BP) + SHRQ $0x20, R11 + MOVL R11, 44(BP) + MOVL R11, 136(BP) + MOVL R11, 184(BP) + MOVL R11, 208(BP) + MOVL R11, 292(BP) + MOVL R11, 372(BP) + MOVL R11, 448(BP) + MOVL R11, 468(BP) + MOVL R11, 580(BP) + MOVL R11, 600(BP) + MOVL R12, 48(BP) + MOVL R12, 100(BP) + MOVL R12, 160(BP) + MOVL R12, 268(BP) + MOVL R12, 328(BP) + MOVL R12, 348(BP) + MOVL R12, 444(BP) + MOVL R12, 504(BP) + MOVL R12, 556(BP) + MOVL R12, 596(BP) + SHRQ $0x20, R12 + MOVL R12, 64(BP) + MOVL R12, 88(BP) + MOVL R12, 188(BP) + MOVL R12, 224(BP) + MOVL R12, 272(BP) + MOVL R12, 396(BP) + MOVL R12, 440(BP) + MOVL R12, 492(BP) + MOVL R12, 548(BP) + MOVL R12, 628(BP) + MOVL R13, 52(BP) + MOVL R13, 96(BP) + MOVL R13, 176(BP) + MOVL R13, 260(BP) + MOVL R13, 284(BP) + MOVL R13, 356(BP) + MOVL R13, 428(BP) + MOVL R13, 524(BP) + MOVL R13, 572(BP) + MOVL R13, 592(BP) + SHRQ $0x20, R13 + MOVL R13, 68(BP) + MOVL R13, 120(BP) + MOVL R13, 144(BP) + MOVL R13, 220(BP) + MOVL R13, 308(BP) + MOVL R13, 360(BP) + MOVL R13, 460(BP) + MOVL R13, 480(BP) + MOVL R13, 536(BP) + MOVL R13, 640(BP) + MOVL R14, 56(BP) + MOVL R14, 128(BP) + MOVL R14, 148(BP) + MOVL R14, 232(BP) + MOVL R14, 324(BP) + MOVL R14, 352(BP) + MOVL R14, 400(BP) + MOVL R14, 472(BP) + MOVL R14, 560(BP) + MOVL R14, 648(BP) + SHRQ $0x20, R14 + MOVL R14, 72(BP) + MOVL R14, 92(BP) + MOVL R14, 172(BP) + MOVL R14, 216(BP) + MOVL R14, 332(BP) + MOVL R14, 384(BP) + MOVL R14, 424(BP) + MOVL R14, 464(BP) + MOVL R14, 564(BP) + MOVL R14, 636(BP) + MOVL R15, 60(BP) + MOVL R15, 80(BP) + MOVL R15, 192(BP) + MOVL R15, 236(BP) + MOVL R15, 304(BP) + MOVL R15, 392(BP) + MOVL R15, 408(BP) + MOVL R15, 484(BP) + MOVL R15, 532(BP) + MOVL R15, 644(BP) + SHRQ $0x20, R15 + MOVL R15, 76(BP) + MOVL R15, 104(BP) + MOVL R15, 156(BP) + MOVL R15, 252(BP) + MOVL R15, 300(BP) + MOVL R15, 376(BP) + MOVL R15, 420(BP) + MOVL R15, 500(BP) + MOVL R15, 544(BP) + MOVL R15, 624(BP) + PADDL 16(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 32(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 48(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 64(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 80(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 96(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 112(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 128(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 144(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 160(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 176(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 192(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 208(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 224(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 240(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 256(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 272(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 288(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 304(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 320(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 336(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 352(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 368(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 384(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 400(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 416(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 432(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 448(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 464(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 480(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 496(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 512(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 528(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 544(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 560(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 576(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 592(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 608(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 624(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 640(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + LEAQ 64(SI), SI + SUBQ $0x40, DX + JNE loop + MOVO X15, (BP) + MOVQ (BP), R9 + MOVQ R9, (BX) + MOVOU X0, (AX) + MOVOU X1, 16(AX) + RET -#define BLAKE2s_SSE2() \ - PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \ - ROUND_SSE2(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8) +DATA iv0<>+0(SB)/4, $0x6a09e667 +DATA iv0<>+4(SB)/4, $0xbb67ae85 +DATA iv0<>+8(SB)/4, $0x3c6ef372 +DATA iv0<>+12(SB)/4, $0xa54ff53a +GLOBL iv0<>(SB), RODATA|NOPTR, $16 -#define BLAKE2s_SSSE3() \ - PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \ - ROUND_SSSE3(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8, X13, X14) +DATA iv1<>+0(SB)/4, $0x510e527f +DATA iv1<>+4(SB)/4, $0x9b05688c +DATA iv1<>+8(SB)/4, $0x1f83d9ab +DATA iv1<>+12(SB)/4, $0x5be0cd19 +GLOBL iv1<>(SB), RODATA|NOPTR, $16 -#define BLAKE2s_SSE4() \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) +DATA counter<>+0(SB)/8, $0x0000000000000040 +DATA counter<>+8(SB)/8, $0x0000000000000000 +GLOBL counter<>(SB), RODATA|NOPTR, $16 -#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_FUNC) \ - MOVQ h, AX; \ - MOVQ c, BX; \ - MOVL flag, CX; \ - MOVQ blocks_base, SI; \ - MOVQ blocks_len, DX; \ - \ - MOVQ SP, BP; \ - ADDQ $15, BP; \ - ANDQ $~15, BP; \ - \ - MOVQ 0(BX), R9; \ - MOVQ R9, 0(BP); \ - MOVQ CX, 8(BP); \ - \ - MOVOU 0(AX), X0; \ - MOVOU 16(AX), X1; \ - MOVOU iv0<>(SB), X2; \ - MOVOU iv1<>(SB), X3 \ - \ - MOVOU counter<>(SB), X12; \ - MOVOU rol16<>(SB), X13; \ - MOVOU rol8<>(SB), X14; \ - MOVO 0(BP), X15; \ - \ - loop: \ - MOVO X0, X4; \ - MOVO X1, X5; \ - MOVO X2, X6; \ - MOVO X3, X7; \ - \ - PADDQ X12, X15; \ - PXOR X15, X7; \ - \ - BLAKE2s_FUNC(); \ - \ - PXOR X4, X0; \ - PXOR X5, X1; \ - PXOR X6, X0; \ - PXOR X7, X1; \ - \ - LEAQ 64(SI), SI; \ - SUBQ $64, DX; \ - JNE loop; \ - \ - MOVO X15, 0(BP); \ - MOVQ 0(BP), R9; \ - MOVQ R9, 0(BX); \ - \ - MOVOU X0, 0(AX); \ - MOVOU X1, 16(AX) +DATA rol16<>+0(SB)/8, $0x0504070601000302 +DATA rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +GLOBL rol16<>(SB), RODATA|NOPTR, $16 -// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) -TEXT ·hashBlocksSSE2(SB), 0, $672-48 // frame = 656 + 16 byte alignment - HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE2) - RET +DATA rol8<>+0(SB)/8, $0x0407060500030201 +DATA rol8<>+8(SB)/8, $0x0c0f0e0d080b0a09 +GLOBL rol8<>(SB), RODATA|NOPTR, $16 // func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) -TEXT ·hashBlocksSSSE3(SB), 0, $672-48 // frame = 656 + 16 byte alignment - HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSSE3) +// Requires: SSE2, SSSE3 +TEXT ·hashBlocksSSSE3(SB), $672-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVL flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DX + MOVQ SP, BP + ADDQ $0x0f, BP + ANDQ $-16, BP + MOVQ (BX), R9 + MOVQ R9, (BP) + MOVQ CX, 8(BP) + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU iv0<>+0(SB), X2 + MOVOU iv1<>+0(SB), X3 + MOVOU counter<>+0(SB), X12 + MOVOU rol16<>+0(SB), X13 + MOVOU rol8<>+0(SB), X14 + MOVO (BP), X15 + +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X12, X15 + PXOR X15, X7 + MOVQ (SI), R8 + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + MOVQ 32(SI), R12 + MOVQ 40(SI), R13 + MOVQ 48(SI), R14 + MOVQ 56(SI), R15 + MOVL R8, 16(BP) + MOVL R8, 116(BP) + MOVL R8, 164(BP) + MOVL R8, 264(BP) + MOVL R8, 288(BP) + MOVL R8, 344(BP) + MOVL R8, 432(BP) + MOVL R8, 512(BP) + MOVL R8, 540(BP) + MOVL R8, 652(BP) + SHRQ $0x20, R8 + MOVL R8, 32(BP) + MOVL R8, 112(BP) + MOVL R8, 200(BP) + MOVL R8, 228(BP) + MOVL R8, 320(BP) + MOVL R8, 380(BP) + MOVL R8, 404(BP) + MOVL R8, 488(BP) + MOVL R8, 568(BP) + MOVL R8, 604(BP) + MOVL R9, 20(BP) + MOVL R9, 132(BP) + MOVL R9, 168(BP) + MOVL R9, 240(BP) + MOVL R9, 280(BP) + MOVL R9, 336(BP) + MOVL R9, 456(BP) + MOVL R9, 508(BP) + MOVL R9, 576(BP) + MOVL R9, 608(BP) + SHRQ $0x20, R9 + MOVL R9, 36(BP) + MOVL R9, 140(BP) + MOVL R9, 180(BP) + MOVL R9, 212(BP) + MOVL R9, 316(BP) + MOVL R9, 364(BP) + MOVL R9, 452(BP) + MOVL R9, 476(BP) + MOVL R9, 552(BP) + MOVL R9, 632(BP) + MOVL R10, 24(BP) + MOVL R10, 84(BP) + MOVL R10, 204(BP) + MOVL R10, 248(BP) + MOVL R10, 296(BP) + MOVL R10, 368(BP) + MOVL R10, 412(BP) + MOVL R10, 516(BP) + MOVL R10, 584(BP) + MOVL R10, 612(BP) + SHRQ $0x20, R10 + MOVL R10, 40(BP) + MOVL R10, 124(BP) + MOVL R10, 152(BP) + MOVL R10, 244(BP) + MOVL R10, 276(BP) + MOVL R10, 388(BP) + MOVL R10, 416(BP) + MOVL R10, 496(BP) + MOVL R10, 588(BP) + MOVL R10, 620(BP) + MOVL R11, 28(BP) + MOVL R11, 108(BP) + MOVL R11, 196(BP) + MOVL R11, 256(BP) + MOVL R11, 312(BP) + MOVL R11, 340(BP) + MOVL R11, 436(BP) + MOVL R11, 520(BP) + MOVL R11, 528(BP) + MOVL R11, 616(BP) + SHRQ $0x20, R11 + MOVL R11, 44(BP) + MOVL R11, 136(BP) + MOVL R11, 184(BP) + MOVL R11, 208(BP) + MOVL R11, 292(BP) + MOVL R11, 372(BP) + MOVL R11, 448(BP) + MOVL R11, 468(BP) + MOVL R11, 580(BP) + MOVL R11, 600(BP) + MOVL R12, 48(BP) + MOVL R12, 100(BP) + MOVL R12, 160(BP) + MOVL R12, 268(BP) + MOVL R12, 328(BP) + MOVL R12, 348(BP) + MOVL R12, 444(BP) + MOVL R12, 504(BP) + MOVL R12, 556(BP) + MOVL R12, 596(BP) + SHRQ $0x20, R12 + MOVL R12, 64(BP) + MOVL R12, 88(BP) + MOVL R12, 188(BP) + MOVL R12, 224(BP) + MOVL R12, 272(BP) + MOVL R12, 396(BP) + MOVL R12, 440(BP) + MOVL R12, 492(BP) + MOVL R12, 548(BP) + MOVL R12, 628(BP) + MOVL R13, 52(BP) + MOVL R13, 96(BP) + MOVL R13, 176(BP) + MOVL R13, 260(BP) + MOVL R13, 284(BP) + MOVL R13, 356(BP) + MOVL R13, 428(BP) + MOVL R13, 524(BP) + MOVL R13, 572(BP) + MOVL R13, 592(BP) + SHRQ $0x20, R13 + MOVL R13, 68(BP) + MOVL R13, 120(BP) + MOVL R13, 144(BP) + MOVL R13, 220(BP) + MOVL R13, 308(BP) + MOVL R13, 360(BP) + MOVL R13, 460(BP) + MOVL R13, 480(BP) + MOVL R13, 536(BP) + MOVL R13, 640(BP) + MOVL R14, 56(BP) + MOVL R14, 128(BP) + MOVL R14, 148(BP) + MOVL R14, 232(BP) + MOVL R14, 324(BP) + MOVL R14, 352(BP) + MOVL R14, 400(BP) + MOVL R14, 472(BP) + MOVL R14, 560(BP) + MOVL R14, 648(BP) + SHRQ $0x20, R14 + MOVL R14, 72(BP) + MOVL R14, 92(BP) + MOVL R14, 172(BP) + MOVL R14, 216(BP) + MOVL R14, 332(BP) + MOVL R14, 384(BP) + MOVL R14, 424(BP) + MOVL R14, 464(BP) + MOVL R14, 564(BP) + MOVL R14, 636(BP) + MOVL R15, 60(BP) + MOVL R15, 80(BP) + MOVL R15, 192(BP) + MOVL R15, 236(BP) + MOVL R15, 304(BP) + MOVL R15, 392(BP) + MOVL R15, 408(BP) + MOVL R15, 484(BP) + MOVL R15, 532(BP) + MOVL R15, 644(BP) + SHRQ $0x20, R15 + MOVL R15, 76(BP) + MOVL R15, 104(BP) + MOVL R15, 156(BP) + MOVL R15, 252(BP) + MOVL R15, 300(BP) + MOVL R15, 376(BP) + MOVL R15, 420(BP) + MOVL R15, 500(BP) + MOVL R15, 544(BP) + MOVL R15, 624(BP) + PADDL 16(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 32(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 48(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 64(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 80(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 96(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 112(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 128(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 144(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 160(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 176(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 192(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 208(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 224(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 240(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 256(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 272(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 288(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 304(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 320(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 336(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 352(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 368(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 384(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 400(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 416(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 432(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 448(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 464(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 480(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 496(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 512(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 528(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 544(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 560(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 576(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 592(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 608(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 624(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 640(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + LEAQ 64(SI), SI + SUBQ $0x40, DX + JNE loop + MOVO X15, (BP) + MOVQ (BP), R9 + MOVQ R9, (BX) + MOVOU X0, (AX) + MOVOU X1, 16(AX) RET // func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) -TEXT ·hashBlocksSSE4(SB), 0, $32-48 // frame = 16 + 16 byte alignment - HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE4) +// Requires: SSE2, SSE4.1, SSSE3 +TEXT ·hashBlocksSSE4(SB), $32-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVL flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DX + MOVQ SP, BP + ADDQ $0x0f, BP + ANDQ $-16, BP + MOVQ (BX), R9 + MOVQ R9, (BP) + MOVQ CX, 8(BP) + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU iv0<>+0(SB), X2 + MOVOU iv1<>+0(SB), X3 + MOVOU counter<>+0(SB), X12 + MOVOU rol16<>+0(SB), X13 + MOVOU rol8<>+0(SB), X14 + MOVO (BP), X15 + +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X12, X15 + PXOR X15, X7 + MOVL (SI), X8 + PINSRD $0x01, 8(SI), X8 + PINSRD $0x02, 16(SI), X8 + PINSRD $0x03, 24(SI), X8 + MOVL 4(SI), X9 + PINSRD $0x01, 12(SI), X9 + PINSRD $0x02, 20(SI), X9 + PINSRD $0x03, 28(SI), X9 + MOVL 32(SI), X10 + PINSRD $0x01, 40(SI), X10 + PINSRD $0x02, 48(SI), X10 + PINSRD $0x03, 56(SI), X10 + MOVL 36(SI), X11 + PINSRD $0x01, 44(SI), X11 + PINSRD $0x02, 52(SI), X11 + PINSRD $0x03, 60(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 56(SI), X8 + PINSRD $0x01, 16(SI), X8 + PINSRD $0x02, 36(SI), X8 + PINSRD $0x03, 52(SI), X8 + MOVL 40(SI), X9 + PINSRD $0x01, 32(SI), X9 + PINSRD $0x02, 60(SI), X9 + PINSRD $0x03, 24(SI), X9 + MOVL 4(SI), X10 + PINSRD $0x01, (SI), X10 + PINSRD $0x02, 44(SI), X10 + PINSRD $0x03, 20(SI), X10 + MOVL 48(SI), X11 + PINSRD $0x01, 8(SI), X11 + PINSRD $0x02, 28(SI), X11 + PINSRD $0x03, 12(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 44(SI), X8 + PINSRD $0x01, 48(SI), X8 + PINSRD $0x02, 20(SI), X8 + PINSRD $0x03, 60(SI), X8 + MOVL 32(SI), X9 + PINSRD $0x01, (SI), X9 + PINSRD $0x02, 8(SI), X9 + PINSRD $0x03, 52(SI), X9 + MOVL 40(SI), X10 + PINSRD $0x01, 12(SI), X10 + PINSRD $0x02, 28(SI), X10 + PINSRD $0x03, 36(SI), X10 + MOVL 56(SI), X11 + PINSRD $0x01, 24(SI), X11 + PINSRD $0x02, 4(SI), X11 + PINSRD $0x03, 16(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 28(SI), X8 + PINSRD $0x01, 12(SI), X8 + PINSRD $0x02, 52(SI), X8 + PINSRD $0x03, 44(SI), X8 + MOVL 36(SI), X9 + PINSRD $0x01, 4(SI), X9 + PINSRD $0x02, 48(SI), X9 + PINSRD $0x03, 56(SI), X9 + MOVL 8(SI), X10 + PINSRD $0x01, 20(SI), X10 + PINSRD $0x02, 16(SI), X10 + PINSRD $0x03, 60(SI), X10 + MOVL 24(SI), X11 + PINSRD $0x01, 40(SI), X11 + PINSRD $0x02, (SI), X11 + PINSRD $0x03, 32(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 36(SI), X8 + PINSRD $0x01, 20(SI), X8 + PINSRD $0x02, 8(SI), X8 + PINSRD $0x03, 40(SI), X8 + MOVL (SI), X9 + PINSRD $0x01, 28(SI), X9 + PINSRD $0x02, 16(SI), X9 + PINSRD $0x03, 60(SI), X9 + MOVL 56(SI), X10 + PINSRD $0x01, 44(SI), X10 + PINSRD $0x02, 24(SI), X10 + PINSRD $0x03, 12(SI), X10 + MOVL 4(SI), X11 + PINSRD $0x01, 48(SI), X11 + PINSRD $0x02, 32(SI), X11 + PINSRD $0x03, 52(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 8(SI), X8 + PINSRD $0x01, 24(SI), X8 + PINSRD $0x02, (SI), X8 + PINSRD $0x03, 32(SI), X8 + MOVL 48(SI), X9 + PINSRD $0x01, 40(SI), X9 + PINSRD $0x02, 44(SI), X9 + PINSRD $0x03, 12(SI), X9 + MOVL 16(SI), X10 + PINSRD $0x01, 28(SI), X10 + PINSRD $0x02, 60(SI), X10 + PINSRD $0x03, 4(SI), X10 + MOVL 52(SI), X11 + PINSRD $0x01, 20(SI), X11 + PINSRD $0x02, 56(SI), X11 + PINSRD $0x03, 36(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 48(SI), X8 + PINSRD $0x01, 4(SI), X8 + PINSRD $0x02, 56(SI), X8 + PINSRD $0x03, 16(SI), X8 + MOVL 20(SI), X9 + PINSRD $0x01, 60(SI), X9 + PINSRD $0x02, 52(SI), X9 + PINSRD $0x03, 40(SI), X9 + MOVL (SI), X10 + PINSRD $0x01, 24(SI), X10 + PINSRD $0x02, 36(SI), X10 + PINSRD $0x03, 32(SI), X10 + MOVL 28(SI), X11 + PINSRD $0x01, 12(SI), X11 + PINSRD $0x02, 8(SI), X11 + PINSRD $0x03, 44(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 52(SI), X8 + PINSRD $0x01, 28(SI), X8 + PINSRD $0x02, 48(SI), X8 + PINSRD $0x03, 12(SI), X8 + MOVL 44(SI), X9 + PINSRD $0x01, 56(SI), X9 + PINSRD $0x02, 4(SI), X9 + PINSRD $0x03, 36(SI), X9 + MOVL 20(SI), X10 + PINSRD $0x01, 60(SI), X10 + PINSRD $0x02, 32(SI), X10 + PINSRD $0x03, 8(SI), X10 + MOVL (SI), X11 + PINSRD $0x01, 16(SI), X11 + PINSRD $0x02, 24(SI), X11 + PINSRD $0x03, 40(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 24(SI), X8 + PINSRD $0x01, 56(SI), X8 + PINSRD $0x02, 44(SI), X8 + PINSRD $0x03, (SI), X8 + MOVL 60(SI), X9 + PINSRD $0x01, 36(SI), X9 + PINSRD $0x02, 12(SI), X9 + PINSRD $0x03, 32(SI), X9 + MOVL 48(SI), X10 + PINSRD $0x01, 52(SI), X10 + PINSRD $0x02, 4(SI), X10 + PINSRD $0x03, 40(SI), X10 + MOVL 8(SI), X11 + PINSRD $0x01, 28(SI), X11 + PINSRD $0x02, 16(SI), X11 + PINSRD $0x03, 20(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 40(SI), X8 + PINSRD $0x01, 32(SI), X8 + PINSRD $0x02, 28(SI), X8 + PINSRD $0x03, 4(SI), X8 + MOVL 8(SI), X9 + PINSRD $0x01, 16(SI), X9 + PINSRD $0x02, 24(SI), X9 + PINSRD $0x03, 20(SI), X9 + MOVL 60(SI), X10 + PINSRD $0x01, 36(SI), X10 + PINSRD $0x02, 12(SI), X10 + PINSRD $0x03, 52(SI), X10 + MOVL 44(SI), X11 + PINSRD $0x01, 56(SI), X11 + PINSRD $0x02, 48(SI), X11 + PINSRD $0x03, (SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + LEAQ 64(SI), SI + SUBQ $0x40, DX + JNE loop + MOVO X15, (BP) + MOVQ (BP), R9 + MOVQ R9, (BX) + MOVOU X0, (AX) + MOVOU X1, 16(AX) RET From 0484c26df710dd85a038090191bda8c385afdd49 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Wed, 24 Jul 2024 23:04:06 -0400 Subject: [PATCH 34/57] blake2b: port blake2bAVX2_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="blake2b/blake2bAVX2_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: Ia2af1b82c871e26b89bd9a2d9fb187cc49e18341 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600836 Reviewed-by: Filippo Valsorda Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI --- blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go | 1228 +++++ blake2b/_asm/AVX2/go.mod | 16 + blake2b/_asm/AVX2/go.sum | 12 + blake2b/blake2bAVX2_amd64.s | 5167 +++++++++++++++++--- 4 files changed, 5747 insertions(+), 676 deletions(-) create mode 100644 blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go create mode 100644 blake2b/_asm/AVX2/go.mod create mode 100644 blake2b/_asm/AVX2/go.sum diff --git a/blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go b/blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go new file mode 100644 index 0000000000..c297c0ca63 --- /dev/null +++ b/blake2b/_asm/AVX2/blake2bAVX2_amd64_asm.go @@ -0,0 +1,1228 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/blake2b" +) + +//go:generate go run . -out ../../blake2bAVX2_amd64.s -pkg blake2b + +const ThatPeskyUnicodeDot = "\u00b7" + +func main() { + Package("golang.org/x/crypto/blake2b") + ConstraintExpr("amd64,gc,!purego") + hashBlocksAVX2() + hashBlocksAVX() + Generate() +} + +// Utility function to emit a BYTE instruction +func BYTE(imm Op) { + Instruction(&ir.Instruction{Opcode: "BYTE", Operands: []Op{imm}}) +} + +func VPERMQ_0x39_Y1_Y1() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xc9)) + BYTE(U8(0x39)) +} + +func VPERMQ_0x93_Y1_Y1() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xc9)) + BYTE(U8(0x93)) +} + +func VPERMQ_0x4E_Y2_Y2() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xd2)) + BYTE(U8(0x4e)) +} + +func VPERMQ_0x93_Y3_Y3() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xdb)) + BYTE(U8(0x93)) +} + +func VPERMQ_0x39_Y3_Y3() { + BYTE(U8(0xc4)) + BYTE(U8(0xe3)) + BYTE(U8(0xfd)) + BYTE(U8(0x00)) + BYTE(U8(0xdb)) + BYTE(U8(0x39)) +} + +func ROUND_AVX2(m0, m1, m2, m3 Op, t, c40, c48 VecPhysical) { + VPADDQ(m0, Y0, Y0) + VPADDQ(Y1, Y0, Y0) + VPXOR(Y0, Y3, Y3) + VPSHUFD(I8(-79), Y3, Y3) + VPADDQ(Y3, Y2, Y2) + VPXOR(Y2, Y1, Y1) + VPSHUFB(c40, Y1, Y1) + VPADDQ(m1, Y0, Y0) + VPADDQ(Y1, Y0, Y0) + VPXOR(Y0, Y3, Y3) + VPSHUFB(c48, Y3, Y3) + VPADDQ(Y3, Y2, Y2) + VPXOR(Y2, Y1, Y1) + VPADDQ(Y1, Y1, t) + VPSRLQ(Imm(63), Y1, Y1) + VPXOR(t, Y1, Y1) + VPERMQ_0x39_Y1_Y1() + VPERMQ_0x4E_Y2_Y2() + VPERMQ_0x93_Y3_Y3() + VPADDQ(m2, Y0, Y0) + VPADDQ(Y1, Y0, Y0) + VPXOR(Y0, Y3, Y3) + VPSHUFD(I8(-79), Y3, Y3) + VPADDQ(Y3, Y2, Y2) + VPXOR(Y2, Y1, Y1) + VPSHUFB(c40, Y1, Y1) + VPADDQ(m3, Y0, Y0) + VPADDQ(Y1, Y0, Y0) + VPXOR(Y0, Y3, Y3) + VPSHUFB(c48, Y3, Y3) + VPADDQ(Y3, Y2, Y2) + VPXOR(Y2, Y1, Y1) + VPADDQ(Y1, Y1, t) + VPSRLQ(Imm(63), Y1, Y1) + VPXOR(t, Y1, Y1) + VPERMQ_0x39_Y3_Y3() + VPERMQ_0x4E_Y2_Y2() + VPERMQ_0x93_Y1_Y1() +} + +func VMOVQ_SI_X11_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x1E)) +} + +func VMOVQ_SI_X12_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x26)) +} + +func VMOVQ_SI_X13_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x2E)) +} + +func VMOVQ_SI_X14_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x36)) +} + +func VMOVQ_SI_X15_0() { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x3E)) +} + +func VMOVQ_SI_X11(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x5E)) + BYTE(U8(n)) +} + +func VMOVQ_SI_X12(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x66)) + BYTE(U8(n)) +} + +func VMOVQ_SI_X13(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x6E)) + BYTE(U8(n)) +} + +func VMOVQ_SI_X14(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x76)) + BYTE(U8(n)) +} + +func VMOVQ_SI_X15(n uint8) { + BYTE(U8(0xC5)) + BYTE(U8(0x7A)) + BYTE(U8(0x7E)) + BYTE(U8(0x7E)) + BYTE(U8(n)) +} + +func VPINSRQ_1_SI_X11_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0xA1)) + BYTE(U8(0x22)) + BYTE(U8(0x1E)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X12_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x99)) + BYTE(U8(0x22)) + BYTE(U8(0x26)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X13_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x91)) + BYTE(U8(0x22)) + BYTE(U8(0x2E)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X14_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x89)) + BYTE(U8(0x22)) + BYTE(U8(0x36)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X15_0() { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x81)) + BYTE(U8(0x22)) + BYTE(U8(0x3E)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X11(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0xA1)) + BYTE(U8(0x22)) + BYTE(U8(0x5E)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X12(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x99)) + BYTE(U8(0x22)) + BYTE(U8(0x66)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X13(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x91)) + BYTE(U8(0x22)) + BYTE(U8(0x6E)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X14(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x89)) + BYTE(U8(0x22)) + BYTE(U8(0x76)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VPINSRQ_1_SI_X15(n uint8) { + BYTE(U8(0xC4)) + BYTE(U8(0x63)) + BYTE(U8(0x81)) + BYTE(U8(0x22)) + BYTE(U8(0x7E)) + BYTE(U8(n)) + BYTE(U8(0x01)) +} + +func VMOVQ_R8_X15() { + BYTE(U8(0xC4)) + BYTE(U8(0x41)) + BYTE(U8(0xF9)) + BYTE(U8(0x6E)) + BYTE(U8(0xF8)) +} + +func VPINSRQ_1_R9_X15() { + BYTE(U8(0xC4)) + BYTE(U8(0x43)) + BYTE(U8(0x81)) + BYTE(U8(0x22)) + BYTE(U8(0xF9)) + BYTE(U8(0x01)) +} + +// load msg: +// +// Y12 = (i0, i1, i2, i3) +// +// i0, i1, i2, i3 must not be 0 +func LOAD_MSG_AVX2_Y12(i0, i1, i2, i3 uint8) { + VMOVQ_SI_X12(i0 * 8) + VMOVQ_SI_X11(i2 * 8) + VPINSRQ_1_SI_X12(i1 * 8) + VPINSRQ_1_SI_X11(i3 * 8) + VINSERTI128(Imm(1), X11, Y12, Y12) +} + +// load msg: +// +// Y13 = (i0, i1, i2, i3) +// +// i0, i1, i2, i3 must not be 0 +func LOAD_MSG_AVX2_Y13(i0, i1, i2, i3 uint8) { + VMOVQ_SI_X13(i0 * 8) + VMOVQ_SI_X11(i2 * 8) + VPINSRQ_1_SI_X13(i1 * 8) + VPINSRQ_1_SI_X11(i3 * 8) + VINSERTI128(Imm(1), X11, Y13, Y13) +} + +// load msg: +// +// Y14 = (i0, i1, i2, i3) +// +// i0, i1, i2, i3 must not be 0 +func LOAD_MSG_AVX2_Y14(i0, i1, i2, i3 uint8) { + VMOVQ_SI_X14(i0 * 8) + VMOVQ_SI_X11(i2 * 8) + VPINSRQ_1_SI_X14(i1 * 8) + VPINSRQ_1_SI_X11(i3 * 8) + VINSERTI128(Imm(1), X11, Y14, Y14) +} + +// load msg: +// +// Y15 = (i0, i1, i2, i3) +// +// i0, i1, i2, i3 must not be 0 +func LOAD_MSG_AVX2_Y15(i0, i1, i2, i3 uint8) { + VMOVQ_SI_X15(i0 * 8) + VMOVQ_SI_X11(i2 * 8) + VPINSRQ_1_SI_X15(i1 * 8) + VPINSRQ_1_SI_X11(i3 * 8) + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() { + VMOVQ_SI_X12_0() + VMOVQ_SI_X11(4 * 8) + VPINSRQ_1_SI_X12(2 * 8) + VPINSRQ_1_SI_X11(6 * 8) + VINSERTI128(Imm(1), X11, Y12, Y12) + LOAD_MSG_AVX2_Y13(1, 3, 5, 7) + LOAD_MSG_AVX2_Y14(8, 10, 12, 14) + LOAD_MSG_AVX2_Y15(9, 11, 13, 15) +} + +func LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() { + LOAD_MSG_AVX2_Y12(14, 4, 9, 13) + LOAD_MSG_AVX2_Y13(10, 8, 15, 6) + VMOVQ_SI_X11(11 * 8) + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(0*8), X14) + VPINSRQ_1_SI_X11(5 * 8) + VINSERTI128(Imm(1), X11, Y14, Y14) + LOAD_MSG_AVX2_Y15(12, 2, 7, 3) +} + +func LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() { + VMOVQ_SI_X11(5 * 8) + VMOVDQU(Mem{Base: SI}.Offset(11*8), X12) + VPINSRQ_1_SI_X11(15 * 8) + VINSERTI128(Imm(1), X11, Y12, Y12) + VMOVQ_SI_X13(8 * 8) + VMOVQ_SI_X11(2 * 8) + VPINSRQ_1_SI_X13_0() + VPINSRQ_1_SI_X11(13 * 8) + VINSERTI128(Imm(1), X11, Y13, Y13) + LOAD_MSG_AVX2_Y14(10, 3, 7, 9) + LOAD_MSG_AVX2_Y15(14, 6, 1, 4) +} + +func LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() { + LOAD_MSG_AVX2_Y12(7, 3, 13, 11) + LOAD_MSG_AVX2_Y13(9, 1, 12, 14) + LOAD_MSG_AVX2_Y14(2, 5, 4, 15) + VMOVQ_SI_X15(6 * 8) + VMOVQ_SI_X11_0() + VPINSRQ_1_SI_X15(10 * 8) + VPINSRQ_1_SI_X11(8 * 8) + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() { + LOAD_MSG_AVX2_Y12(9, 5, 2, 10) + VMOVQ_SI_X13_0() + VMOVQ_SI_X11(4 * 8) + VPINSRQ_1_SI_X13(7 * 8) + VPINSRQ_1_SI_X11(15 * 8) + VINSERTI128(Imm(1), X11, Y13, Y13) + LOAD_MSG_AVX2_Y14(14, 11, 6, 3) + LOAD_MSG_AVX2_Y15(1, 12, 8, 13) +} + +func LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() { + VMOVQ_SI_X12(2 * 8) + VMOVQ_SI_X11_0() + VPINSRQ_1_SI_X12(6 * 8) + VPINSRQ_1_SI_X11(8 * 8) + VINSERTI128(Imm(1), X11, Y12, Y12) + LOAD_MSG_AVX2_Y13(12, 10, 11, 3) + LOAD_MSG_AVX2_Y14(4, 7, 15, 1) + LOAD_MSG_AVX2_Y15(13, 5, 14, 9) +} + +func LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() { + LOAD_MSG_AVX2_Y12(12, 1, 14, 4) + LOAD_MSG_AVX2_Y13(5, 15, 13, 10) + VMOVQ_SI_X14_0() + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(8*8), X11) + VPINSRQ_1_SI_X14(6 * 8) + VINSERTI128(Imm(1), X11, Y14, Y14) + LOAD_MSG_AVX2_Y15(7, 3, 2, 11) +} + +func LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() { + LOAD_MSG_AVX2_Y12(13, 7, 12, 3) + LOAD_MSG_AVX2_Y13(11, 14, 1, 9) + LOAD_MSG_AVX2_Y14(5, 15, 8, 2) + VMOVQ_SI_X15_0() + VMOVQ_SI_X11(6 * 8) + VPINSRQ_1_SI_X15(4 * 8) + VPINSRQ_1_SI_X11(10 * 8) + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() { + VMOVQ_SI_X12(6 * 8) + VMOVQ_SI_X11(11 * 8) + VPINSRQ_1_SI_X12(14 * 8) + VPINSRQ_1_SI_X11_0() + VINSERTI128(Imm(1), X11, Y12, Y12) + LOAD_MSG_AVX2_Y13(15, 9, 3, 8) + VMOVQ_SI_X11(1 * 8) + VMOVDQU(Mem{Base: SI}.Offset(12*8), X14) + VPINSRQ_1_SI_X11(10 * 8) + VINSERTI128(Imm(1), X11, Y14, Y14) + VMOVQ_SI_X15(2 * 8) + VMOVDQU(Mem{Base: SI}.Offset(4*8), X11) + VPINSRQ_1_SI_X15(7 * 8) + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() { + LOAD_MSG_AVX2_Y12(10, 8, 7, 1) + VMOVQ_SI_X13(2 * 8) + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(5*8), X11) + VPINSRQ_1_SI_X13(4 * 8) + VINSERTI128(Imm(1), X11, Y13, Y13) + LOAD_MSG_AVX2_Y14(15, 9, 3, 13) + VMOVQ_SI_X15(11 * 8) + VMOVQ_SI_X11(12 * 8) + VPINSRQ_1_SI_X15(14 * 8) + VPINSRQ_1_SI_X11_0() + VINSERTI128(Imm(1), X11, Y15, Y15) +} + +func hashBlocksAVX2() { + Implement("hashBlocksAVX2") + Attributes(4) + AllocLocal(320) // frame size = 288 + 32 byte alignment + + Load(Param("h"), RAX) + Load(Param("c"), RBX) + Load(Param("flag"), RCX) + Load(Param("blocks").Base(), RSI) + Load(Param("blocks").Len(), RDI) + + MOVQ(RSP, RDX) + ADDQ(I32(31), RDX) + ANDQ(I32(^31), RDX) + + MOVQ(RCX, Mem{Base: DX}.Offset(16)) + XORQ(RCX, RCX) + MOVQ(RCX, Mem{Base: DX}.Offset(24)) + + AVX2_c40 := AVX2_c40_DATA() + AVX2_c48 := AVX2_c48_DATA() + VMOVDQU(AVX2_c40, Y4) + VMOVDQU(AVX2_c48, Y5) + + VMOVDQU(Mem{Base: AX}.Offset(0), Y8) + VMOVDQU(Mem{Base: AX}.Offset(32), Y9) + AVX2_iv0 := AVX2_iv0_DATA() + AVX2_iv1 := AVX2_iv1_DATA() + VMOVDQU(AVX2_iv0, Y6) + VMOVDQU(AVX2_iv1, Y7) + + MOVQ(Mem{Base: BX}.Offset(0), R8) + MOVQ(Mem{Base: BX}.Offset(8), R9) + MOVQ(R9, Mem{Base: DX}.Offset(8)) + + loop_AVX2() + noinc_AVX2() +} + +func loop_AVX2() { + Label("loop") + ADDQ(Imm(128), R8) + MOVQ(R8, Mem{Base: DX}.Offset(0)) + CMPQ(R8, Imm(128)) + JGE(LabelRef("noinc")) + INCQ(R9) + MOVQ(R9, Mem{Base: DX}.Offset(8)) +} + +// line 312 +func noinc_AVX2() { + Label("noinc") + VMOVDQA(Y8, Y0) + VMOVDQA(Y9, Y1) + VMOVDQA(Y6, Y2) + VPXOR(Mem{Base: DX}.Offset(0), Y7, Y3) + + LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() + VMOVDQA(Y12, Mem{Base: DX}.Offset(32)) + VMOVDQA(Y13, Mem{Base: DX}.Offset(64)) + VMOVDQA(Y14, Mem{Base: DX}.Offset(96)) + VMOVDQA(Y15, Mem{Base: DX}.Offset(128)) + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() + VMOVDQA(Y12, Mem{Base: DX}.Offset(160)) + VMOVDQA(Y13, Mem{Base: DX}.Offset(192)) + VMOVDQA(Y14, Mem{Base: DX}.Offset(224)) + VMOVDQA(Y15, Mem{Base: DX}.Offset(256)) + + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + + ROUND_AVX2(Mem{Base: DX}.Offset(32), Mem{Base: DX}.Offset(64), Mem{Base: DX}.Offset(96), Mem{Base: DX}.Offset(128), Y10, Y4, Y5) + ROUND_AVX2(Mem{Base: DX}.Offset(160), Mem{Base: DX}.Offset(192), Mem{Base: DX}.Offset(224), Mem{Base: DX}.Offset(256), Y10, Y4, Y5) + + VPXOR(Y0, Y8, Y8) + VPXOR(Y1, Y9, Y9) + VPXOR(Y2, Y8, Y8) + VPXOR(Y3, Y9, Y9) + + LEAQ(Mem{Base: SI}.Offset(128), RSI) + SUBQ(Imm(128), RDI) + JNE(LabelRef("loop")) + + MOVQ(R8, Mem{Base: BX}.Offset(0)) + MOVQ(R9, Mem{Base: BX}.Offset(8)) + + VMOVDQU(Y8, Mem{Base: AX}.Offset(0)) + VMOVDQU(Y9, Mem{Base: AX}.Offset(32)) + VZEROUPPER() + + RET() +} + +func VPUNPCKLQDQ_X2_X2_X15() { + BYTE(U8(0xC5)) + BYTE(U8(0x69)) + BYTE(U8(0x6C)) + BYTE(U8(0xFA)) +} + +func VPUNPCKLQDQ_X3_X3_X15() { + BYTE(U8(0xC5)) + BYTE(U8(0x61)) + BYTE(U8(0x6C)) + BYTE(U8(0xFB)) +} + +func VPUNPCKLQDQ_X7_X7_X15() { + BYTE(U8(0xC5)) + BYTE(U8(0x41)) + BYTE(U8(0x6C)) + BYTE(U8(0xFF)) +} + +func VPUNPCKLQDQ_X13_X13_X15() { + BYTE(U8(0xC4)) + BYTE(U8(0x41)) + BYTE(U8(0x11)) + BYTE(U8(0x6C)) + BYTE(U8(0xFD)) +} + +func VPUNPCKLQDQ_X14_X14_X15() { + BYTE(U8(0xC4)) + BYTE(U8(0x41)) + BYTE(U8(0x09)) + BYTE(U8(0x6C)) + BYTE(U8(0xFE)) +} + +func VPUNPCKHQDQ_X15_X2_X2() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x69)) + BYTE(U8(0x6D)) + BYTE(U8(0xD7)) +} + +func VPUNPCKHQDQ_X15_X3_X3() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x61)) + BYTE(U8(0x6D)) + BYTE(U8(0xDF)) +} + +func VPUNPCKHQDQ_X15_X6_X6() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x49)) + BYTE(U8(0x6D)) + BYTE(U8(0xF7)) +} + +func VPUNPCKHQDQ_X15_X7_X7() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x41)) + BYTE(U8(0x6D)) + BYTE(U8(0xFF)) +} + +func VPUNPCKHQDQ_X15_X3_X2() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x61)) + BYTE(U8(0x6D)) + BYTE(U8(0xD7)) +} + +func VPUNPCKHQDQ_X15_X7_X6() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x41)) + BYTE(U8(0x6D)) + BYTE(U8(0xF7)) +} + +func VPUNPCKHQDQ_X15_X13_X3() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x11)) + BYTE(U8(0x6D)) + BYTE(U8(0xDF)) +} + +func VPUNPCKHQDQ_X15_X13_X7() { + BYTE(U8(0xC4)) + BYTE(U8(0xC1)) + BYTE(U8(0x11)) + BYTE(U8(0x6D)) + BYTE(U8(0xFF)) +} + +func SHUFFLE_AVX() { + VMOVDQA(X6, X13) + VMOVDQA(X2, X14) + VMOVDQA(X4, X6) + VPUNPCKLQDQ_X13_X13_X15() + VMOVDQA(X5, X4) + VMOVDQA(X6, X5) + VPUNPCKHQDQ_X15_X7_X6() + VPUNPCKLQDQ_X7_X7_X15() + VPUNPCKHQDQ_X15_X13_X7() + VPUNPCKLQDQ_X3_X3_X15() + VPUNPCKHQDQ_X15_X2_X2() + VPUNPCKLQDQ_X14_X14_X15() + VPUNPCKHQDQ_X15_X3_X3() +} + +func SHUFFLE_AVX_INV() { + VMOVDQA(X2, X13) + VMOVDQA(X4, X14) + VPUNPCKLQDQ_X2_X2_X15() + VMOVDQA(X5, X4) + VPUNPCKHQDQ_X15_X3_X2() + VMOVDQA(X14, X5) + VPUNPCKLQDQ_X3_X3_X15() + VMOVDQA(X6, X14) + VPUNPCKHQDQ_X15_X13_X3() + VPUNPCKLQDQ_X7_X7_X15() + VPUNPCKHQDQ_X15_X6_X6() + VPUNPCKLQDQ_X14_X14_X15() + VPUNPCKHQDQ_X15_X7_X7() +} + +func HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7 VecPhysical, m0, m1, m2, m3 Op, t0, c40, c48 VecPhysical) { + VPADDQ(m0, v0, v0) + VPADDQ(v2, v0, v0) + VPADDQ(m1, v1, v1) + VPADDQ(v3, v1, v1) + VPXOR(v0, v6, v6) + VPXOR(v1, v7, v7) + VPSHUFD(I8(-79), v6, v6) + VPSHUFD(I8(-79), v7, v7) + VPADDQ(v6, v4, v4) + VPADDQ(v7, v5, v5) + VPXOR(v4, v2, v2) + VPXOR(v5, v3, v3) + VPSHUFB(c40, v2, v2) + VPSHUFB(c40, v3, v3) + VPADDQ(m2, v0, v0) + VPADDQ(v2, v0, v0) + VPADDQ(m3, v1, v1) + VPADDQ(v3, v1, v1) + VPXOR(v0, v6, v6) + VPXOR(v1, v7, v7) + VPSHUFB(c48, v6, v6) + VPSHUFB(c48, v7, v7) + VPADDQ(v6, v4, v4) + VPADDQ(v7, v5, v5) + VPXOR(v4, v2, v2) + VPXOR(v5, v3, v3) + VPADDQ(v2, v2, t0) + VPSRLQ(Imm(63), v2, v2) + VPXOR(t0, v2, v2) + VPADDQ(v3, v3, t0) + VPSRLQ(Imm(63), v3, v3) + VPXOR(t0, v3, v3) +} + +// load msg: +// +// X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7) +// +// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0 +func LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7 uint8) { + VMOVQ_SI_X12(i0 * 8) + VMOVQ_SI_X13(i2 * 8) + VMOVQ_SI_X14(i4 * 8) + VMOVQ_SI_X15(i6 * 8) + VPINSRQ_1_SI_X12(i1 * 8) + VPINSRQ_1_SI_X13(i3 * 8) + VPINSRQ_1_SI_X14(i5 * 8) + VPINSRQ_1_SI_X15(i7 * 8) +} + +// load msg: +// +// X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7) +func LOAD_MSG_AVX_0_2_4_6_1_3_5_7() { + VMOVQ_SI_X12_0() + VMOVQ_SI_X13(4 * 8) + VMOVQ_SI_X14(1 * 8) + VMOVQ_SI_X15(5 * 8) + VPINSRQ_1_SI_X12(2 * 8) + VPINSRQ_1_SI_X13(6 * 8) + VPINSRQ_1_SI_X14(3 * 8) + VPINSRQ_1_SI_X15(7 * 8) +} + +// load msg: +// +// X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3) +func LOAD_MSG_AVX_1_0_11_5_12_2_7_3() { + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(0*8), X12) + VMOVQ_SI_X13(11 * 8) + VMOVQ_SI_X14(12 * 8) + VMOVQ_SI_X15(7 * 8) + VPINSRQ_1_SI_X13(5 * 8) + VPINSRQ_1_SI_X14(2 * 8) + VPINSRQ_1_SI_X15(3 * 8) +} + +// load msg: +// +// X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13) +func LOAD_MSG_AVX_11_12_5_15_8_0_2_13() { + VMOVDQU(Mem{Base: SI}.Offset(11*8), X12) + VMOVQ_SI_X13(5 * 8) + VMOVQ_SI_X14(8 * 8) + VMOVQ_SI_X15(2 * 8) + VPINSRQ_1_SI_X13(15 * 8) + VPINSRQ_1_SI_X14_0() + VPINSRQ_1_SI_X15(13 * 8) +} + +// load msg: +// +// X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8) +func LOAD_MSG_AVX_2_5_4_15_6_10_0_8() { + VMOVQ_SI_X12(2 * 8) + VMOVQ_SI_X13(4 * 8) + VMOVQ_SI_X14(6 * 8) + VMOVQ_SI_X15_0() + VPINSRQ_1_SI_X12(5 * 8) + VPINSRQ_1_SI_X13(15 * 8) + VPINSRQ_1_SI_X14(10 * 8) + VPINSRQ_1_SI_X15(8 * 8) +} + +// load msg: +// +// X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15) +func LOAD_MSG_AVX_9_5_2_10_0_7_4_15() { + VMOVQ_SI_X12(9 * 8) + VMOVQ_SI_X13(2 * 8) + VMOVQ_SI_X14_0() + VMOVQ_SI_X15(4 * 8) + VPINSRQ_1_SI_X12(5 * 8) + VPINSRQ_1_SI_X13(10 * 8) + VPINSRQ_1_SI_X14(7 * 8) + VPINSRQ_1_SI_X15(15 * 8) +} + +// load msg: +// +// X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3) +func LOAD_MSG_AVX_2_6_0_8_12_10_11_3() { + VMOVQ_SI_X12(2 * 8) + VMOVQ_SI_X13_0() + VMOVQ_SI_X14(12 * 8) + VMOVQ_SI_X15(11 * 8) + VPINSRQ_1_SI_X12(6 * 8) + VPINSRQ_1_SI_X13(8 * 8) + VPINSRQ_1_SI_X14(10 * 8) + VPINSRQ_1_SI_X15(3 * 8) +} + +// load msg: +// +// X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11) +func LOAD_MSG_AVX_0_6_9_8_7_3_2_11() { + MOVQ(Mem{Base: SI}.Offset(0*8), X12) + VPSHUFD(Imm(0x4E), Mem{Base: SI}.Offset(8*8), X13) + MOVQ(Mem{Base: SI}.Offset(7*8), X14) + MOVQ(Mem{Base: SI}.Offset(2*8), X15) + VPINSRQ_1_SI_X12(6 * 8) + VPINSRQ_1_SI_X14(3 * 8) + VPINSRQ_1_SI_X15(11 * 8) +} + +// load msg: +// +// X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8) +func LOAD_MSG_AVX_6_14_11_0_15_9_3_8() { + MOVQ(Mem{Base: SI}.Offset(6*8), X12) + MOVQ(Mem{Base: SI}.Offset(11*8), X13) + MOVQ(Mem{Base: SI}.Offset(15*8), X14) + MOVQ(Mem{Base: SI}.Offset(3*8), X15) + VPINSRQ_1_SI_X12(14 * 8) + VPINSRQ_1_SI_X13_0() + VPINSRQ_1_SI_X14(9 * 8) + VPINSRQ_1_SI_X15(8 * 8) +} + +// load msg: +// +// X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10) +func LOAD_MSG_AVX_5_15_8_2_0_4_6_10() { + MOVQ(Mem{Base: SI}.Offset(5*8), X12) + MOVQ(Mem{Base: SI}.Offset(8*8), X13) + MOVQ(Mem{Base: SI}.Offset(0*8), X14) + MOVQ(Mem{Base: SI}.Offset(6*8), X15) + VPINSRQ_1_SI_X12(15 * 8) + VPINSRQ_1_SI_X13(2 * 8) + VPINSRQ_1_SI_X14(4 * 8) + VPINSRQ_1_SI_X15(10 * 8) +} + +// load msg: +// +// X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5) +func LOAD_MSG_AVX_12_13_1_10_2_7_4_5() { + VMOVDQU(Mem{Base: SI}.Offset(12*8), X12) + MOVQ(Mem{Base: SI}.Offset(1*8), X13) + MOVQ(Mem{Base: SI}.Offset(2*8), X14) + VPINSRQ_1_SI_X13(10 * 8) + VPINSRQ_1_SI_X14(7 * 8) + VMOVDQU(Mem{Base: SI}.Offset(4*8), X15) +} + +// load msg: +// +// X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0) +func LOAD_MSG_AVX_15_9_3_13_11_14_12_0() { + MOVQ(Mem{Base: SI}.Offset(15*8), X12) + MOVQ(Mem{Base: SI}.Offset(3*8), X13) + MOVQ(Mem{Base: SI}.Offset(11*8), X14) + MOVQ(Mem{Base: SI}.Offset(12*8), X15) + VPINSRQ_1_SI_X12(9 * 8) + VPINSRQ_1_SI_X13(13 * 8) + VPINSRQ_1_SI_X14(14 * 8) + VPINSRQ_1_SI_X15_0() +} + +func hashBlocksAVX() { + Implement("hashBlocksAVX") + Attributes(4) + AllocLocal(288) // frame size = 272 + 16 byte alignment + + Load(Param("h"), RAX) + Load(Param("c"), RBX) + Load(Param("flag"), RCX) + Load(Param("blocks").Base(), RSI) + Load(Param("blocks").Len(), RDI) + + MOVQ(RSP, R10) + ADDQ(Imm(15), R10) + ANDQ(I32(^15), R10) + + AVX_c40 := AVX_c40_DATA() + AVX_c48 := AVX_c48_DATA() + VMOVDQU(AVX_c40, X0) + VMOVDQU(AVX_c48, X1) + VMOVDQA(X0, X8) + VMOVDQA(X1, X9) + + AVX_iv3 := AVX_iv3_DATA() + VMOVDQU(AVX_iv3, X0) + VMOVDQA(X0, Mem{Base: R10}.Offset(0)) + XORQ(RCX, Mem{Base: R10}.Offset(0)) // 0(R10) = ·AVX_iv3 ^ (CX || 0) + + VMOVDQU(Mem{Base: AX}.Offset(0), X10) + VMOVDQU(Mem{Base: AX}.Offset(16), X11) + VMOVDQU(Mem{Base: AX}.Offset(32), X2) + VMOVDQU(Mem{Base: AX}.Offset(48), X3) + + MOVQ(Mem{Base: BX}.Offset(0), R8) + MOVQ(Mem{Base: BX}.Offset(8), R9) + + loop_AVX() + noinc_AVX() +} + +func loop_AVX() { + Label("loop") + ADDQ(Imm(128), R8) + CMPQ(R8, Imm(128)) + JGE(LabelRef("noinc")) + INCQ(R9) +} + +func noinc_AVX() { + Label("noinc") + VMOVQ_R8_X15() + VPINSRQ_1_R9_X15() + + AVX_iv0 := AVX_iv0_DATA() + AVX_iv1 := AVX_iv1_DATA() + AVX_iv2 := AVX_iv2_DATA() + VMOVDQA(X10, X0) + VMOVDQA(X11, X1) + VMOVDQU(AVX_iv0, X4) + VMOVDQU(AVX_iv1, X5) + VMOVDQU(AVX_iv2, X6) + + VPXOR(X15, X6, X6) + VMOVDQA(Mem{Base: R10}.Offset(0), X7) + + LOAD_MSG_AVX_0_2_4_6_1_3_5_7() + VMOVDQA(X12, Mem{Base: R10}.Offset(16)) + VMOVDQA(X13, Mem{Base: R10}.Offset(32)) + VMOVDQA(X14, Mem{Base: R10}.Offset(48)) + VMOVDQA(X15, Mem{Base: R10}.Offset(64)) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15) + VMOVDQA(X12, Mem{Base: R10}.Offset(80)) + VMOVDQA(X13, Mem{Base: R10}.Offset(96)) + VMOVDQA(X14, Mem{Base: R10}.Offset(112)) + VMOVDQA(X15, Mem{Base: R10}.Offset(128)) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6) + VMOVDQA(X12, Mem{Base: R10}.Offset(144)) + VMOVDQA(X13, Mem{Base: R10}.Offset(160)) + VMOVDQA(X14, Mem{Base: R10}.Offset(176)) + VMOVDQA(X15, Mem{Base: R10}.Offset(192)) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_1_0_11_5_12_2_7_3() + VMOVDQA(X12, Mem{Base: R10}.Offset(208)) + VMOVDQA(X13, Mem{Base: R10}.Offset(224)) + VMOVDQA(X14, Mem{Base: R10}.Offset(240)) + VMOVDQA(X15, Mem{Base: R10}.Offset(256)) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_11_12_5_15_8_0_2_13() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_2_5_4_15_6_10_0_8() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_9_5_2_10_0_7_4_15() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_2_6_0_8_12_10_11_3() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_0_6_9_8_7_3_2_11() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_5_15_8_2_0_4_6_10() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_6_14_11_0_15_9_3_8() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_12_13_1_10_2_7_4_5() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_15_9_3_13_11_14_12_0() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(16), Mem{Base: R10}.Offset(32), Mem{Base: R10}.Offset(48), Mem{Base: R10}.Offset(64), X15, X8, X9) + SHUFFLE_AVX() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(80), Mem{Base: R10}.Offset(96), Mem{Base: R10}.Offset(112), Mem{Base: R10}.Offset(128), X15, X8, X9) + SHUFFLE_AVX_INV() + + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(144), Mem{Base: R10}.Offset(160), Mem{Base: R10}.Offset(176), Mem{Base: R10}.Offset(192), X15, X8, X9) + SHUFFLE_AVX() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(208), Mem{Base: R10}.Offset(224), Mem{Base: R10}.Offset(240), Mem{Base: R10}.Offset(256), X15, X8, X9) + SHUFFLE_AVX_INV() + + VMOVDQU(Mem{Base: AX}.Offset(32), X14) + VMOVDQU(Mem{Base: AX}.Offset(48), X15) + VPXOR(X0, X10, X10) + VPXOR(X1, X11, X11) + VPXOR(X2, X14, X14) + VPXOR(X3, X15, X15) + VPXOR(X4, X10, X10) + VPXOR(X5, X11, X11) + VPXOR(X6, X14, X2) + VPXOR(X7, X15, X3) + VMOVDQU(X2, Mem{Base: AX}.Offset(32)) + VMOVDQU(X3, Mem{Base: AX}.Offset(48)) + + LEAQ(Mem{Base: SI}.Offset(128), RSI) + SUBQ(Imm(128), RDI) + JNE(LabelRef("loop")) + + VMOVDQU(X10, Mem{Base: AX}.Offset(0)) + VMOVDQU(X11, Mem{Base: AX}.Offset(16)) + + MOVQ(R8, Mem{Base: BX}.Offset(0)) + MOVQ(R9, Mem{Base: BX}.Offset(8)) + VZEROUPPER() + + RET() +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +var ( + AVX2_iv0_ptr, + AVX2_iv1_ptr, + AVX2_c40_ptr, + AVX2_c48_ptr, + + AVX_iv0_ptr, + AVX_iv1_ptr, + AVX_iv2_ptr, + AVX_iv3_ptr, + AVX_c40_ptr, + AVX_c48_ptr *Mem +) + +func AVX2_iv0_DATA() Mem { + if AVX2_iv0_ptr != nil { + return *AVX2_iv0_ptr + } + AVX2_iv0 := GLOBL(ThatPeskyUnicodeDot+"AVX2_iv0", NOPTR|RODATA) + DATA(0x00, U64(0x6a09e667f3bcc908)) + DATA(0x08, U64(0xbb67ae8584caa73b)) + DATA(0x10, U64(0x3c6ef372fe94f82b)) + DATA(0x18, U64(0xa54ff53a5f1d36f1)) + return AVX2_iv0 +} + +func AVX2_iv1_DATA() Mem { + if AVX2_iv1_ptr != nil { + return *AVX2_iv1_ptr + } + AVX2_iv1 := GLOBL(ThatPeskyUnicodeDot+"AVX2_iv1", NOPTR|RODATA) + DATA(0x00, U64(0x510e527fade682d1)) + DATA(0x08, U64(0x9b05688c2b3e6c1f)) + DATA(0x10, U64(0x1f83d9abfb41bd6b)) + DATA(0x18, U64(0x5be0cd19137e2179)) + return AVX2_iv1 +} + +func AVX2_c40_DATA() Mem { + if AVX2_c40_ptr != nil { + return *AVX2_c40_ptr + } + AVX2_c40 := GLOBL(ThatPeskyUnicodeDot+"AVX2_c40", NOPTR|RODATA) + DATA(0x00, U64(0x0201000706050403)) + DATA(0x08, U64(0x0a09080f0e0d0c0b)) + DATA(0x10, U64(0x0201000706050403)) + DATA(0x18, U64(0x0a09080f0e0d0c0b)) + return AVX2_c40 +} + +func AVX2_c48_DATA() Mem { + if AVX2_c48_ptr != nil { + return *AVX2_c48_ptr + } + AVX2_c48 := GLOBL(ThatPeskyUnicodeDot+"AVX2_c48", NOPTR|RODATA) + DATA(0x00, U64(0x0100070605040302)) + DATA(0x08, U64(0x09080f0e0d0c0b0a)) + DATA(0x10, U64(0x0100070605040302)) + DATA(0x18, U64(0x09080f0e0d0c0b0a)) + return AVX2_c48 +} + +func AVX_iv0_DATA() Mem { + if AVX_iv0_ptr != nil { + return *AVX_iv0_ptr + } + AVX_iv0 := GLOBL(ThatPeskyUnicodeDot+"AVX_iv0", NOPTR|RODATA) + DATA(0x00, U64(0x6a09e667f3bcc908)) + DATA(0x08, U64(0xbb67ae8584caa73b)) + return AVX_iv0 +} + +func AVX_iv1_DATA() Mem { + if AVX_iv1_ptr != nil { + return *AVX_iv1_ptr + } + AVX_iv1 := GLOBL(ThatPeskyUnicodeDot+"AVX_iv1", NOPTR|RODATA) + DATA(0x00, U64(0x3c6ef372fe94f82b)) + DATA(0x08, U64(0xa54ff53a5f1d36f1)) + return AVX_iv1 +} + +func AVX_iv2_DATA() Mem { + if AVX_iv2_ptr != nil { + return *AVX_iv2_ptr + } + AVX_iv2 := GLOBL(ThatPeskyUnicodeDot+"AVX_iv2", NOPTR|RODATA) + DATA(0x00, U64(0x510e527fade682d1)) + DATA(0x08, U64(0x9b05688c2b3e6c1f)) + return AVX_iv2 +} + +func AVX_iv3_DATA() Mem { + if AVX_iv3_ptr != nil { + return *AVX_iv3_ptr + } + AVX_iv3 := GLOBL(ThatPeskyUnicodeDot+"AVX_iv3", NOPTR|RODATA) + DATA(0x00, U64(0x1f83d9abfb41bd6b)) + DATA(0x08, U64(0x5be0cd19137e2179)) + return AVX_iv3 +} + +func AVX_c40_DATA() Mem { + if AVX_c40_ptr != nil { + return *AVX_c40_ptr + } + AVX_c40 := GLOBL(ThatPeskyUnicodeDot+"AVX_c40", NOPTR|RODATA) + DATA(0x00, U64(0x0201000706050403)) + DATA(0x08, U64(0x0a09080f0e0d0c0b)) + return AVX_c40 +} + +func AVX_c48_DATA() Mem { + if AVX_c48_ptr != nil { + return *AVX_c48_ptr + } + AVX_c48 := GLOBL(ThatPeskyUnicodeDot+"AVX_c48", NOPTR|RODATA) + DATA(0x00, U64(0x0100070605040302)) + DATA(0x08, U64(0x09080f0e0d0c0b0a)) + return AVX_c48 +} diff --git a/blake2b/_asm/AVX2/go.mod b/blake2b/_asm/AVX2/go.mod new file mode 100644 index 0000000000..c49f1b11ae --- /dev/null +++ b/blake2b/_asm/AVX2/go.mod @@ -0,0 +1,16 @@ +module blake2b/_asm/AVX2 + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 + +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/blake2b/_asm/AVX2/go.sum b/blake2b/_asm/AVX2/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/blake2b/_asm/AVX2/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/blake2b/blake2bAVX2_amd64.s b/blake2b/blake2bAVX2_amd64.s index 9ae8206c20..f75162e039 100644 --- a/blake2b/blake2bAVX2_amd64.s +++ b/blake2b/blake2bAVX2_amd64.s @@ -1,722 +1,4517 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2bAVX2_amd64_asm.go -out ../../blake2bAVX2_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16 - -#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39 -#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93 -#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e -#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93 -#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39 - -#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \ - VPADDQ m0, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m1, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y1_Y1; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y3_Y3; \ - VPADDQ m2, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m3, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y3_Y3; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y1_Y1 - -#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E -#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26 -#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E -#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36 -#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E - -#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n -#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n -#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n -#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n -#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n - -#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01 -#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01 -#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01 -#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01 -#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01 - -#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01 - -#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8 -#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01 - -// load msg: Y12 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y12, Y12 - -// load msg: Y13 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \ - VMOVQ_SI_X13(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X13(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y13, Y13 - -// load msg: Y14 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \ - VMOVQ_SI_X14(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X14(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y14, Y14 - -// load msg: Y15 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \ - VMOVQ_SI_X15(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X15(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X11(6*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \ - LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \ - LOAD_MSG_AVX2_Y15(9, 11, 13, 15) - -#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \ - LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \ - LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \ - VMOVQ_SI_X11(11*8); \ - VPSHUFD $0x4E, 0*8(SI), X14; \ - VPINSRQ_1_SI_X11(5*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(12, 2, 7, 3) - -#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \ - VMOVQ_SI_X11(5*8); \ - VMOVDQU 11*8(SI), X12; \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - VMOVQ_SI_X13(8*8); \ - VMOVQ_SI_X11(2*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X11(13*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \ - LOAD_MSG_AVX2_Y15(14, 6, 1, 4) - -#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \ - LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \ - LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \ - LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \ - VMOVQ_SI_X15(6*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X15(10*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \ - LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X13(7*8); \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \ - LOAD_MSG_AVX2_Y15(1, 12, 8, 13) - -#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \ - LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \ - LOAD_MSG_AVX2_Y15(13, 5, 14, 9) - -#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \ - LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \ - LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \ - VMOVQ_SI_X14_0; \ - VPSHUFD $0x4E, 8*8(SI), X11; \ - VPINSRQ_1_SI_X14(6*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(7, 3, 2, 11) - -#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \ - LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \ - LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \ - LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \ - VMOVQ_SI_X15_0; \ - VMOVQ_SI_X11(6*8); \ - VPINSRQ_1_SI_X15(4*8); \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \ - VMOVQ_SI_X12(6*8); \ - VMOVQ_SI_X11(11*8); \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \ - VMOVQ_SI_X11(1*8); \ - VMOVDQU 12*8(SI), X14; \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - VMOVQ_SI_X15(2*8); \ - VMOVDQU 4*8(SI), X11; \ - VPINSRQ_1_SI_X15(7*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \ - LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \ - VMOVQ_SI_X13(2*8); \ - VPSHUFD $0x4E, 5*8(SI), X11; \ - VPINSRQ_1_SI_X13(4*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \ - VMOVQ_SI_X15(11*8); \ - VMOVQ_SI_X11(12*8); \ - VPINSRQ_1_SI_X15(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y15, Y15 - // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, DX - ADDQ $31, DX - ANDQ $~31, DX - - MOVQ CX, 16(DX) - XORQ CX, CX - MOVQ CX, 24(DX) - - VMOVDQU ·AVX2_c40<>(SB), Y4 - VMOVDQU ·AVX2_c48<>(SB), Y5 - - VMOVDQU 0(AX), Y8 +// Requires: AVX, AVX2 +TEXT ·hashBlocksAVX2(SB), NOSPLIT, $320-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, DX + ADDQ $+31, DX + ANDQ $-32, DX + MOVQ CX, 16(DX) + XORQ CX, CX + MOVQ CX, 24(DX) + VMOVDQU ·AVX2_c40<>+0(SB), Y4 + VMOVDQU ·AVX2_c48<>+0(SB), Y5 + VMOVDQU (AX), Y8 VMOVDQU 32(AX), Y9 - VMOVDQU ·AVX2_iv0<>(SB), Y6 - VMOVDQU ·AVX2_iv1<>(SB), Y7 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 - MOVQ R9, 8(DX) + VMOVDQU ·AVX2_iv0<>+0(SB), Y6 + VMOVDQU ·AVX2_iv1<>+0(SB), Y7 + MOVQ (BX), R8 + MOVQ 8(BX), R9 + MOVQ R9, 8(DX) loop: - ADDQ $128, R8 - MOVQ R8, 0(DX) - CMPQ R8, $128 + ADDQ $0x80, R8 + MOVQ R8, (DX) + CMPQ R8, $0x80 JGE noinc INCQ R9 MOVQ R9, 8(DX) noinc: - VMOVDQA Y8, Y0 - VMOVDQA Y9, Y1 - VMOVDQA Y6, Y2 - VPXOR 0(DX), Y7, Y3 - - LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() - VMOVDQA Y12, 32(DX) - VMOVDQA Y13, 64(DX) - VMOVDQA Y14, 96(DX) - VMOVDQA Y15, 128(DX) - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() - VMOVDQA Y12, 160(DX) - VMOVDQA Y13, 192(DX) - VMOVDQA Y14, 224(DX) - VMOVDQA Y15, 256(DX) - - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - - ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5) - ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5) - - VPXOR Y0, Y8, Y8 - VPXOR Y1, Y9, Y9 - VPXOR Y2, Y8, Y8 - VPXOR Y3, Y9, Y9 - - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop - - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - - VMOVDQU Y8, 0(AX) - VMOVDQU Y9, 32(AX) + VMOVDQA Y8, Y0 + VMOVDQA Y9, Y1 + VMOVDQA Y6, Y2 + VPXOR (DX), Y7, Y3 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 32(DX) + VMOVDQA Y13, 64(DX) + VMOVDQA Y14, 96(DX) + VMOVDQA Y15, 128(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x48 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + VPSHUFD $0x4e, (SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x28 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 160(DX) + VMOVDQA Y13, 192(DX) + VMOVDQA Y14, 224(DX) + VMOVDQA Y15, 256(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + VMOVDQU 88(SI), X12 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + VPSHUFD $0x4e, 64(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x10 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + VMOVDQU 96(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + VMOVDQU 32(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + VPSHUFD $0x4e, 40(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 32(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 64(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 96(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 128(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 160(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 192(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 224(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 256(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPXOR Y0, Y8, Y8 + VPXOR Y1, Y9, Y9 + VPXOR Y2, Y8, Y8 + VPXOR Y3, Y9, Y9 + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VMOVDQU Y8, (AX) + VMOVDQU Y9, 32(AX) VZEROUPPER - RET -#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA -#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB -#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF -#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD -#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE - -#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF -#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF - -#define SHUFFLE_AVX() \ - VMOVDQA X6, X13; \ - VMOVDQA X2, X14; \ - VMOVDQA X4, X6; \ - VPUNPCKLQDQ_X13_X13_X15; \ - VMOVDQA X5, X4; \ - VMOVDQA X6, X5; \ - VPUNPCKHQDQ_X15_X7_X6; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X13_X7; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VPUNPCKHQDQ_X15_X2_X2; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X3_X3; \ - -#define SHUFFLE_AVX_INV() \ - VMOVDQA X2, X13; \ - VMOVDQA X4, X14; \ - VPUNPCKLQDQ_X2_X2_X15; \ - VMOVDQA X5, X4; \ - VPUNPCKHQDQ_X15_X3_X2; \ - VMOVDQA X14, X5; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VMOVDQA X6, X14; \ - VPUNPCKHQDQ_X15_X13_X3; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X6_X6; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X7_X7; \ - -#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - VPADDQ m0, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m1, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFD $-79, v6, v6; \ - VPSHUFD $-79, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPSHUFB c40, v2, v2; \ - VPSHUFB c40, v3, v3; \ - VPADDQ m2, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m3, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFB c48, v6, v6; \ - VPSHUFB c48, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPADDQ v2, v2, t0; \ - VPSRLQ $63, v2, v2; \ - VPXOR t0, v2, v2; \ - VPADDQ v3, v3, t0; \ - VPSRLQ $63, v3, v3; \ - VPXOR t0, v3, v3 - -// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7) -// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0 -#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X13(i2*8); \ - VMOVQ_SI_X14(i4*8); \ - VMOVQ_SI_X15(i6*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X13(i3*8); \ - VPINSRQ_1_SI_X14(i5*8); \ - VPINSRQ_1_SI_X15(i7*8) - -// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7) -#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(1*8); \ - VMOVQ_SI_X15(5*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X13(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(7*8) - -// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3) -#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \ - VPSHUFD $0x4E, 0*8(SI), X12; \ - VMOVQ_SI_X13(11*8); \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(7*8); \ - VPINSRQ_1_SI_X13(5*8); \ - VPINSRQ_1_SI_X14(2*8); \ - VPINSRQ_1_SI_X15(3*8) - -// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13) -#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \ - VMOVDQU 11*8(SI), X12; \ - VMOVQ_SI_X13(5*8); \ - VMOVQ_SI_X14(8*8); \ - VMOVQ_SI_X15(2*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14_0; \ - VPINSRQ_1_SI_X15(13*8) - -// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8) -#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(6*8); \ - VMOVQ_SI_X15_0; \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(8*8) +DATA ·AVX2_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +DATA ·AVX2_c40<>+16(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+24(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX2_c40<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15) -#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \ - VMOVQ_SI_X12(9*8); \ - VMOVQ_SI_X13(2*8); \ - VMOVQ_SI_X14_0; \ - VMOVQ_SI_X15(4*8); \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VPINSRQ_1_SI_X15(15*8) +DATA ·AVX2_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +DATA ·AVX2_c48<>+16(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+24(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX2_c48<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3) -#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(11*8); \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X13(8*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(3*8) +DATA ·AVX2_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX2_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +DATA ·AVX2_iv0<>+16(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX2_iv0<>+24(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX2_iv0<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11) -#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \ - MOVQ 0*8(SI), X12; \ - VPSHUFD $0x4E, 8*8(SI), X13; \ - MOVQ 7*8(SI), X14; \ - MOVQ 2*8(SI), X15; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(11*8) - -// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8) -#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \ - MOVQ 6*8(SI), X12; \ - MOVQ 11*8(SI), X13; \ - MOVQ 15*8(SI), X14; \ - MOVQ 3*8(SI), X15; \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X14(9*8); \ - VPINSRQ_1_SI_X15(8*8) - -// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10) -#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \ - MOVQ 5*8(SI), X12; \ - MOVQ 8*8(SI), X13; \ - MOVQ 0*8(SI), X14; \ - MOVQ 6*8(SI), X15; \ - VPINSRQ_1_SI_X12(15*8); \ - VPINSRQ_1_SI_X13(2*8); \ - VPINSRQ_1_SI_X14(4*8); \ - VPINSRQ_1_SI_X15(10*8) - -// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5) -#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \ - VMOVDQU 12*8(SI), X12; \ - MOVQ 1*8(SI), X13; \ - MOVQ 2*8(SI), X14; \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VMOVDQU 4*8(SI), X15 - -// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0) -#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \ - MOVQ 15*8(SI), X12; \ - MOVQ 3*8(SI), X13; \ - MOVQ 11*8(SI), X14; \ - MOVQ 12*8(SI), X15; \ - VPINSRQ_1_SI_X12(9*8); \ - VPINSRQ_1_SI_X13(13*8); \ - VPINSRQ_1_SI_X14(14*8); \ - VPINSRQ_1_SI_X15_0 +DATA ·AVX2_iv1<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX2_iv1<>+8(SB)/8, $0x9b05688c2b3e6c1f +DATA ·AVX2_iv1<>+16(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX2_iv1<>+24(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX2_iv1<>(SB), RODATA|NOPTR, $32 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - VMOVDQU ·AVX_c40<>(SB), X0 - VMOVDQU ·AVX_c48<>(SB), X1 +// Requires: AVX, SSE2 +TEXT ·hashBlocksAVX(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + VMOVDQU ·AVX_c40<>+0(SB), X0 + VMOVDQU ·AVX_c48<>+0(SB), X1 VMOVDQA X0, X8 VMOVDQA X1, X9 - - VMOVDQU ·AVX_iv3<>(SB), X0 - VMOVDQA X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0) - - VMOVDQU 0(AX), X10 + VMOVDQU ·AVX_iv3<>+0(SB), X0 + VMOVDQA X0, (R10) + XORQ CX, (R10) + VMOVDQU (AX), X10 VMOVDQU 16(AX), X11 VMOVDQU 32(AX), X2 VMOVDQU 48(AX), X3 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - VMOVQ_R8_X15 - VPINSRQ_1_R9_X15 - + BYTE $0xc4 + BYTE $0x41 + BYTE $0xf9 + BYTE $0x6e + BYTE $0xf8 + BYTE $0xc4 + BYTE $0x43 + BYTE $0x81 + BYTE $0x22 + BYTE $0xf9 + BYTE $0x01 VMOVDQA X10, X0 VMOVDQA X11, X1 - VMOVDQU ·AVX_iv0<>(SB), X4 - VMOVDQU ·AVX_iv1<>(SB), X5 - VMOVDQU ·AVX_iv2<>(SB), X6 - + VMOVDQU ·AVX_iv0<>+0(SB), X4 + VMOVDQU ·AVX_iv1<>+0(SB), X5 + VMOVDQU ·AVX_iv2<>+0(SB), X6 VPXOR X15, X6, X6 - VMOVDQA 0(R10), X7 - - LOAD_MSG_AVX_0_2_4_6_1_3_5_7() + VMOVDQA (R10), X7 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 VMOVDQA X12, 16(R10) VMOVDQA X13, 32(R10) VMOVDQA X14, 48(R10) VMOVDQA X15, 64(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 VMOVDQA X12, 80(R10) VMOVDQA X13, 96(R10) VMOVDQA X14, 112(R10) VMOVDQA X15, 128(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 VMOVDQA X12, 144(R10) VMOVDQA X13, 160(R10) VMOVDQA X14, 176(R10) VMOVDQA X15, 192(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_1_0_11_5_12_2_7_3() + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPSHUFD $0x4e, (SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 VMOVDQA X12, 208(R10) VMOVDQA X13, 224(R10) VMOVDQA X14, 240(R10) VMOVDQA X15, 256(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_11_12_5_15_8_0_2_13() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_2_5_4_15_6_10_0_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_9_5_2_10_0_7_4_15() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_2_6_0_8_12_10_11_3() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_0_6_9_8_7_3_2_11() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_5_15_8_2_0_4_6_10() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_6_14_11_0_15_9_3_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_12_13_1_10_2_7_4_5() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_15_9_3_13_11_14_12_0() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VMOVDQU 88(SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x36 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ (SI), X12 + VPSHUFD $0x4e, 64(SI), X13 + MOVQ 56(SI), X14 + MOVQ 16(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 40(SI), X12 + MOVQ 64(SI), X13 + MOVQ (SI), X14 + MOVQ 48(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + MOVQ 48(SI), X12 + MOVQ 88(SI), X13 + MOVQ 120(SI), X14 + MOVQ 24(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VMOVDQU 96(SI), X12 + MOVQ 8(SI), X13 + MOVQ 16(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + VMOVDQU 32(SI), X15 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 120(SI), X12 + MOVQ 24(SI), X13 + MOVQ 88(SI), X14 + MOVQ 96(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x3e + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 16(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 32(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 48(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 64(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 80(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 96(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 112(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 128(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 144(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 160(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 176(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 192(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 208(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 224(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 240(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 256(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff VMOVDQU 32(AX), X14 VMOVDQU 48(AX), X15 VPXOR X0, X10, X10 @@ -729,16 +4524,36 @@ noinc: VPXOR X7, X15, X3 VMOVDQU X2, 32(AX) VMOVDQU X3, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + VMOVDQU X10, (AX) + VMOVDQU X11, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VZEROUPPER + RET - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·AVX_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX_c40<>(SB), RODATA|NOPTR, $16 - VMOVDQU X10, 0(AX) - VMOVDQU X11, 16(AX) +DATA ·AVX_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX_c48<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - VZEROUPPER +DATA ·AVX_iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX_iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX_iv3<>(SB), RODATA|NOPTR, $16 - RET +DATA ·AVX_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·AVX_iv0<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX_iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX_iv1<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX_iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·AVX_iv2<>(SB), RODATA|NOPTR, $16 From 82942cf1d8d34067e576572f2e00014a78c1efd8 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Tue, 23 Jul 2024 16:15:38 -0400 Subject: [PATCH 35/57] blake2b: port blake2b_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="blake2b/blake2b_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: I6dd59fb0b0365674aa5e43b69a57ea60fbcc4ba1 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600456 Reviewed-by: Dmitri Shuralyov Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda --- blake2b/_asm/standard/blake2b_amd64_asm.go | 361 +++++ blake2b/_asm/standard/go.mod | 15 + blake2b/_asm/standard/go.sum | 12 + blake2b/blake2b_amd64.s | 1681 +++++++++++++++++--- 4 files changed, 1810 insertions(+), 259 deletions(-) create mode 100644 blake2b/_asm/standard/blake2b_amd64_asm.go create mode 100644 blake2b/_asm/standard/go.mod create mode 100644 blake2b/_asm/standard/go.sum diff --git a/blake2b/_asm/standard/blake2b_amd64_asm.go b/blake2b/_asm/standard/blake2b_amd64_asm.go new file mode 100644 index 0000000000..a34db3fca5 --- /dev/null +++ b/blake2b/_asm/standard/blake2b_amd64_asm.go @@ -0,0 +1,361 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/blake2b" +) + +//go:generate go run . -out ../../blake2b_amd64.s -pkg blake2b + +const ThatPeskyUnicodeDot = "\u00b7" + +var iv0_DATA_ptr, iv1_DATA_ptr, iv2_DATA_ptr, iv3_DATA_ptr, c40_DATA_ptr, c48_DATA_ptr *Mem + +func main() { + Package("golang.org/x/crypto/blake2b") + ConstraintExpr("amd64,gc,!purego") + hashBlocksSSE4() + Generate() +} + +func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v6, t1) + PUNPCKLQDQ(v6, t2) + PUNPCKHQDQ(v7, v6) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(v7, t2) + MOVO(t1, v7) + MOVO(v2, t1) + PUNPCKHQDQ(t2, v7) + PUNPCKLQDQ(v3, t2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v3) +} + +func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v2, t1) + PUNPCKLQDQ(v2, t2) + PUNPCKHQDQ(v3, v2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(v3, t2) + MOVO(t1, v3) + MOVO(v6, t1) + PUNPCKHQDQ(t2, v3) + PUNPCKLQDQ(v7, t2) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v7) +} + +func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7 VecPhysical, m0, m1, m2, m3 Op, t0, c40, c48 VecPhysical) { + PADDQ(m0, v0) + PADDQ(m1, v1) + PADDQ(v2, v0) + PADDQ(v3, v1) + PXOR(v0, v6) + PXOR(v1, v7) + PSHUFD(Imm(0xB1), v6, v6) + PSHUFD(Imm(0xB1), v7, v7) + PADDQ(v6, v4) + PADDQ(v7, v5) + PXOR(v4, v2) + PXOR(v5, v3) + PSHUFB(c40, v2) + PSHUFB(c40, v3) + PADDQ(m2, v0) + PADDQ(m3, v1) + PADDQ(v2, v0) + PADDQ(v3, v1) + PXOR(v0, v6) + PXOR(v1, v7) + PSHUFB(c48, v6) + PSHUFB(c48, v7) + PADDQ(v6, v4) + PADDQ(v7, v5) + PXOR(v4, v2) + PXOR(v5, v3) + MOVOU(v2, t0) + PADDQ(v2, t0) + PSRLQ(Imm(63), v2) + PXOR(t0, v2) + MOVOU(v3, t0) + PADDQ(v3, t0) + PSRLQ(Imm(63), v3) + PXOR(t0, v3) +} + +func LOAD_MSG(m0, m1, m2, m3 VecPhysical, src GPPhysical, i0, i1, i2, i3, i4, i5, i6, i7 int) { + MOVQ(Mem{Base: src}.Offset(i0*8), m0) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i1*8), m0) + MOVQ(Mem{Base: src}.Offset(i2*8), m1) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i3*8), m1) + MOVQ(Mem{Base: src}.Offset(i4*8), m2) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i5*8), m2) + MOVQ(Mem{Base: src}.Offset(i6*8), m3) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i7*8), m3) +} + +func hashBlocksSSE4() { + Implement("hashBlocksSSE4") + Attributes(4) + AllocLocal(288) // frame size = 272 + 16 byte alignment + + Load(Param("h"), RAX) + Load(Param("c"), RBX) + Load(Param("flag"), RCX) + Load(Param("blocks").Base(), RSI) + Load(Param("blocks").Len(), RDI) + + MOVQ(RSP, R10) + ADDQ(Imm(15), R10) + ANDQ(I32(-16), R10) + + iv3 := iv3_DATA() + MOVOU(iv3, X0) + MOVO(X0, Mem{Base: R10}.Offset(0)) + XORQ(RCX, Mem{Base: R10}.Offset(0)) // 0(R10) = ·iv3 ^ (CX || 0) + + c40 := c40_DATA() + c48 := c48_DATA() + MOVOU(c40, X13) + MOVOU(c48, X14) + + MOVOU(Mem{Base: AX}.Offset(0), X12) + MOVOU(Mem{Base: AX}.Offset(16), X15) + + MOVQ(Mem{Base: BX}.Offset(0), R8) + MOVQ(Mem{Base: BX}.Offset(8), R9) + + Label("loop") + ADDQ(Imm(128), R8) + CMPQ(R8, Imm(128)) + JGE(LabelRef("noinc")) + INCQ(R9) + + Label("noinc") + MOVQ(R8, X8) + PINSRQ(Imm(1), R9, X8) + + iv0 := iv0_DATA() + iv1 := iv1_DATA() + iv2 := iv2_DATA() + + MOVO(X12, X0) + MOVO(X15, X1) + MOVOU(Mem{Base: AX}.Offset(32), X2) + MOVOU(Mem{Base: AX}.Offset(48), X3) + MOVOU(iv0, X4) + MOVOU(iv1, X5) + MOVOU(iv2, X6) + + PXOR(X8, X6) + MOVO(Mem{Base: R10}.Offset(0), X7) + + LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) + MOVO(X8, Mem{Base: R10}.Offset(16)) + MOVO(X9, Mem{Base: R10}.Offset(32)) + MOVO(X10, Mem{Base: R10}.Offset(48)) + MOVO(X11, Mem{Base: R10}.Offset(64)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) + MOVO(X8, Mem{Base: R10}.Offset(80)) + MOVO(X9, Mem{Base: R10}.Offset(96)) + MOVO(X10, Mem{Base: R10}.Offset(112)) + MOVO(X11, Mem{Base: R10}.Offset(128)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) + MOVO(X8, Mem{Base: R10}.Offset(144)) + MOVO(X9, Mem{Base: R10}.Offset(160)) + MOVO(X10, Mem{Base: R10}.Offset(176)) + MOVO(X11, Mem{Base: R10}.Offset(192)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) + MOVO(X8, Mem{Base: R10}.Offset(208)) + MOVO(X9, Mem{Base: R10}.Offset(224)) + MOVO(X10, Mem{Base: R10}.Offset(240)) + MOVO(X11, Mem{Base: R10}.Offset(256)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(16), Mem{Base: R10}.Offset(32), Mem{Base: R10}.Offset(48), Mem{Base: R10}.Offset(64), X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(80), Mem{Base: R10}.Offset(96), Mem{Base: R10}.Offset(112), Mem{Base: R10}.Offset(128), X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(144), Mem{Base: R10}.Offset(160), Mem{Base: R10}.Offset(176), Mem{Base: R10}.Offset(192), X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(208), Mem{Base: R10}.Offset(224), Mem{Base: R10}.Offset(240), Mem{Base: R10}.Offset(256), X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + MOVOU(Mem{Base: AX}.Offset(32), X10) + MOVOU(Mem{Base: AX}.Offset(48), X11) + PXOR(X0, X12) + PXOR(X1, X15) + PXOR(X2, X10) + PXOR(X3, X11) + PXOR(X4, X12) + PXOR(X5, X15) + PXOR(X6, X10) + PXOR(X7, X11) + MOVOU(X10, Mem{Base: AX}.Offset(32)) + MOVOU(X11, Mem{Base: AX}.Offset(48)) + + LEAQ(Mem{Base: SI}.Offset(128), RSI) + SUBQ(Imm(128), RDI) + JNE(LabelRef("loop")) + + MOVOU(X12, Mem{Base: AX}.Offset(0)) + MOVOU(X15, Mem{Base: AX}.Offset(16)) + + MOVQ(R8, Mem{Base: BX}.Offset(0)) + MOVQ(R9, Mem{Base: BX}.Offset(8)) + + RET() +} + +// #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +func iv0_DATA() Mem { + if iv0_DATA_ptr != nil { + return *iv0_DATA_ptr + } + + iv0 := GLOBL(ThatPeskyUnicodeDot+"iv0", NOPTR|RODATA) + iv0_DATA_ptr = &iv0 + DATA(0x00, U64(0x6a09e667f3bcc908)) + DATA(0x08, U64(0xbb67ae8584caa73b)) + return iv0 +} + +func iv1_DATA() Mem { + if iv1_DATA_ptr != nil { + return *iv1_DATA_ptr + } + + iv1 := GLOBL(ThatPeskyUnicodeDot+"iv1", NOPTR|RODATA) + iv1_DATA_ptr = &iv1 + DATA(0x00, U64(0x3c6ef372fe94f82b)) + DATA(0x08, U64(0xa54ff53a5f1d36f1)) + return iv1 +} + +func iv2_DATA() Mem { + if iv2_DATA_ptr != nil { + return *iv2_DATA_ptr + } + + iv2 := GLOBL(ThatPeskyUnicodeDot+"iv2", NOPTR|RODATA) + iv2_DATA_ptr = &iv2 + DATA(0x00, U64(0x510e527fade682d1)) + DATA(0x08, U64(0x9b05688c2b3e6c1f)) + return iv2 +} + +func iv3_DATA() Mem { + if iv3_DATA_ptr != nil { + return *iv3_DATA_ptr + } + + iv3 := GLOBL(ThatPeskyUnicodeDot+"iv3", NOPTR|RODATA) + iv3_DATA_ptr = &iv3 + DATA(0x00, U64(0x1f83d9abfb41bd6b)) + DATA(0x08, U64(0x5be0cd19137e2179)) + return iv3 +} + +func c40_DATA() Mem { + if c40_DATA_ptr != nil { + return *c40_DATA_ptr + } + + c40 := GLOBL(ThatPeskyUnicodeDot+"c40", NOPTR|RODATA) + c40_DATA_ptr = &c40 + DATA(0x00, U64(0x0201000706050403)) + DATA(0x08, U64(0x0a09080f0e0d0c0b)) + return c40 +} + +func c48_DATA() Mem { + if c48_DATA_ptr != nil { + return *c48_DATA_ptr + } + + c48 := GLOBL(ThatPeskyUnicodeDot+"c48", NOPTR|RODATA) + c48_DATA_ptr = &c48 + DATA(0x00, U64(0x0100070605040302)) + DATA(0x08, U64(0x09080f0e0d0c0b0a)) + return c48 +} diff --git a/blake2b/_asm/standard/go.mod b/blake2b/_asm/standard/go.mod new file mode 100644 index 0000000000..8063f1b9c3 --- /dev/null +++ b/blake2b/_asm/standard/go.mod @@ -0,0 +1,15 @@ +module blake2b/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/blake2b/_asm/standard/go.sum b/blake2b/_asm/standard/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/blake2b/_asm/standard/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/blake2b/blake2b_amd64.s b/blake2b/blake2b_amd64.s index adfac00c15..9a0ce21244 100644 --- a/blake2b/blake2b_amd64.s +++ b/blake2b/blake2b_amd64.s @@ -1,278 +1,1441 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - PADDQ m0, v0; \ - PADDQ m1, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v6, v6; \ - PSHUFD $0xB1, v7, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - PSHUFB c40, v2; \ - PSHUFB c40, v3; \ - PADDQ m2, v0; \ - PADDQ m3, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFB c48, v6; \ - PSHUFB c48, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - MOVOU v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVOU v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \ - MOVQ i0*8(src), m0; \ - PINSRQ $1, i1*8(src), m0; \ - MOVQ i2*8(src), m1; \ - PINSRQ $1, i3*8(src), m1; \ - MOVQ i4*8(src), m2; \ - PINSRQ $1, i5*8(src), m2; \ - MOVQ i6*8(src), m3; \ - PINSRQ $1, i7*8(src), m3 - // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - MOVOU ·iv3<>(SB), X0 - MOVO X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0) - - MOVOU ·c40<>(SB), X13 - MOVOU ·c48<>(SB), X14 - - MOVOU 0(AX), X12 +// Requires: SSE2, SSE4.1, SSSE3 +TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + MOVOU ·iv3<>+0(SB), X0 + MOVO X0, (R10) + XORQ CX, (R10) + MOVOU ·c40<>+0(SB), X13 + MOVOU ·c48<>+0(SB), X14 + MOVOU (AX), X12 MOVOU 16(AX), X15 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - MOVQ R8, X8 - PINSRQ $1, R9, X8 - - MOVO X12, X0 - MOVO X15, X1 - MOVOU 32(AX), X2 - MOVOU 48(AX), X3 - MOVOU ·iv0<>(SB), X4 - MOVOU ·iv1<>(SB), X5 - MOVOU ·iv2<>(SB), X6 - - PXOR X8, X6 - MOVO 0(R10), X7 - - LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) - MOVO X8, 16(R10) - MOVO X9, 32(R10) - MOVO X10, 48(R10) - MOVO X11, 64(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) - MOVO X8, 80(R10) - MOVO X9, 96(R10) - MOVO X10, 112(R10) - MOVO X11, 128(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) - MOVO X8, 144(R10) - MOVO X9, 160(R10) - MOVO X10, 176(R10) - MOVO X11, 192(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) - MOVO X8, 208(R10) - MOVO X9, 224(R10) - MOVO X10, 240(R10) - MOVO X11, 256(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + MOVQ R8, X8 + PINSRQ $0x01, R9, X8 + MOVO X12, X0 + MOVO X15, X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU ·iv0<>+0(SB), X4 + MOVOU ·iv1<>+0(SB), X5 + MOVOU ·iv2<>+0(SB), X6 + PXOR X8, X6 + MOVO (R10), X7 + MOVQ (SI), X8 + PINSRQ $0x01, 16(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 48(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 40(SI), X11 + PINSRQ $0x01, 56(SI), X11 + MOVO X8, 16(R10) + MOVO X9, 32(R10) + MOVO X10, 48(R10) + MOVO X11, 64(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 64(SI), X8 + PINSRQ $0x01, 80(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 112(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 88(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 120(SI), X11 + MOVO X8, 80(R10) + MOVO X9, 96(R10) + MOVO X10, 112(R10) + MOVO X11, 128(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 112(SI), X8 + PINSRQ $0x01, 32(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 80(SI), X10 + PINSRQ $0x01, 64(SI), X10 + MOVQ 120(SI), X11 + PINSRQ $0x01, 48(SI), X11 + MOVO X8, 144(R10) + MOVO X9, 160(R10) + MOVO X10, 176(R10) + MOVO X11, 192(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 8(SI), X8 + PINSRQ $0x01, (SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, 40(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 16(SI), X10 + MOVQ 56(SI), X11 + PINSRQ $0x01, 24(SI), X11 + MOVO X8, 208(R10) + MOVO X9, 224(R10) + MOVO X10, 240(R10) + MOVO X11, 256(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 88(SI), X8 + PINSRQ $0x01, 96(SI), X8 + MOVQ 40(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 64(SI), X10 + PINSRQ $0x01, (SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 80(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 72(SI), X9 + MOVQ 112(SI), X10 + PINSRQ $0x01, 48(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 32(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 56(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 104(SI), X9 + PINSRQ $0x01, 88(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 8(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, 112(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 16(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 48(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ (SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 72(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 16(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 120(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 112(SI), X8 + PINSRQ $0x01, 88(SI), X8 + MOVQ 48(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 96(SI), X10 + MOVQ 64(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 16(SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ (SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ 88(SI), X11 + PINSRQ $0x01, 24(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 32(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 120(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 104(SI), X10 + PINSRQ $0x01, 40(SI), X10 + MOVQ 112(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 96(SI), X8 + PINSRQ $0x01, 8(SI), X8 + MOVQ 112(SI), X9 + PINSRQ $0x01, 32(SI), X9 + MOVQ 40(SI), X10 + PINSRQ $0x01, 120(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ (SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 56(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 88(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 104(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 40(SI), X8 + PINSRQ $0x01, 120(SI), X8 + MOVQ 64(SI), X9 + PINSRQ $0x01, 16(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 48(SI), X8 + PINSRQ $0x01, 112(SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, (SI), X9 + MOVQ 120(SI), X10 + PINSRQ $0x01, 72(SI), X10 + MOVQ 24(SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 96(SI), X8 + PINSRQ $0x01, 104(SI), X8 + MOVQ 8(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 80(SI), X8 + PINSRQ $0x01, 64(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 120(SI), X8 + PINSRQ $0x01, 72(SI), X8 + MOVQ 24(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, (SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 16(R10), X0 + PADDQ 32(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 48(R10), X0 + PADDQ 64(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 80(R10), X0 + PADDQ 96(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 112(R10), X0 + PADDQ 128(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 144(R10), X0 + PADDQ 160(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 176(R10), X0 + PADDQ 192(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 208(R10), X0 + PADDQ 224(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 240(R10), X0 + PADDQ 256(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU 32(AX), X10 + MOVOU 48(AX), X11 + PXOR X0, X12 + PXOR X1, X15 + PXOR X2, X10 + PXOR X3, X11 + PXOR X4, X12 + PXOR X5, X15 + PXOR X6, X10 + PXOR X7, X11 + MOVOU X10, 32(AX) + MOVOU X11, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVOU X12, (AX) + MOVOU X15, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + RET - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) +DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·iv3<>(SB), RODATA|NOPTR, $16 - MOVOU 32(AX), X10 - MOVOU 48(AX), X11 - PXOR X0, X12 - PXOR X1, X15 - PXOR X2, X10 - PXOR X3, X11 - PXOR X4, X12 - PXOR X5, X15 - PXOR X6, X10 - PXOR X7, X11 - MOVOU X10, 32(AX) - MOVOU X11, 48(AX) +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 - MOVOU X12, 0(AX) - MOVOU X15, 16(AX) +DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·iv0<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) +DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·iv1<>(SB), RODATA|NOPTR, $16 - RET +DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·iv2<>(SB), RODATA|NOPTR, $16 From 620dfbc770bb652335dab79ae80f6c9bdb1a7321 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Fri, 26 Jul 2024 02:08:45 -0400 Subject: [PATCH 36/57] salsa20/salsa: Port salsa20_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE":salsa20/salsa/salsa20_amd64.s) \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ salsa20/salsa/salsa20_amd64.s \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: Ica0bb06f8b074ad566a979d33ddc81d8a38491b1 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/601217 LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov --- salsa20/salsa/_asm/go.mod | 14 + salsa20/salsa/_asm/go.sum | 10 + salsa20/salsa/_asm/salsa20_amd64_asm.go | 932 ++++++++++++ salsa20/salsa/salsa20_amd64.s | 1742 +++++++++++------------ 4 files changed, 1827 insertions(+), 871 deletions(-) create mode 100644 salsa20/salsa/_asm/go.mod create mode 100644 salsa20/salsa/_asm/go.sum create mode 100644 salsa20/salsa/_asm/salsa20_amd64_asm.go diff --git a/salsa20/salsa/_asm/go.mod b/salsa20/salsa/_asm/go.mod new file mode 100644 index 0000000000..0cf7f76881 --- /dev/null +++ b/salsa20/salsa/_asm/go.mod @@ -0,0 +1,14 @@ +module salsa20/salsa/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/salsa20/salsa/_asm/go.sum b/salsa20/salsa/_asm/go.sum new file mode 100644 index 0000000000..e5970800fb --- /dev/null +++ b/salsa20/salsa/_asm/go.sum @@ -0,0 +1,10 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/salsa20/salsa/_asm/salsa20_amd64_asm.go b/salsa20/salsa/_asm/salsa20_amd64_asm.go new file mode 100644 index 0000000000..6546791c4c --- /dev/null +++ b/salsa20/salsa/_asm/salsa20_amd64_asm.go @@ -0,0 +1,932 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This code was translated into a form compatible with 6a from the public +// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/salsa20/salsa" +) + +//go:generate go run . -out ../salsa20_amd64.s -pkg salsa + +func main() { + Package("golang.org/x/crypto/salsa20/salsa") + ConstraintExpr("amd64,!purego,gc") + salsa2020XORKeyStream() + Generate() +} + +func salsa2020XORKeyStream() { + Implement("salsa2020XORKeyStream") + Attributes(0) + AllocLocal(456) // frame = 424 + 32 byte alignment + Comment("This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.") + + Load(Param("out"), RDI) + Load(Param("in"), RSI) + Load(Param("n"), RDX) + Load(Param("nonce"), RCX) + Load(Param("key"), R8) + + MOVQ(RSP, R12) + ADDQ(Imm(31), R12) + ANDQ(I32(^31), R12) + + MOVQ(RDX, R9) + MOVQ(RCX, RDX) + MOVQ(R8, R10) + CMPQ(R9, Imm(0)) + JBE(LabelRef("DONE")) + + START() + BYTESATLEAST256() + MAINLOOP1() + BYTESBETWEEN1AND255() + NOCOPY() + MAINLOOP2() + + Label("BYTESATLEAST64") + Label("DONE") + RET() + Label("BYTESATLEAST65") + SUBQ(Imm(64), R9) + ADDQ(Imm(64), RDI) + ADDQ(Imm(64), RSI) + JMP(LabelRef("BYTESBETWEEN1AND255")) +} + +func START() { + Label("START") + MOVL(Mem{Base: R10}.Offset(20), ECX) + MOVL(Mem{Base: R10}.Offset(0), R8L) + MOVL(Mem{Base: EDX}.Offset(0), EAX) + MOVL(Mem{Base: R10}.Offset(16), R11L) + MOVL(ECX, Mem{Base: R12}.Offset(0)) + MOVL(R8L, Mem{Base: R12}.Offset(4)) + MOVL(EAX, Mem{Base: R12}.Offset(8)) + MOVL(R11L, Mem{Base: R12}.Offset(12)) + MOVL(Mem{Base: EDX}.Offset(8), ECX) + MOVL(Mem{Base: R10}.Offset(24), R8L) + MOVL(Mem{Base: R10}.Offset(4), EAX) + MOVL(Mem{Base: EDX}.Offset(4), R11L) + MOVL(ECX, Mem{Base: R12}.Offset(16)) + MOVL(R8L, Mem{Base: R12}.Offset(20)) + MOVL(EAX, Mem{Base: R12}.Offset(24)) + MOVL(R11L, Mem{Base: R12}.Offset(28)) + MOVL(Mem{Base: EDX}.Offset(12), ECX) + MOVL(Mem{Base: R10}.Offset(12), EDX) + MOVL(Mem{Base: R10}.Offset(28), R8L) + MOVL(Mem{Base: R10}.Offset(8), EAX) + MOVL(EDX, Mem{Base: R12}.Offset(32)) + MOVL(ECX, Mem{Base: R12}.Offset(36)) + MOVL(R8L, Mem{Base: R12}.Offset(40)) + MOVL(EAX, Mem{Base: R12}.Offset(44)) + MOVQ(Imm(1634760805), RDX) + MOVQ(Imm(857760878), RCX) + MOVQ(Imm(2036477234), R8) + MOVQ(Imm(1797285236), RAX) + MOVL(EDX, Mem{Base: R12}.Offset(48)) + MOVL(ECX, Mem{Base: R12}.Offset(52)) + MOVL(R8L, Mem{Base: R12}.Offset(56)) + MOVL(EAX, Mem{Base: R12}.Offset(60)) + CMPQ(R9, U32(256)) + JB(LabelRef("BYTESBETWEEN1AND255")) + MOVOA(Mem{Base: R12}.Offset(48), X0) + PSHUFL(Imm(0x55), X0, X1) + PSHUFL(Imm(0xAA), X0, X2) + PSHUFL(Imm(0xFF), X0, X3) + PSHUFL(Imm(0x00), X0, X0) + MOVOA(X1, Mem{Base: R12}.Offset(64)) + MOVOA(X2, Mem{Base: R12}.Offset(80)) + MOVOA(X3, Mem{Base: R12}.Offset(96)) + MOVOA(X0, Mem{Base: R12}.Offset(112)) + MOVOA(Mem{Base: R12}.Offset(0), X0) + PSHUFL(Imm(0xAA), X0, X1) + PSHUFL(Imm(0xFF), X0, X2) + PSHUFL(Imm(0x00), X0, X3) + PSHUFL(Imm(0x55), X0, X0) + MOVOA(X1, Mem{Base: R12}.Offset(128)) + MOVOA(X2, Mem{Base: R12}.Offset(144)) + MOVOA(X3, Mem{Base: R12}.Offset(160)) + MOVOA(X0, Mem{Base: R12}.Offset(176)) + MOVOA(Mem{Base: R12}.Offset(16), X0) + PSHUFL(Imm(0xFF), X0, X1) + PSHUFL(Imm(0x55), X0, X2) + PSHUFL(Imm(0xAA), X0, X0) + MOVOA(X1, Mem{Base: R12}.Offset(192)) + MOVOA(X2, Mem{Base: R12}.Offset(208)) + MOVOA(X0, Mem{Base: R12}.Offset(224)) + MOVOA(Mem{Base: R12}.Offset(32), X0) + PSHUFL(Imm(0x00), X0, X1) + PSHUFL(Imm(0xAA), X0, X2) + PSHUFL(Imm(0xFF), X0, X0) + MOVOA(X1, Mem{Base: R12}.Offset(240)) + MOVOA(X2, Mem{Base: R12}.Offset(256)) + MOVOA(X0, Mem{Base: R12}.Offset(272)) + +} + +func BYTESATLEAST256() { + Label("BYTESATLEAST256") + MOVL(Mem{Base: R12}.Offset(16), EDX) + MOVL(Mem{Base: R12}.Offset(36), ECX) + MOVL(EDX, Mem{Base: R12}.Offset(288)) + MOVL(ECX, Mem{Base: R12}.Offset(304)) + SHLQ(Imm(32), RCX) + ADDQ(RCX, RDX) + ADDQ(Imm(1), RDX) + MOVQ(RDX, RCX) + SHRQ(Imm(32), RCX) + MOVL(EDX, Mem{Base: R12}.Offset(292)) + MOVL(ECX, Mem{Base: R12}.Offset(308)) + ADDQ(Imm(1), RDX) + MOVQ(RDX, RCX) + SHRQ(Imm(32), RCX) + MOVL(EDX, Mem{Base: R12}.Offset(296)) + MOVL(ECX, Mem{Base: R12}.Offset(312)) + ADDQ(Imm(1), RDX) + MOVQ(RDX, RCX) + SHRQ(Imm(32), RCX) + MOVL(EDX, Mem{Base: R12}.Offset(300)) + MOVL(ECX, Mem{Base: R12}.Offset(316)) + ADDQ(Imm(1), RDX) + MOVQ(RDX, RCX) + SHRQ(Imm(32), RCX) + MOVL(EDX, Mem{Base: R12}.Offset(16)) + MOVL(ECX, Mem{Base: R12}.Offset(36)) + MOVQ(R9, Mem{Base: R12}.Offset(352)) + MOVQ(U32(20), RDX) + MOVOA(Mem{Base: R12}.Offset(64), X0) + MOVOA(Mem{Base: R12}.Offset(80), X1) + MOVOA(Mem{Base: R12}.Offset(96), X2) + MOVOA(Mem{Base: R12}.Offset(256), X3) + MOVOA(Mem{Base: R12}.Offset(272), X4) + MOVOA(Mem{Base: R12}.Offset(128), X5) + MOVOA(Mem{Base: R12}.Offset(144), X6) + MOVOA(Mem{Base: R12}.Offset(176), X7) + MOVOA(Mem{Base: R12}.Offset(192), X8) + MOVOA(Mem{Base: R12}.Offset(208), X9) + MOVOA(Mem{Base: R12}.Offset(224), X10) + MOVOA(Mem{Base: R12}.Offset(304), X11) + MOVOA(Mem{Base: R12}.Offset(112), X12) + MOVOA(Mem{Base: R12}.Offset(160), X13) + MOVOA(Mem{Base: R12}.Offset(240), X14) + MOVOA(Mem{Base: R12}.Offset(288), X15) +} + +func MAINLOOP1() { + Label("MAINLOOP1") + MOVOA(X1, Mem{Base: R12}.Offset(320)) + MOVOA(X2, Mem{Base: R12}.Offset(336)) + MOVOA(X13, X1) + PADDL(X12, X1) + MOVOA(X1, X2) + PSLLL(Imm(7), X1) + PXOR(X1, X14) + PSRLL(Imm(25), X2) + PXOR(X2, X14) + MOVOA(X7, X1) + PADDL(X0, X1) + MOVOA(X1, X2) + PSLLL(Imm(7), X1) + PXOR(X1, X11) + PSRLL(Imm(25), X2) + PXOR(X2, X11) + MOVOA(X12, X1) + PADDL(X14, X1) + MOVOA(X1, X2) + PSLLL(Imm(9), X1) + PXOR(X1, X15) + PSRLL(Imm(23), X2) + PXOR(X2, X15) + MOVOA(X0, X1) + PADDL(X11, X1) + MOVOA(X1, X2) + PSLLL(Imm(9), X1) + PXOR(X1, X9) + PSRLL(Imm(23), X2) + PXOR(X2, X9) + MOVOA(X14, X1) + PADDL(X15, X1) + MOVOA(X1, X2) + PSLLL(Imm(13), X1) + PXOR(X1, X13) + PSRLL(Imm(19), X2) + PXOR(X2, X13) + MOVOA(X11, X1) + PADDL(X9, X1) + MOVOA(X1, X2) + PSLLL(Imm(13), X1) + PXOR(X1, X7) + PSRLL(Imm(19), X2) + PXOR(X2, X7) + MOVOA(X15, X1) + PADDL(X13, X1) + MOVOA(X1, X2) + PSLLL(Imm(18), X1) + PXOR(X1, X12) + PSRLL(Imm(14), X2) + PXOR(X2, X12) + MOVOA(Mem{Base: R12}.Offset(320), X1) + MOVOA(X12, Mem{Base: R12}.Offset(320)) + MOVOA(X9, X2) + PADDL(X7, X2) + MOVOA(X2, X12) + PSLLL(Imm(18), X2) + PXOR(X2, X0) + PSRLL(Imm(14), X12) + PXOR(X12, X0) + MOVOA(X5, X2) + PADDL(X1, X2) + MOVOA(X2, X12) + PSLLL(Imm(7), X2) + PXOR(X2, X3) + PSRLL(Imm(25), X12) + PXOR(X12, X3) + MOVOA(Mem{Base: R12}.Offset(336), X2) + MOVOA(X0, Mem{Base: R12}.Offset(336)) + MOVOA(X6, X0) + PADDL(X2, X0) + MOVOA(X0, X12) + PSLLL(Imm(7), X0) + PXOR(X0, X4) + PSRLL(Imm(25), X12) + PXOR(X12, X4) + MOVOA(X1, X0) + PADDL(X3, X0) + MOVOA(X0, X12) + PSLLL(Imm(9), X0) + PXOR(X0, X10) + PSRLL(Imm(23), X12) + PXOR(X12, X10) + MOVOA(X2, X0) + PADDL(X4, X0) + MOVOA(X0, X12) + PSLLL(Imm(9), X0) + PXOR(X0, X8) + PSRLL(Imm(23), X12) + PXOR(X12, X8) + MOVOA(X3, X0) + PADDL(X10, X0) + MOVOA(X0, X12) + PSLLL(Imm(13), X0) + PXOR(X0, X5) + PSRLL(Imm(19), X12) + PXOR(X12, X5) + MOVOA(X4, X0) + PADDL(X8, X0) + MOVOA(X0, X12) + PSLLL(Imm(13), X0) + PXOR(X0, X6) + PSRLL(Imm(19), X12) + PXOR(X12, X6) + MOVOA(X10, X0) + PADDL(X5, X0) + MOVOA(X0, X12) + PSLLL(Imm(18), X0) + PXOR(X0, X1) + PSRLL(Imm(14), X12) + PXOR(X12, X1) + MOVOA(Mem{Base: R12}.Offset(320), X0) + MOVOA(X1, Mem{Base: R12}.Offset(320)) + MOVOA(X4, X1) + PADDL(X0, X1) + MOVOA(X1, X12) + PSLLL(Imm(7), X1) + PXOR(X1, X7) + PSRLL(Imm(25), X12) + PXOR(X12, X7) + MOVOA(X8, X1) + PADDL(X6, X1) + MOVOA(X1, X12) + PSLLL(Imm(18), X1) + PXOR(X1, X2) + PSRLL(Imm(14), X12) + PXOR(X12, X2) + MOVOA(Mem{Base: R12}.Offset(336), X12) + MOVOA(X2, Mem{Base: R12}.Offset(336)) + MOVOA(X14, X1) + PADDL(X12, X1) + MOVOA(X1, X2) + PSLLL(Imm(7), X1) + PXOR(X1, X5) + PSRLL(Imm(25), X2) + PXOR(X2, X5) + MOVOA(X0, X1) + PADDL(X7, X1) + MOVOA(X1, X2) + PSLLL(Imm(9), X1) + PXOR(X1, X10) + PSRLL(Imm(23), X2) + PXOR(X2, X10) + MOVOA(X12, X1) + PADDL(X5, X1) + MOVOA(X1, X2) + PSLLL(Imm(9), X1) + PXOR(X1, X8) + PSRLL(Imm(23), X2) + PXOR(X2, X8) + MOVOA(X7, X1) + PADDL(X10, X1) + MOVOA(X1, X2) + PSLLL(Imm(13), X1) + PXOR(X1, X4) + PSRLL(Imm(19), X2) + PXOR(X2, X4) + MOVOA(X5, X1) + PADDL(X8, X1) + MOVOA(X1, X2) + PSLLL(Imm(13), X1) + PXOR(X1, X14) + PSRLL(Imm(19), X2) + PXOR(X2, X14) + MOVOA(X10, X1) + PADDL(X4, X1) + MOVOA(X1, X2) + PSLLL(Imm(18), X1) + PXOR(X1, X0) + PSRLL(Imm(14), X2) + PXOR(X2, X0) + MOVOA(Mem{Base: R12}.Offset(320), X1) + MOVOA(X0, Mem{Base: R12}.Offset(320)) + MOVOA(X8, X0) + PADDL(X14, X0) + MOVOA(X0, X2) + PSLLL(Imm(18), X0) + PXOR(X0, X12) + PSRLL(Imm(14), X2) + PXOR(X2, X12) + MOVOA(X11, X0) + PADDL(X1, X0) + MOVOA(X0, X2) + PSLLL(Imm(7), X0) + PXOR(X0, X6) + PSRLL(Imm(25), X2) + PXOR(X2, X6) + MOVOA(Mem{Base: R12}.Offset(336), X2) + MOVOA(X12, Mem{Base: R12}.Offset(336)) + MOVOA(X3, X0) + PADDL(X2, X0) + MOVOA(X0, X12) + PSLLL(Imm(7), X0) + PXOR(X0, X13) + PSRLL(Imm(25), X12) + PXOR(X12, X13) + MOVOA(X1, X0) + PADDL(X6, X0) + MOVOA(X0, X12) + PSLLL(Imm(9), X0) + PXOR(X0, X15) + PSRLL(Imm(23), X12) + PXOR(X12, X15) + MOVOA(X2, X0) + PADDL(X13, X0) + MOVOA(X0, X12) + PSLLL(Imm(9), X0) + PXOR(X0, X9) + PSRLL(Imm(23), X12) + PXOR(X12, X9) + MOVOA(X6, X0) + PADDL(X15, X0) + MOVOA(X0, X12) + PSLLL(Imm(13), X0) + PXOR(X0, X11) + PSRLL(Imm(19), X12) + PXOR(X12, X11) + MOVOA(X13, X0) + PADDL(X9, X0) + MOVOA(X0, X12) + PSLLL(Imm(13), X0) + PXOR(X0, X3) + PSRLL(Imm(19), X12) + PXOR(X12, X3) + MOVOA(X15, X0) + PADDL(X11, X0) + MOVOA(X0, X12) + PSLLL(Imm(18), X0) + PXOR(X0, X1) + PSRLL(Imm(14), X12) + PXOR(X12, X1) + MOVOA(X9, X0) + PADDL(X3, X0) + MOVOA(X0, X12) + PSLLL(Imm(18), X0) + PXOR(X0, X2) + PSRLL(Imm(14), X12) + PXOR(X12, X2) + MOVOA(Mem{Base: R12}.Offset(320), X12) + MOVOA(Mem{Base: R12}.Offset(336), X0) + SUBQ(Imm(2), RDX) + JA(LabelRef("MAINLOOP1")) + PADDL(Mem{Base: R12}.Offset(112), X12) + PADDL(Mem{Base: R12}.Offset(176), X7) + PADDL(Mem{Base: R12}.Offset(224), X10) + PADDL(Mem{Base: R12}.Offset(272), X4) + MOVD(X12, EDX) + MOVD(X7, ECX) + MOVD(X10, R8) + MOVD(X4, R9) + PSHUFL(Imm(0x39), X12, X12) + PSHUFL(Imm(0x39), X7, X7) + PSHUFL(Imm(0x39), X10, X10) + PSHUFL(Imm(0x39), X4, X4) + XORL(Mem{Base: SI}.Offset(0), EDX) + XORL(Mem{Base: SI}.Offset(4), ECX) + XORL(Mem{Base: SI}.Offset(8), R8L) + XORL(Mem{Base: SI}.Offset(12), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(0)) + MOVL(ECX, Mem{Base: DI}.Offset(4)) + MOVL(R8L, Mem{Base: DI}.Offset(8)) + MOVL(R9L, Mem{Base: DI}.Offset(12)) + MOVD(X12, EDX) + MOVD(X7, ECX) + MOVD(X10, R8) + MOVD(X4, R9) + PSHUFL(Imm(0x39), X12, X12) + PSHUFL(Imm(0x39), X7, X7) + PSHUFL(Imm(0x39), X10, X10) + PSHUFL(Imm(0x39), X4, X4) + XORL(Mem{Base: SI}.Offset(64), EDX) + XORL(Mem{Base: SI}.Offset(68), ECX) + XORL(Mem{Base: SI}.Offset(72), R8L) + XORL(Mem{Base: SI}.Offset(76), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(64)) + MOVL(ECX, Mem{Base: DI}.Offset(68)) + MOVL(R8L, Mem{Base: DI}.Offset(72)) + MOVL(R9L, Mem{Base: DI}.Offset(76)) + MOVD(X12, EDX) + MOVD(X7, ECX) + MOVD(X10, R8) + MOVD(X4, R9) + PSHUFL(Imm(0x39), X12, X12) + PSHUFL(Imm(0x39), X7, X7) + PSHUFL(Imm(0x39), X10, X10) + PSHUFL(Imm(0x39), X4, X4) + XORL(Mem{Base: SI}.Offset(128), EDX) + XORL(Mem{Base: SI}.Offset(132), ECX) + XORL(Mem{Base: SI}.Offset(136), R8L) + XORL(Mem{Base: SI}.Offset(140), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(128)) + MOVL(ECX, Mem{Base: DI}.Offset(132)) + MOVL(R8L, Mem{Base: DI}.Offset(136)) + MOVL(R9L, Mem{Base: DI}.Offset(140)) + MOVD(X12, EDX) + MOVD(X7, ECX) + MOVD(X10, R8) + MOVD(X4, R9) + XORL(Mem{Base: SI}.Offset(192), EDX) + XORL(Mem{Base: SI}.Offset(196), ECX) + XORL(Mem{Base: SI}.Offset(200), R8L) + XORL(Mem{Base: SI}.Offset(204), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(192)) + MOVL(ECX, Mem{Base: DI}.Offset(196)) + MOVL(R8L, Mem{Base: DI}.Offset(200)) + MOVL(R9L, Mem{Base: DI}.Offset(204)) + PADDL(Mem{Base: R12}.Offset(240), X14) + PADDL(Mem{Base: R12}.Offset(64), X0) + PADDL(Mem{Base: R12}.Offset(128), X5) + PADDL(Mem{Base: R12}.Offset(192), X8) + MOVD(X14, EDX) + MOVD(X0, ECX) + MOVD(X5, R8) + MOVD(X8, R9) + PSHUFL(Imm(0x39), X14, X14) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X5, X5) + PSHUFL(Imm(0x39), X8, X8) + XORL(Mem{Base: SI}.Offset(16), EDX) + XORL(Mem{Base: SI}.Offset(20), ECX) + XORL(Mem{Base: SI}.Offset(24), R8L) + XORL(Mem{Base: SI}.Offset(28), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(16)) + MOVL(ECX, Mem{Base: DI}.Offset(20)) + MOVL(R8L, Mem{Base: DI}.Offset(24)) + MOVL(R9L, Mem{Base: DI}.Offset(28)) + MOVD(X14, EDX) + MOVD(X0, ECX) + MOVD(X5, R8) + MOVD(X8, R9) + PSHUFL(Imm(0x39), X14, X14) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X5, X5) + PSHUFL(Imm(0x39), X8, X8) + XORL(Mem{Base: SI}.Offset(80), EDX) + XORL(Mem{Base: SI}.Offset(84), ECX) + XORL(Mem{Base: SI}.Offset(88), R8L) + XORL(Mem{Base: SI}.Offset(92), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(80)) + MOVL(ECX, Mem{Base: DI}.Offset(84)) + MOVL(R8L, Mem{Base: DI}.Offset(88)) + MOVL(R9L, Mem{Base: DI}.Offset(92)) + MOVD(X14, EDX) + MOVD(X0, ECX) + MOVD(X5, R8) + MOVD(X8, R9) + PSHUFL(Imm(0x39), X14, X14) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X5, X5) + PSHUFL(Imm(0x39), X8, X8) + XORL(Mem{Base: SI}.Offset(144), EDX) + XORL(Mem{Base: SI}.Offset(148), ECX) + XORL(Mem{Base: SI}.Offset(152), R8L) + XORL(Mem{Base: SI}.Offset(156), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(144)) + MOVL(ECX, Mem{Base: DI}.Offset(148)) + MOVL(R8L, Mem{Base: DI}.Offset(152)) + MOVL(R9L, Mem{Base: DI}.Offset(156)) + MOVD(X14, EDX) + MOVD(X0, ECX) + MOVD(X5, R8) + MOVD(X8, R9) + XORL(Mem{Base: SI}.Offset(208), EDX) + XORL(Mem{Base: SI}.Offset(212), ECX) + XORL(Mem{Base: SI}.Offset(216), R8L) + XORL(Mem{Base: SI}.Offset(220), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(208)) + MOVL(ECX, Mem{Base: DI}.Offset(212)) + MOVL(R8L, Mem{Base: DI}.Offset(216)) + MOVL(R9L, Mem{Base: DI}.Offset(220)) + PADDL(Mem{Base: R12}.Offset(288), X15) + PADDL(Mem{Base: R12}.Offset(304), X11) + PADDL(Mem{Base: R12}.Offset(80), X1) + PADDL(Mem{Base: R12}.Offset(144), X6) + MOVD(X15, EDX) + MOVD(X11, ECX) + MOVD(X1, R8) + MOVD(X6, R9) + PSHUFL(Imm(0x39), X15, X15) + PSHUFL(Imm(0x39), X11, X11) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X6, X6) + XORL(Mem{Base: SI}.Offset(32), EDX) + XORL(Mem{Base: SI}.Offset(36), ECX) + XORL(Mem{Base: SI}.Offset(40), R8L) + XORL(Mem{Base: SI}.Offset(44), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(32)) + MOVL(ECX, Mem{Base: DI}.Offset(36)) + MOVL(R8L, Mem{Base: DI}.Offset(40)) + MOVL(R9L, Mem{Base: DI}.Offset(44)) + MOVD(X15, EDX) + MOVD(X11, ECX) + MOVD(X1, R8) + MOVD(X6, R9) + PSHUFL(Imm(0x39), X15, X15) + PSHUFL(Imm(0x39), X11, X11) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X6, X6) + XORL(Mem{Base: SI}.Offset(96), EDX) + XORL(Mem{Base: SI}.Offset(100), ECX) + XORL(Mem{Base: SI}.Offset(104), R8L) + XORL(Mem{Base: SI}.Offset(108), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(96)) + MOVL(ECX, Mem{Base: DI}.Offset(100)) + MOVL(R8L, Mem{Base: DI}.Offset(104)) + MOVL(R9L, Mem{Base: DI}.Offset(108)) + MOVD(X15, EDX) + MOVD(X11, ECX) + MOVD(X1, R8) + MOVD(X6, R9) + PSHUFL(Imm(0x39), X15, X15) + PSHUFL(Imm(0x39), X11, X11) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X6, X6) + XORL(Mem{Base: SI}.Offset(160), EDX) + XORL(Mem{Base: SI}.Offset(164), ECX) + XORL(Mem{Base: SI}.Offset(168), R8L) + XORL(Mem{Base: SI}.Offset(172), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(160)) + MOVL(ECX, Mem{Base: DI}.Offset(164)) + MOVL(R8L, Mem{Base: DI}.Offset(168)) + MOVL(R9L, Mem{Base: DI}.Offset(172)) + MOVD(X15, EDX) + MOVD(X11, ECX) + MOVD(X1, R8) + MOVD(X6, R9) + XORL(Mem{Base: SI}.Offset(224), EDX) + XORL(Mem{Base: SI}.Offset(228), ECX) + XORL(Mem{Base: SI}.Offset(232), R8L) + XORL(Mem{Base: SI}.Offset(236), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(224)) + MOVL(ECX, Mem{Base: DI}.Offset(228)) + MOVL(R8L, Mem{Base: DI}.Offset(232)) + MOVL(R9L, Mem{Base: DI}.Offset(236)) + PADDL(Mem{Base: R12}.Offset(160), X13) + PADDL(Mem{Base: R12}.Offset(208), X9) + PADDL(Mem{Base: R12}.Offset(256), X3) + PADDL(Mem{Base: R12}.Offset(96), X2) + MOVD(X13, EDX) + MOVD(X9, ECX) + MOVD(X3, R8) + MOVD(X2, R9) + PSHUFL(Imm(0x39), X13, X13) + PSHUFL(Imm(0x39), X9, X9) + PSHUFL(Imm(0x39), X3, X3) + PSHUFL(Imm(0x39), X2, X2) + XORL(Mem{Base: SI}.Offset(48), EDX) + XORL(Mem{Base: SI}.Offset(52), ECX) + XORL(Mem{Base: SI}.Offset(56), R8L) + XORL(Mem{Base: SI}.Offset(60), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(48)) + MOVL(ECX, Mem{Base: DI}.Offset(52)) + MOVL(R8L, Mem{Base: DI}.Offset(56)) + MOVL(R9L, Mem{Base: DI}.Offset(60)) + MOVD(X13, EDX) + MOVD(X9, ECX) + MOVD(X3, R8) + MOVD(X2, R9) + PSHUFL(Imm(0x39), X13, X13) + PSHUFL(Imm(0x39), X9, X9) + PSHUFL(Imm(0x39), X3, X3) + PSHUFL(Imm(0x39), X2, X2) + XORL(Mem{Base: SI}.Offset(112), EDX) + XORL(Mem{Base: SI}.Offset(116), ECX) + XORL(Mem{Base: SI}.Offset(120), R8L) + XORL(Mem{Base: SI}.Offset(124), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(112)) + MOVL(ECX, Mem{Base: DI}.Offset(116)) + MOVL(R8L, Mem{Base: DI}.Offset(120)) + MOVL(R9L, Mem{Base: DI}.Offset(124)) + MOVD(X13, EDX) + MOVD(X9, ECX) + MOVD(X3, R8) + MOVD(X2, R9) + PSHUFL(Imm(0x39), X13, X13) + PSHUFL(Imm(0x39), X9, X9) + PSHUFL(Imm(0x39), X3, X3) + PSHUFL(Imm(0x39), X2, X2) + XORL(Mem{Base: SI}.Offset(176), EDX) + XORL(Mem{Base: SI}.Offset(180), ECX) + XORL(Mem{Base: SI}.Offset(184), R8L) + XORL(Mem{Base: SI}.Offset(188), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(176)) + MOVL(ECX, Mem{Base: DI}.Offset(180)) + MOVL(R8L, Mem{Base: DI}.Offset(184)) + MOVL(R9L, Mem{Base: DI}.Offset(188)) + MOVD(X13, EDX) + MOVD(X9, ECX) + MOVD(X3, R8) + MOVD(X2, R9) + XORL(Mem{Base: SI}.Offset(240), EDX) + XORL(Mem{Base: SI}.Offset(244), ECX) + XORL(Mem{Base: SI}.Offset(248), R8L) + XORL(Mem{Base: SI}.Offset(252), R9L) + MOVL(EDX, Mem{Base: DI}.Offset(240)) + MOVL(ECX, Mem{Base: DI}.Offset(244)) + MOVL(R8L, Mem{Base: DI}.Offset(248)) + MOVL(R9L, Mem{Base: DI}.Offset(252)) + MOVQ(Mem{Base: R12}.Offset(352), R9) + SUBQ(U32(256), R9) + ADDQ(U32(256), RSI) + ADDQ(U32(256), RDI) + CMPQ(R9, U32(256)) + JAE(LabelRef("BYTESATLEAST256")) + CMPQ(R9, Imm(0)) + JBE(LabelRef("DONE")) +} + +func BYTESBETWEEN1AND255() { + Label("BYTESBETWEEN1AND255") + CMPQ(R9, Imm(64)) + JAE(LabelRef("NOCOPY")) + MOVQ(RDI, RDX) + LEAQ(Mem{Base: R12}.Offset(360), RDI) + MOVQ(R9, RCX) + // Hack to get Avo to emit: + // REP; MOVSB + Instruction(&ir.Instruction{Opcode: "REP; MOVSB"}) + LEAQ(Mem{Base: R12}.Offset(360), RDI) + LEAQ(Mem{Base: R12}.Offset(360), RSI) +} + +func NOCOPY() { + Label("NOCOPY") + MOVQ(R9, Mem{Base: R12}.Offset(352)) + MOVOA(Mem{Base: R12}.Offset(48), X0) + MOVOA(Mem{Base: R12}.Offset(0), X1) + MOVOA(Mem{Base: R12}.Offset(16), X2) + MOVOA(Mem{Base: R12}.Offset(32), X3) + MOVOA(X1, X4) + MOVQ(U32(20), RCX) +} + +func MAINLOOP2() { + Label("MAINLOOP2") + PADDL(X0, X4) + MOVOA(X0, X5) + MOVOA(X4, X6) + PSLLL(Imm(7), X4) + PSRLL(Imm(25), X6) + PXOR(X4, X3) + PXOR(X6, X3) + PADDL(X3, X5) + MOVOA(X3, X4) + MOVOA(X5, X6) + PSLLL(Imm(9), X5) + PSRLL(Imm(23), X6) + PXOR(X5, X2) + PSHUFL(Imm(0x93), X3, X3) + PXOR(X6, X2) + PADDL(X2, X4) + MOVOA(X2, X5) + MOVOA(X4, X6) + PSLLL(Imm(13), X4) + PSRLL(Imm(19), X6) + PXOR(X4, X1) + PSHUFL(Imm(0x4E), X2, X2) + PXOR(X6, X1) + PADDL(X1, X5) + MOVOA(X3, X4) + MOVOA(X5, X6) + PSLLL(Imm(18), X5) + PSRLL(Imm(14), X6) + PXOR(X5, X0) + PSHUFL(Imm(0x39), X1, X1) + PXOR(X6, X0) + PADDL(X0, X4) + MOVOA(X0, X5) + MOVOA(X4, X6) + PSLLL(Imm(7), X4) + PSRLL(Imm(25), X6) + PXOR(X4, X1) + PXOR(X6, X1) + PADDL(X1, X5) + MOVOA(X1, X4) + MOVOA(X5, X6) + PSLLL(Imm(9), X5) + PSRLL(Imm(23), X6) + PXOR(X5, X2) + PSHUFL(Imm(0x93), X1, X1) + PXOR(X6, X2) + PADDL(X2, X4) + MOVOA(X2, X5) + MOVOA(X4, X6) + PSLLL(Imm(13), X4) + PSRLL(Imm(19), X6) + PXOR(X4, X3) + PSHUFL(Imm(0x4E), X2, X2) + PXOR(X6, X3) + PADDL(X3, X5) + MOVOA(X1, X4) + MOVOA(X5, X6) + PSLLL(Imm(18), X5) + PSRLL(Imm(14), X6) + PXOR(X5, X0) + PSHUFL(Imm(0x39), X3, X3) + PXOR(X6, X0) + PADDL(X0, X4) + MOVOA(X0, X5) + MOVOA(X4, X6) + PSLLL(Imm(7), X4) + PSRLL(Imm(25), X6) + PXOR(X4, X3) + PXOR(X6, X3) + PADDL(X3, X5) + MOVOA(X3, X4) + MOVOA(X5, X6) + PSLLL(Imm(9), X5) + PSRLL(Imm(23), X6) + PXOR(X5, X2) + PSHUFL(Imm(0x93), X3, X3) + PXOR(X6, X2) + PADDL(X2, X4) + MOVOA(X2, X5) + MOVOA(X4, X6) + PSLLL(Imm(13), X4) + PSRLL(Imm(19), X6) + PXOR(X4, X1) + PSHUFL(Imm(0x4E), X2, X2) + PXOR(X6, X1) + PADDL(X1, X5) + MOVOA(X3, X4) + MOVOA(X5, X6) + PSLLL(Imm(18), X5) + PSRLL(Imm(14), X6) + PXOR(X5, X0) + PSHUFL(Imm(0x39), X1, X1) + PXOR(X6, X0) + PADDL(X0, X4) + MOVOA(X0, X5) + MOVOA(X4, X6) + PSLLL(Imm(7), X4) + PSRLL(Imm(25), X6) + PXOR(X4, X1) + PXOR(X6, X1) + PADDL(X1, X5) + MOVOA(X1, X4) + MOVOA(X5, X6) + PSLLL(Imm(9), X5) + PSRLL(Imm(23), X6) + PXOR(X5, X2) + PSHUFL(Imm(0x93), X1, X1) + PXOR(X6, X2) + PADDL(X2, X4) + MOVOA(X2, X5) + MOVOA(X4, X6) + PSLLL(Imm(13), X4) + PSRLL(Imm(19), X6) + PXOR(X4, X3) + PSHUFL(Imm(0x4E), X2, X2) + PXOR(X6, X3) + SUBQ(Imm(4), RCX) + PADDL(X3, X5) + MOVOA(X1, X4) + MOVOA(X5, X6) + PSLLL(Imm(18), X5) + PXOR(X7, X7) + PSRLL(Imm(14), X6) + PXOR(X5, X0) + PSHUFL(Imm(0x39), X3, X3) + PXOR(X6, X0) + JA(LabelRef("MAINLOOP2")) + PADDL(Mem{Base: R12}.Offset(48), X0) + PADDL(Mem{Base: R12}.Offset(0), X1) + PADDL(Mem{Base: R12}.Offset(16), X2) + PADDL(Mem{Base: R12}.Offset(32), X3) + MOVD(X0, ECX) + MOVD(X1, R8) + MOVD(X2, R9) + MOVD(X3, EAX) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X2, X2) + PSHUFL(Imm(0x39), X3, X3) + XORL(Mem{Base: SI}.Offset(0), ECX) + XORL(Mem{Base: SI}.Offset(48), R8L) + XORL(Mem{Base: SI}.Offset(32), R9L) + XORL(Mem{Base: SI}.Offset(16), EAX) + MOVL(ECX, Mem{Base: DI}.Offset(0)) + MOVL(R8L, Mem{Base: DI}.Offset(48)) + MOVL(R9L, Mem{Base: DI}.Offset(32)) + MOVL(EAX, Mem{Base: DI}.Offset(16)) + MOVD(X0, ECX) + MOVD(X1, R8) + MOVD(X2, R9) + MOVD(X3, EAX) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X2, X2) + PSHUFL(Imm(0x39), X3, X3) + XORL(Mem{Base: SI}.Offset(20), ECX) + XORL(Mem{Base: SI}.Offset(4), R8L) + XORL(Mem{Base: SI}.Offset(52), R9L) + XORL(Mem{Base: SI}.Offset(36), EAX) + MOVL(ECX, Mem{Base: DI}.Offset(20)) + MOVL(R8L, Mem{Base: DI}.Offset(4)) + MOVL(R9L, Mem{Base: DI}.Offset(52)) + MOVL(EAX, Mem{Base: DI}.Offset(36)) + MOVD(X0, ECX) + MOVD(X1, R8) + MOVD(X2, R9) + MOVD(X3, EAX) + PSHUFL(Imm(0x39), X0, X0) + PSHUFL(Imm(0x39), X1, X1) + PSHUFL(Imm(0x39), X2, X2) + PSHUFL(Imm(0x39), X3, X3) + XORL(Mem{Base: SI}.Offset(40), ECX) + XORL(Mem{Base: SI}.Offset(24), R8L) + XORL(Mem{Base: SI}.Offset(8), R9L) + XORL(Mem{Base: SI}.Offset(56), EAX) + MOVL(ECX, Mem{Base: DI}.Offset(40)) + MOVL(R8L, Mem{Base: DI}.Offset(24)) + MOVL(R9L, Mem{Base: DI}.Offset(8)) + MOVL(EAX, Mem{Base: DI}.Offset(56)) + MOVD(X0, ECX) + MOVD(X1, R8) + MOVD(X2, R9) + MOVD(X3, EAX) + XORL(Mem{Base: SI}.Offset(60), ECX) + XORL(Mem{Base: SI}.Offset(44), R8L) + XORL(Mem{Base: SI}.Offset(28), R9L) + XORL(Mem{Base: SI}.Offset(12), EAX) + MOVL(ECX, Mem{Base: DI}.Offset(60)) + MOVL(R8L, Mem{Base: DI}.Offset(44)) + MOVL(R9L, Mem{Base: DI}.Offset(28)) + MOVL(EAX, Mem{Base: DI}.Offset(12)) + MOVQ(Mem{Base: R12}.Offset(352), R9) + MOVL(Mem{Base: R12}.Offset(16), ECX) + MOVL(Mem{Base: R12}.Offset(36), R8L) + ADDQ(Imm(1), RCX) + SHLQ(Imm(32), R8) + ADDQ(R8, RCX) + MOVQ(RCX, R8) + SHRQ(Imm(32), R8) + MOVL(ECX, Mem{Base: R12}.Offset(16)) + MOVL(R8L, Mem{Base: R12}.Offset(36)) + CMPQ(R9, Imm(64)) + JA(LabelRef("BYTESATLEAST65")) + JAE(LabelRef("BYTESATLEAST64")) + MOVQ(RDI, RSI) + MOVQ(RDX, RDI) + MOVQ(R9, RCX) + // Hack to get Avo to emit: + // REP; MOVSB + Instruction(&ir.Instruction{Opcode: "REP; MOVSB"}) +} diff --git a/salsa20/salsa/salsa20_amd64.s b/salsa20/salsa/salsa20_amd64.s index fcce0234b6..3883e0ec22 100644 --- a/salsa20/salsa/salsa20_amd64.s +++ b/salsa20/salsa/salsa20_amd64.s @@ -1,880 +1,880 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run salsa20_amd64_asm.go -out ../salsa20_amd64.s -pkg salsa. DO NOT EDIT. //go:build amd64 && !purego && gc -// This code was translated into a form compatible with 6a from the public -// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html +// func salsa2020XORKeyStream(out *byte, in *byte, n uint64, nonce *byte, key *byte) +// Requires: SSE2 +TEXT ·salsa2020XORKeyStream(SB), $456-40 + // This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. + MOVQ out+0(FP), DI + MOVQ in+8(FP), SI + MOVQ n+16(FP), DX + MOVQ nonce+24(FP), CX + MOVQ key+32(FP), R8 + MOVQ SP, R12 + ADDQ $0x1f, R12 + ANDQ $-32, R12 + MOVQ DX, R9 + MOVQ CX, DX + MOVQ R8, R10 + CMPQ R9, $0x00 + JBE DONE + MOVL 20(R10), CX + MOVL (R10), R8 + MOVL (DX), AX + MOVL 16(R10), R11 + MOVL CX, (R12) + MOVL R8, 4(R12) + MOVL AX, 8(R12) + MOVL R11, 12(R12) + MOVL 8(DX), CX + MOVL 24(R10), R8 + MOVL 4(R10), AX + MOVL 4(DX), R11 + MOVL CX, 16(R12) + MOVL R8, 20(R12) + MOVL AX, 24(R12) + MOVL R11, 28(R12) + MOVL 12(DX), CX + MOVL 12(R10), DX + MOVL 28(R10), R8 + MOVL 8(R10), AX + MOVL DX, 32(R12) + MOVL CX, 36(R12) + MOVL R8, 40(R12) + MOVL AX, 44(R12) + MOVQ $0x61707865, DX + MOVQ $0x3320646e, CX + MOVQ $0x79622d32, R8 + MOVQ $0x6b206574, AX + MOVL DX, 48(R12) + MOVL CX, 52(R12) + MOVL R8, 56(R12) + MOVL AX, 60(R12) + CMPQ R9, $0x00000100 + JB BYTESBETWEEN1AND255 + MOVOA 48(R12), X0 + PSHUFL $0x55, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X3 + PSHUFL $0x00, X0, X0 + MOVOA X1, 64(R12) + MOVOA X2, 80(R12) + MOVOA X3, 96(R12) + MOVOA X0, 112(R12) + MOVOA (R12), X0 + PSHUFL $0xaa, X0, X1 + PSHUFL $0xff, X0, X2 + PSHUFL $0x00, X0, X3 + PSHUFL $0x55, X0, X0 + MOVOA X1, 128(R12) + MOVOA X2, 144(R12) + MOVOA X3, 160(R12) + MOVOA X0, 176(R12) + MOVOA 16(R12), X0 + PSHUFL $0xff, X0, X1 + PSHUFL $0x55, X0, X2 + PSHUFL $0xaa, X0, X0 + MOVOA X1, 192(R12) + MOVOA X2, 208(R12) + MOVOA X0, 224(R12) + MOVOA 32(R12), X0 + PSHUFL $0x00, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X0 + MOVOA X1, 240(R12) + MOVOA X2, 256(R12) + MOVOA X0, 272(R12) -// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte) -// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. -TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment - MOVQ out+0(FP),DI - MOVQ in+8(FP),SI - MOVQ n+16(FP),DX - MOVQ nonce+24(FP),CX - MOVQ key+32(FP),R8 +BYTESATLEAST256: + MOVL 16(R12), DX + MOVL 36(R12), CX + MOVL DX, 288(R12) + MOVL CX, 304(R12) + SHLQ $0x20, CX + ADDQ CX, DX + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 292(R12) + MOVL CX, 308(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 296(R12) + MOVL CX, 312(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 300(R12) + MOVL CX, 316(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 16(R12) + MOVL CX, 36(R12) + MOVQ R9, 352(R12) + MOVQ $0x00000014, DX + MOVOA 64(R12), X0 + MOVOA 80(R12), X1 + MOVOA 96(R12), X2 + MOVOA 256(R12), X3 + MOVOA 272(R12), X4 + MOVOA 128(R12), X5 + MOVOA 144(R12), X6 + MOVOA 176(R12), X7 + MOVOA 192(R12), X8 + MOVOA 208(R12), X9 + MOVOA 224(R12), X10 + MOVOA 304(R12), X11 + MOVOA 112(R12), X12 + MOVOA 160(R12), X13 + MOVOA 240(R12), X14 + MOVOA 288(R12), X15 - MOVQ SP,R12 - ADDQ $31, R12 - ANDQ $~31, R12 +MAINLOOP1: + MOVOA X1, 320(R12) + MOVOA X2, 336(R12) + MOVOA X13, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X14 + PSRLL $0x19, X2 + PXOR X2, X14 + MOVOA X7, X1 + PADDL X0, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X11 + PSRLL $0x19, X2 + PXOR X2, X11 + MOVOA X12, X1 + PADDL X14, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X15 + PSRLL $0x17, X2 + PXOR X2, X15 + MOVOA X0, X1 + PADDL X11, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X9 + PSRLL $0x17, X2 + PXOR X2, X9 + MOVOA X14, X1 + PADDL X15, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X13 + PSRLL $0x13, X2 + PXOR X2, X13 + MOVOA X11, X1 + PADDL X9, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X7 + PSRLL $0x13, X2 + PXOR X2, X7 + MOVOA X15, X1 + PADDL X13, X1 + MOVOA X1, X2 + PSLLL $0x12, X1 + PXOR X1, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA 320(R12), X1 + MOVOA X12, 320(R12) + MOVOA X9, X2 + PADDL X7, X2 + MOVOA X2, X12 + PSLLL $0x12, X2 + PXOR X2, X0 + PSRLL $0x0e, X12 + PXOR X12, X0 + MOVOA X5, X2 + PADDL X1, X2 + MOVOA X2, X12 + PSLLL $0x07, X2 + PXOR X2, X3 + PSRLL $0x19, X12 + PXOR X12, X3 + MOVOA 336(R12), X2 + MOVOA X0, 336(R12) + MOVOA X6, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X4 + PSRLL $0x19, X12 + PXOR X12, X4 + MOVOA X1, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X10 + PSRLL $0x17, X12 + PXOR X12, X10 + MOVOA X2, X0 + PADDL X4, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X8 + PSRLL $0x17, X12 + PXOR X12, X8 + MOVOA X3, X0 + PADDL X10, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X5 + PSRLL $0x13, X12 + PXOR X12, X5 + MOVOA X4, X0 + PADDL X8, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X6 + PSRLL $0x13, X12 + PXOR X12, X6 + MOVOA X10, X0 + PADDL X5, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA 320(R12), X0 + MOVOA X1, 320(R12) + MOVOA X4, X1 + PADDL X0, X1 + MOVOA X1, X12 + PSLLL $0x07, X1 + PXOR X1, X7 + PSRLL $0x19, X12 + PXOR X12, X7 + MOVOA X8, X1 + PADDL X6, X1 + MOVOA X1, X12 + PSLLL $0x12, X1 + PXOR X1, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 336(R12), X12 + MOVOA X2, 336(R12) + MOVOA X14, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X5 + PSRLL $0x19, X2 + PXOR X2, X5 + MOVOA X0, X1 + PADDL X7, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X10 + PSRLL $0x17, X2 + PXOR X2, X10 + MOVOA X12, X1 + PADDL X5, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X8 + PSRLL $0x17, X2 + PXOR X2, X8 + MOVOA X7, X1 + PADDL X10, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X4 + PSRLL $0x13, X2 + PXOR X2, X4 + MOVOA X5, X1 + PADDL X8, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X14 + PSRLL $0x13, X2 + PXOR X2, X14 + MOVOA X10, X1 + PADDL X4, X1 + MOVOA X1, X2 + PSLLL $0x12, X1 + PXOR X1, X0 + PSRLL $0x0e, X2 + PXOR X2, X0 + MOVOA 320(R12), X1 + MOVOA X0, 320(R12) + MOVOA X8, X0 + PADDL X14, X0 + MOVOA X0, X2 + PSLLL $0x12, X0 + PXOR X0, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA X11, X0 + PADDL X1, X0 + MOVOA X0, X2 + PSLLL $0x07, X0 + PXOR X0, X6 + PSRLL $0x19, X2 + PXOR X2, X6 + MOVOA 336(R12), X2 + MOVOA X12, 336(R12) + MOVOA X3, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X13 + PSRLL $0x19, X12 + PXOR X12, X13 + MOVOA X1, X0 + PADDL X6, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X15 + PSRLL $0x17, X12 + PXOR X12, X15 + MOVOA X2, X0 + PADDL X13, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X9 + PSRLL $0x17, X12 + PXOR X12, X9 + MOVOA X6, X0 + PADDL X15, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X11 + PSRLL $0x13, X12 + PXOR X12, X11 + MOVOA X13, X0 + PADDL X9, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X3 + PSRLL $0x13, X12 + PXOR X12, X3 + MOVOA X15, X0 + PADDL X11, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA X9, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 320(R12), X12 + MOVOA 336(R12), X0 + SUBQ $0x02, DX + JA MAINLOOP1 + PADDL 112(R12), X12 + PADDL 176(R12), X7 + PADDL 224(R12), X10 + PADDL 272(R12), X4 + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL (SI), DX + XORL 4(SI), CX + XORL 8(SI), R8 + XORL 12(SI), R9 + MOVL DX, (DI) + MOVL CX, 4(DI) + MOVL R8, 8(DI) + MOVL R9, 12(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL 64(SI), DX + XORL 68(SI), CX + XORL 72(SI), R8 + XORL 76(SI), R9 + MOVL DX, 64(DI) + MOVL CX, 68(DI) + MOVL R8, 72(DI) + MOVL R9, 76(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL 128(SI), DX + XORL 132(SI), CX + XORL 136(SI), R8 + XORL 140(SI), R9 + MOVL DX, 128(DI) + MOVL CX, 132(DI) + MOVL R8, 136(DI) + MOVL R9, 140(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + XORL 192(SI), DX + XORL 196(SI), CX + XORL 200(SI), R8 + XORL 204(SI), R9 + MOVL DX, 192(DI) + MOVL CX, 196(DI) + MOVL R8, 200(DI) + MOVL R9, 204(DI) + PADDL 240(R12), X14 + PADDL 64(R12), X0 + PADDL 128(R12), X5 + PADDL 192(R12), X8 + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 16(SI), DX + XORL 20(SI), CX + XORL 24(SI), R8 + XORL 28(SI), R9 + MOVL DX, 16(DI) + MOVL CX, 20(DI) + MOVL R8, 24(DI) + MOVL R9, 28(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 80(SI), DX + XORL 84(SI), CX + XORL 88(SI), R8 + XORL 92(SI), R9 + MOVL DX, 80(DI) + MOVL CX, 84(DI) + MOVL R8, 88(DI) + MOVL R9, 92(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 144(SI), DX + XORL 148(SI), CX + XORL 152(SI), R8 + XORL 156(SI), R9 + MOVL DX, 144(DI) + MOVL CX, 148(DI) + MOVL R8, 152(DI) + MOVL R9, 156(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + XORL 208(SI), DX + XORL 212(SI), CX + XORL 216(SI), R8 + XORL 220(SI), R9 + MOVL DX, 208(DI) + MOVL CX, 212(DI) + MOVL R8, 216(DI) + MOVL R9, 220(DI) + PADDL 288(R12), X15 + PADDL 304(R12), X11 + PADDL 80(R12), X1 + PADDL 144(R12), X6 + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 32(SI), DX + XORL 36(SI), CX + XORL 40(SI), R8 + XORL 44(SI), R9 + MOVL DX, 32(DI) + MOVL CX, 36(DI) + MOVL R8, 40(DI) + MOVL R9, 44(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 96(SI), DX + XORL 100(SI), CX + XORL 104(SI), R8 + XORL 108(SI), R9 + MOVL DX, 96(DI) + MOVL CX, 100(DI) + MOVL R8, 104(DI) + MOVL R9, 108(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 160(SI), DX + XORL 164(SI), CX + XORL 168(SI), R8 + XORL 172(SI), R9 + MOVL DX, 160(DI) + MOVL CX, 164(DI) + MOVL R8, 168(DI) + MOVL R9, 172(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + XORL 224(SI), DX + XORL 228(SI), CX + XORL 232(SI), R8 + XORL 236(SI), R9 + MOVL DX, 224(DI) + MOVL CX, 228(DI) + MOVL R8, 232(DI) + MOVL R9, 236(DI) + PADDL 160(R12), X13 + PADDL 208(R12), X9 + PADDL 256(R12), X3 + PADDL 96(R12), X2 + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 48(SI), DX + XORL 52(SI), CX + XORL 56(SI), R8 + XORL 60(SI), R9 + MOVL DX, 48(DI) + MOVL CX, 52(DI) + MOVL R8, 56(DI) + MOVL R9, 60(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 112(SI), DX + XORL 116(SI), CX + XORL 120(SI), R8 + XORL 124(SI), R9 + MOVL DX, 112(DI) + MOVL CX, 116(DI) + MOVL R8, 120(DI) + MOVL R9, 124(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 176(SI), DX + XORL 180(SI), CX + XORL 184(SI), R8 + XORL 188(SI), R9 + MOVL DX, 176(DI) + MOVL CX, 180(DI) + MOVL R8, 184(DI) + MOVL R9, 188(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + XORL 240(SI), DX + XORL 244(SI), CX + XORL 248(SI), R8 + XORL 252(SI), R9 + MOVL DX, 240(DI) + MOVL CX, 244(DI) + MOVL R8, 248(DI) + MOVL R9, 252(DI) + MOVQ 352(R12), R9 + SUBQ $0x00000100, R9 + ADDQ $0x00000100, SI + ADDQ $0x00000100, DI + CMPQ R9, $0x00000100 + JAE BYTESATLEAST256 + CMPQ R9, $0x00 + JBE DONE - MOVQ DX,R9 - MOVQ CX,DX - MOVQ R8,R10 - CMPQ R9,$0 - JBE DONE - START: - MOVL 20(R10),CX - MOVL 0(R10),R8 - MOVL 0(DX),AX - MOVL 16(R10),R11 - MOVL CX,0(R12) - MOVL R8, 4 (R12) - MOVL AX, 8 (R12) - MOVL R11, 12 (R12) - MOVL 8(DX),CX - MOVL 24(R10),R8 - MOVL 4(R10),AX - MOVL 4(DX),R11 - MOVL CX,16(R12) - MOVL R8, 20 (R12) - MOVL AX, 24 (R12) - MOVL R11, 28 (R12) - MOVL 12(DX),CX - MOVL 12(R10),DX - MOVL 28(R10),R8 - MOVL 8(R10),AX - MOVL DX,32(R12) - MOVL CX, 36 (R12) - MOVL R8, 40 (R12) - MOVL AX, 44 (R12) - MOVQ $1634760805,DX - MOVQ $857760878,CX - MOVQ $2036477234,R8 - MOVQ $1797285236,AX - MOVL DX,48(R12) - MOVL CX, 52 (R12) - MOVL R8, 56 (R12) - MOVL AX, 60 (R12) - CMPQ R9,$256 - JB BYTESBETWEEN1AND255 - MOVOA 48(R12),X0 - PSHUFL $0X55,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X3 - PSHUFL $0X00,X0,X0 - MOVOA X1,64(R12) - MOVOA X2,80(R12) - MOVOA X3,96(R12) - MOVOA X0,112(R12) - MOVOA 0(R12),X0 - PSHUFL $0XAA,X0,X1 - PSHUFL $0XFF,X0,X2 - PSHUFL $0X00,X0,X3 - PSHUFL $0X55,X0,X0 - MOVOA X1,128(R12) - MOVOA X2,144(R12) - MOVOA X3,160(R12) - MOVOA X0,176(R12) - MOVOA 16(R12),X0 - PSHUFL $0XFF,X0,X1 - PSHUFL $0X55,X0,X2 - PSHUFL $0XAA,X0,X0 - MOVOA X1,192(R12) - MOVOA X2,208(R12) - MOVOA X0,224(R12) - MOVOA 32(R12),X0 - PSHUFL $0X00,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X0 - MOVOA X1,240(R12) - MOVOA X2,256(R12) - MOVOA X0,272(R12) - BYTESATLEAST256: - MOVL 16(R12),DX - MOVL 36 (R12),CX - MOVL DX,288(R12) - MOVL CX,304(R12) - SHLQ $32,CX - ADDQ CX,DX - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 292 (R12) - MOVL CX, 308 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 296 (R12) - MOVL CX, 312 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 300 (R12) - MOVL CX, 316 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX,16(R12) - MOVL CX, 36 (R12) - MOVQ R9,352(R12) - MOVQ $20,DX - MOVOA 64(R12),X0 - MOVOA 80(R12),X1 - MOVOA 96(R12),X2 - MOVOA 256(R12),X3 - MOVOA 272(R12),X4 - MOVOA 128(R12),X5 - MOVOA 144(R12),X6 - MOVOA 176(R12),X7 - MOVOA 192(R12),X8 - MOVOA 208(R12),X9 - MOVOA 224(R12),X10 - MOVOA 304(R12),X11 - MOVOA 112(R12),X12 - MOVOA 160(R12),X13 - MOVOA 240(R12),X14 - MOVOA 288(R12),X15 - MAINLOOP1: - MOVOA X1,320(R12) - MOVOA X2,336(R12) - MOVOA X13,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X14 - PSRLL $25,X2 - PXOR X2,X14 - MOVOA X7,X1 - PADDL X0,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X11 - PSRLL $25,X2 - PXOR X2,X11 - MOVOA X12,X1 - PADDL X14,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X15 - PSRLL $23,X2 - PXOR X2,X15 - MOVOA X0,X1 - PADDL X11,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X9 - PSRLL $23,X2 - PXOR X2,X9 - MOVOA X14,X1 - PADDL X15,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X13 - PSRLL $19,X2 - PXOR X2,X13 - MOVOA X11,X1 - PADDL X9,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X7 - PSRLL $19,X2 - PXOR X2,X7 - MOVOA X15,X1 - PADDL X13,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA 320(R12),X1 - MOVOA X12,320(R12) - MOVOA X9,X2 - PADDL X7,X2 - MOVOA X2,X12 - PSLLL $18,X2 - PXOR X2,X0 - PSRLL $14,X12 - PXOR X12,X0 - MOVOA X5,X2 - PADDL X1,X2 - MOVOA X2,X12 - PSLLL $7,X2 - PXOR X2,X3 - PSRLL $25,X12 - PXOR X12,X3 - MOVOA 336(R12),X2 - MOVOA X0,336(R12) - MOVOA X6,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X4 - PSRLL $25,X12 - PXOR X12,X4 - MOVOA X1,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X10 - PSRLL $23,X12 - PXOR X12,X10 - MOVOA X2,X0 - PADDL X4,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X8 - PSRLL $23,X12 - PXOR X12,X8 - MOVOA X3,X0 - PADDL X10,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X5 - PSRLL $19,X12 - PXOR X12,X5 - MOVOA X4,X0 - PADDL X8,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X6 - PSRLL $19,X12 - PXOR X12,X6 - MOVOA X10,X0 - PADDL X5,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA 320(R12),X0 - MOVOA X1,320(R12) - MOVOA X4,X1 - PADDL X0,X1 - MOVOA X1,X12 - PSLLL $7,X1 - PXOR X1,X7 - PSRLL $25,X12 - PXOR X12,X7 - MOVOA X8,X1 - PADDL X6,X1 - MOVOA X1,X12 - PSLLL $18,X1 - PXOR X1,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 336(R12),X12 - MOVOA X2,336(R12) - MOVOA X14,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X5 - PSRLL $25,X2 - PXOR X2,X5 - MOVOA X0,X1 - PADDL X7,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X10 - PSRLL $23,X2 - PXOR X2,X10 - MOVOA X12,X1 - PADDL X5,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X8 - PSRLL $23,X2 - PXOR X2,X8 - MOVOA X7,X1 - PADDL X10,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X4 - PSRLL $19,X2 - PXOR X2,X4 - MOVOA X5,X1 - PADDL X8,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X14 - PSRLL $19,X2 - PXOR X2,X14 - MOVOA X10,X1 - PADDL X4,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X0 - PSRLL $14,X2 - PXOR X2,X0 - MOVOA 320(R12),X1 - MOVOA X0,320(R12) - MOVOA X8,X0 - PADDL X14,X0 - MOVOA X0,X2 - PSLLL $18,X0 - PXOR X0,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA X11,X0 - PADDL X1,X0 - MOVOA X0,X2 - PSLLL $7,X0 - PXOR X0,X6 - PSRLL $25,X2 - PXOR X2,X6 - MOVOA 336(R12),X2 - MOVOA X12,336(R12) - MOVOA X3,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X13 - PSRLL $25,X12 - PXOR X12,X13 - MOVOA X1,X0 - PADDL X6,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X15 - PSRLL $23,X12 - PXOR X12,X15 - MOVOA X2,X0 - PADDL X13,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X9 - PSRLL $23,X12 - PXOR X12,X9 - MOVOA X6,X0 - PADDL X15,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X11 - PSRLL $19,X12 - PXOR X12,X11 - MOVOA X13,X0 - PADDL X9,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X3 - PSRLL $19,X12 - PXOR X12,X3 - MOVOA X15,X0 - PADDL X11,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA X9,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 320(R12),X12 - MOVOA 336(R12),X0 - SUBQ $2,DX - JA MAINLOOP1 - PADDL 112(R12),X12 - PADDL 176(R12),X7 - PADDL 224(R12),X10 - PADDL 272(R12),X4 - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 0(SI),DX - XORL 4(SI),CX - XORL 8(SI),R8 - XORL 12(SI),R9 - MOVL DX,0(DI) - MOVL CX,4(DI) - MOVL R8,8(DI) - MOVL R9,12(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 64(SI),DX - XORL 68(SI),CX - XORL 72(SI),R8 - XORL 76(SI),R9 - MOVL DX,64(DI) - MOVL CX,68(DI) - MOVL R8,72(DI) - MOVL R9,76(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 128(SI),DX - XORL 132(SI),CX - XORL 136(SI),R8 - XORL 140(SI),R9 - MOVL DX,128(DI) - MOVL CX,132(DI) - MOVL R8,136(DI) - MOVL R9,140(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - XORL 192(SI),DX - XORL 196(SI),CX - XORL 200(SI),R8 - XORL 204(SI),R9 - MOVL DX,192(DI) - MOVL CX,196(DI) - MOVL R8,200(DI) - MOVL R9,204(DI) - PADDL 240(R12),X14 - PADDL 64(R12),X0 - PADDL 128(R12),X5 - PADDL 192(R12),X8 - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 16(SI),DX - XORL 20(SI),CX - XORL 24(SI),R8 - XORL 28(SI),R9 - MOVL DX,16(DI) - MOVL CX,20(DI) - MOVL R8,24(DI) - MOVL R9,28(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 80(SI),DX - XORL 84(SI),CX - XORL 88(SI),R8 - XORL 92(SI),R9 - MOVL DX,80(DI) - MOVL CX,84(DI) - MOVL R8,88(DI) - MOVL R9,92(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 144(SI),DX - XORL 148(SI),CX - XORL 152(SI),R8 - XORL 156(SI),R9 - MOVL DX,144(DI) - MOVL CX,148(DI) - MOVL R8,152(DI) - MOVL R9,156(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - XORL 208(SI),DX - XORL 212(SI),CX - XORL 216(SI),R8 - XORL 220(SI),R9 - MOVL DX,208(DI) - MOVL CX,212(DI) - MOVL R8,216(DI) - MOVL R9,220(DI) - PADDL 288(R12),X15 - PADDL 304(R12),X11 - PADDL 80(R12),X1 - PADDL 144(R12),X6 - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 32(SI),DX - XORL 36(SI),CX - XORL 40(SI),R8 - XORL 44(SI),R9 - MOVL DX,32(DI) - MOVL CX,36(DI) - MOVL R8,40(DI) - MOVL R9,44(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 96(SI),DX - XORL 100(SI),CX - XORL 104(SI),R8 - XORL 108(SI),R9 - MOVL DX,96(DI) - MOVL CX,100(DI) - MOVL R8,104(DI) - MOVL R9,108(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 160(SI),DX - XORL 164(SI),CX - XORL 168(SI),R8 - XORL 172(SI),R9 - MOVL DX,160(DI) - MOVL CX,164(DI) - MOVL R8,168(DI) - MOVL R9,172(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - XORL 224(SI),DX - XORL 228(SI),CX - XORL 232(SI),R8 - XORL 236(SI),R9 - MOVL DX,224(DI) - MOVL CX,228(DI) - MOVL R8,232(DI) - MOVL R9,236(DI) - PADDL 160(R12),X13 - PADDL 208(R12),X9 - PADDL 256(R12),X3 - PADDL 96(R12),X2 - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 48(SI),DX - XORL 52(SI),CX - XORL 56(SI),R8 - XORL 60(SI),R9 - MOVL DX,48(DI) - MOVL CX,52(DI) - MOVL R8,56(DI) - MOVL R9,60(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 112(SI),DX - XORL 116(SI),CX - XORL 120(SI),R8 - XORL 124(SI),R9 - MOVL DX,112(DI) - MOVL CX,116(DI) - MOVL R8,120(DI) - MOVL R9,124(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 176(SI),DX - XORL 180(SI),CX - XORL 184(SI),R8 - XORL 188(SI),R9 - MOVL DX,176(DI) - MOVL CX,180(DI) - MOVL R8,184(DI) - MOVL R9,188(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - XORL 240(SI),DX - XORL 244(SI),CX - XORL 248(SI),R8 - XORL 252(SI),R9 - MOVL DX,240(DI) - MOVL CX,244(DI) - MOVL R8,248(DI) - MOVL R9,252(DI) - MOVQ 352(R12),R9 - SUBQ $256,R9 - ADDQ $256,SI - ADDQ $256,DI - CMPQ R9,$256 - JAE BYTESATLEAST256 - CMPQ R9,$0 - JBE DONE - BYTESBETWEEN1AND255: - CMPQ R9,$64 - JAE NOCOPY - MOVQ DI,DX - LEAQ 360(R12),DI - MOVQ R9,CX +BYTESBETWEEN1AND255: + CMPQ R9, $0x40 + JAE NOCOPY + MOVQ DI, DX + LEAQ 360(R12), DI + MOVQ R9, CX REP; MOVSB - LEAQ 360(R12),DI - LEAQ 360(R12),SI - NOCOPY: - MOVQ R9,352(R12) - MOVOA 48(R12),X0 - MOVOA 0(R12),X1 - MOVOA 16(R12),X2 - MOVOA 32(R12),X3 - MOVOA X1,X4 - MOVQ $20,CX - MAINLOOP2: - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - SUBQ $4,CX - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PXOR X7,X7 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - JA MAINLOOP2 - PADDL 48(R12),X0 - PADDL 0(R12),X1 - PADDL 16(R12),X2 - PADDL 32(R12),X3 - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 0(SI),CX - XORL 48(SI),R8 - XORL 32(SI),R9 - XORL 16(SI),AX - MOVL CX,0(DI) - MOVL R8,48(DI) - MOVL R9,32(DI) - MOVL AX,16(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 20(SI),CX - XORL 4(SI),R8 - XORL 52(SI),R9 - XORL 36(SI),AX - MOVL CX,20(DI) - MOVL R8,4(DI) - MOVL R9,52(DI) - MOVL AX,36(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 40(SI),CX - XORL 24(SI),R8 - XORL 8(SI),R9 - XORL 56(SI),AX - MOVL CX,40(DI) - MOVL R8,24(DI) - MOVL R9,8(DI) - MOVL AX,56(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - XORL 60(SI),CX - XORL 44(SI),R8 - XORL 28(SI),R9 - XORL 12(SI),AX - MOVL CX,60(DI) - MOVL R8,44(DI) - MOVL R9,28(DI) - MOVL AX,12(DI) - MOVQ 352(R12),R9 - MOVL 16(R12),CX - MOVL 36 (R12),R8 - ADDQ $1,CX - SHLQ $32,R8 - ADDQ R8,CX - MOVQ CX,R8 - SHRQ $32,R8 - MOVL CX,16(R12) - MOVL R8, 36 (R12) - CMPQ R9,$64 - JA BYTESATLEAST65 - JAE BYTESATLEAST64 - MOVQ DI,SI - MOVQ DX,DI - MOVQ R9,CX + LEAQ 360(R12), DI + LEAQ 360(R12), SI + +NOCOPY: + MOVQ R9, 352(R12) + MOVOA 48(R12), X0 + MOVOA (R12), X1 + MOVOA 16(R12), X2 + MOVOA 32(R12), X3 + MOVOA X1, X4 + MOVQ $0x00000014, CX + +MAINLOOP2: + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + SUBQ $0x04, CX + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PXOR X7, X7 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + JA MAINLOOP2 + PADDL 48(R12), X0 + PADDL (R12), X1 + PADDL 16(R12), X2 + PADDL 32(R12), X3 + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL (SI), CX + XORL 48(SI), R8 + XORL 32(SI), R9 + XORL 16(SI), AX + MOVL CX, (DI) + MOVL R8, 48(DI) + MOVL R9, 32(DI) + MOVL AX, 16(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 20(SI), CX + XORL 4(SI), R8 + XORL 52(SI), R9 + XORL 36(SI), AX + MOVL CX, 20(DI) + MOVL R8, 4(DI) + MOVL R9, 52(DI) + MOVL AX, 36(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 40(SI), CX + XORL 24(SI), R8 + XORL 8(SI), R9 + XORL 56(SI), AX + MOVL CX, 40(DI) + MOVL R8, 24(DI) + MOVL R9, 8(DI) + MOVL AX, 56(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + XORL 60(SI), CX + XORL 44(SI), R8 + XORL 28(SI), R9 + XORL 12(SI), AX + MOVL CX, 60(DI) + MOVL R8, 44(DI) + MOVL R9, 28(DI) + MOVL AX, 12(DI) + MOVQ 352(R12), R9 + MOVL 16(R12), CX + MOVL 36(R12), R8 + ADDQ $0x01, CX + SHLQ $0x20, R8 + ADDQ R8, CX + MOVQ CX, R8 + SHRQ $0x20, R8 + MOVL CX, 16(R12) + MOVL R8, 36(R12) + CMPQ R9, $0x40 + JA BYTESATLEAST65 + JAE BYTESATLEAST64 + MOVQ DI, SI + MOVQ DX, DI + MOVQ R9, CX REP; MOVSB - BYTESATLEAST64: - DONE: + +BYTESATLEAST64: +DONE: RET - BYTESATLEAST65: - SUBQ $64,R9 - ADDQ $64,DI - ADDQ $64,SI - JMP BYTESBETWEEN1AND255 + +BYTESATLEAST65: + SUBQ $0x40, R9 + ADDQ $0x40, DI + ADDQ $0x40, SI + JMP BYTESBETWEEN1AND255 From 7eace71069e621a910a5158a1b46314d38f724ae Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Mon, 29 Jul 2024 17:49:34 -0400 Subject: [PATCH 37/57] chacha20poly1305: Avo port of chacha20poly1305_amd64.s This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Parameter metadata not found in the reference assembly file has been added, leading to a diff on the lines where those symbols are referenced. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="chacha20poly1305/chacha20poly1305_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) 155,157c155,157 < MOVQ dst(FP), DI < MOVQ key+24(FP), R8 < MOVQ src+48(FP), SI --- > MOVQ dst_base(FP), DI > MOVQ key_base+24(FP), R8 > MOVQ src_base+48(FP), SI 159c159 < MOVQ ad+72(FP), CX --- > MOVQ ad_base+72(FP), CX 4684,4686c4684,4686 < MOVQ dst(FP), DI < MOVQ key+24(FP), R8 < MOVQ src+48(FP), SI --- > MOVQ dst_base(FP), DI > MOVQ key_base+24(FP), R8 > MOVQ src_base+48(FP), SI 4688c4688 < MOVQ ad+72(FP), CX --- > MOVQ ad_base+72(FP), CX Change-Id: Ia3a8e70b7440944ee739499c41ddceb70e054ef9 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/601442 Reviewed-by: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Roland Shoemaker Reviewed-by: Dmitri Shuralyov --- .../_asm/chacha20poly1305_amd64_asm.go | 5516 ++++++++ chacha20poly1305/_asm/go.mod | 15 + chacha20poly1305/_asm/go.sum | 12 + chacha20poly1305/chacha20poly1305_amd64.s | 11503 +++++++++++++--- 4 files changed, 14818 insertions(+), 2228 deletions(-) create mode 100644 chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go create mode 100644 chacha20poly1305/_asm/go.mod create mode 100644 chacha20poly1305/_asm/go.sum diff --git a/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go b/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go new file mode 100644 index 0000000000..e9ba153b4c --- /dev/null +++ b/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go @@ -0,0 +1,5516 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This assembly implementation was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. + +package main + +import ( + "fmt" + "os" + "strings" + + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/chacha20poly1305" +) + +//go:generate go run . -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305 + +var ( + // General register allocation + oup GPPhysical = RDI + inp = RSI + inl = RBX + adp = RCX // free to reuse, after we hash the additional data + keyp = R8 // free to reuse, when we copy the key to stack + itr2 = R9 // general iterator + itr1 = RCX // general iterator + acc0 = R10 + acc1 = R11 + acc2 = R12 + t0 = R13 + t1 = R14 + t2 = R15 + t3 = R8 + + // Register and stack allocation for the SSE code + rStore Mem = Mem{Base: BP}.Offset(0 * 16) + sStore = Mem{Base: BP}.Offset(1 * 16) + state1Store = Mem{Base: BP}.Offset(2 * 16) + state2Store = Mem{Base: BP}.Offset(3 * 16) + tmpStore = Mem{Base: BP}.Offset(4 * 16) + ctr0Store = Mem{Base: BP}.Offset(5 * 16) + ctr1Store = Mem{Base: BP}.Offset(6 * 16) + ctr2Store = Mem{Base: BP}.Offset(7 * 16) + ctr3Store = Mem{Base: BP}.Offset(8 * 16) + A0 VecPhysical = X0 + A1 = X1 + A2 = X2 + B0 = X3 + B1 = X4 + B2 = X5 + C0 = X6 + C1 = X7 + C2 = X8 + D0 = X9 + D1 = X10 + D2 = X11 + T0 = X12 + T1 = X13 + T2 = X14 + T3 = X15 + A3 = T0 + B3 = T1 + C3 = T2 + D3 = T3 + + // Register and stack allocation for the AVX2 code + rsStoreAVX2 Mem = Mem{Base: BP}.Offset(0 * 32) + state1StoreAVX2 = Mem{Base: BP}.Offset(1 * 32) + state2StoreAVX2 = Mem{Base: BP}.Offset(2 * 32) + ctr0StoreAVX2 = Mem{Base: BP}.Offset(3 * 32) + ctr1StoreAVX2 = Mem{Base: BP}.Offset(4 * 32) + ctr2StoreAVX2 = Mem{Base: BP}.Offset(5 * 32) + ctr3StoreAVX2 = Mem{Base: BP}.Offset(6 * 32) + tmpStoreAVX2 = Mem{Base: BP}.Offset(7 * 32) // 256 bytes on stack + AA0 VecPhysical = Y0 + AA1 = Y5 + AA2 = Y6 + AA3 = Y7 + BB0 = Y14 + BB1 = Y9 + BB2 = Y10 + BB3 = Y11 + CC0 = Y12 + CC1 = Y13 + CC2 = Y8 + CC3 = Y15 + DD0 = Y4 + DD1 = Y1 + DD2 = Y2 + DD3 = Y3 + TT0 = DD3 + TT1 = AA3 + TT2 = BB3 + TT3 = CC3 +) + +const ThatPeskyUnicodeDot = "\u00b7" + +func main() { + Package("golang.org/x/crypto/chacha20poly1305") + ConstraintExpr("gc,!purego") + polyHashADInternal() + chacha20Poly1305Open() + chacha20Poly1305Seal() + Generate() + + var internalFunctions []string = []string{"·polyHashADInternal"} + removePeskyUnicodeDot(internalFunctions, "../chacha20poly1305_amd64.s") +} + +// Utility function to emit BYTE instruction +func BYTE(u8 U8) { + Instruction(&ir.Instruction{Opcode: "BYTE", Operands: []Op{u8}}) +} + +// PALIGNR $4, X3, X3 +func shiftB0Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X4, X4 +func shiftB1Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xe4)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X5, X5 +func shiftB2Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X13, X13 +func shiftB3Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x04)) +} + +// PALIGNR $8, X6, X6 +func shiftC0Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xf6)) + BYTE(U8(0x08)) +} + +// PALIGNR $8, X7, X7 +func shiftC1Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xff)) + BYTE(U8(0x08)) +} + +// PALIGNR $8, X8, X8 +func shiftC2Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xc0)) + BYTE(U8(0x08)) +} + +// PALIGNR $8, X14, X14 +func shiftC3Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xf6)) + BYTE(U8(0x08)) +} + +// PALIGNR $12, X9, X9 +func shiftD0Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xc9)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X10, X10 +func shiftD1Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xd2)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X11, X11 +func shiftD2Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X15, X15 +func shiftD3Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xff)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X3, X3 +func shiftB0Right() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X4, X4 +func shiftB1Right() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xe4)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X5, X5 +func shiftB2Right() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X13, X13 +func shiftB3Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x0c)) +} + +func shiftC0Right() { + shiftC0Left() +} + +func shiftC1Right() { + shiftC1Left() +} + +func shiftC2Right() { + shiftC2Left() +} + +func shiftC3Right() { + shiftC3Left() +} + +// PALIGNR $4, X9, X9 +func shiftD0Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xc9)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X10, X10 +func shiftD1Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xd2)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X11, X11 +func shiftD2Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X15, X15 +func shiftD3Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xff)) + BYTE(U8(0x04)) +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~SOME MACROS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +// Hack: ROL must be a #define macro as it is referenced by other macros +func defineROL() { + definition := + `#define ROL(N, R, T) \ + MOVO R, T; \ + PSLLL $(N), T; \ + PSRLL $(32-(N)), R; \ + PXOR T, R` + Comment("ROL rotates the uint32s in register R left by N bits, using temporary T.") + Instruction(&ir.Instruction{Opcode: definition}) +} + +// ROL rotates the uint32s in register R left by N bits, using temporary T. +func ROL(N uint64, R, T VecPhysical) { + // Hack: ROL must be a #define macro as it is referenced by other macros + Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL(%s, %s, %s)", I8(N).Asm(), R.Asm(), T.Asm())}) +} + +// Hack to get Avo to generate an #ifdef +// +// ROL16(R, T) definition depends on a compiler flag that specifies amd64 architectural level. +func defineROL16() { + definition := + `#ifdef GOAMD64_v2 + #define ROL16(R, T) PSHUFB ·rol16<>(SB), R + #else + #define ROL16(R, T) ROL(16, R, T) + #endif` + + Comment("ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.") + Instruction(&ir.Instruction{Opcode: definition}) +} + +// Hack to emit macro call +// +// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. +func ROL16(R, T VecPhysical) { + Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL16(%s, %s)", R.Asm(), T.Asm())}) +} + +// Hack to get Avo to generate an #ifdef +// +// ROL8(R, T) definition depends on a compiler flag that specifies amd64 architectural level. +func defineROL8() { + definition := + `#ifdef GOAMD64_v2 + #define ROL8(R, T) PSHUFB ·rol8<>(SB), R + #else + #define ROL8(R, T) ROL(8, R, T) + #endif` + + Comment("ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.") + Instruction(&ir.Instruction{Opcode: definition}) +} + +// Hack to emit macro call +// +// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. +func ROL8(R, T VecPhysical) { + Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL8(%s, %s)", R.Asm(), T.Asm())}) +} + +func chachaQR(A, B, C, D, T VecPhysical) { + PADDD(B, A) + PXOR(A, D) + ROL16(D, T) + PADDD(D, C) + PXOR(C, B) + MOVO(B, T) + PSLLL(Imm(12), T) + PSRLL(Imm(20), B) + PXOR(T, B) + PADDD(B, A) + PXOR(A, D) + ROL8(D, T) + PADDD(D, C) + PXOR(C, B) + MOVO(B, T) + PSLLL(Imm(7), T) + PSRLL(Imm(25), B) + PXOR(T, B) +} + +func chachaQR_AVX2(A, B, C, D, T VecPhysical) { + VPADDD(B, A, A) + VPXOR(A, D, D) + rol16 := rol16_DATA() + VPSHUFB(rol16, D, D) + VPADDD(D, C, C) + VPXOR(C, B, B) + VPSLLD(Imm(12), B, T) + VPSRLD(Imm(20), B, B) + VPXOR(T, B, B) + VPADDD(B, A, A) + VPXOR(A, D, D) + rol8 := rol8_DATA() + VPSHUFB(rol8, D, D) + VPADDD(D, C, C) + VPXOR(C, B, B) + VPSLLD(Imm(7), B, T) + VPSRLD(Imm(25), B, B) + VPXOR(T, B, B) +} + +func polyAdd(S Mem) { + ADDQ(S, acc0) + ADCQ(S.Offset(8), acc1) + ADCQ(Imm(1), acc2) +} + +func polyMulStage1() { + MOVQ(Mem{Base: BP}.Offset(0*8), RAX) + MOVQ(RAX, t2) + MULQ(acc0) + MOVQ(RAX, t0) + MOVQ(RDX, t1) + MOVQ(Mem{Base: BP}.Offset(0*8), RAX) + MULQ(acc1) + IMULQ(acc2, t2) + ADDQ(RAX, t1) + ADCQ(RDX, t2) +} + +func polyMulStage2() { + MOVQ(Mem{Base: BP}.Offset(1*8), RAX) + MOVQ(RAX, t3) + MULQ(acc0) + ADDQ(RAX, t1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc0) + MOVQ(Mem{Base: BP}.Offset(1*8), RAX) + MULQ(acc1) + ADDQ(RAX, t2) + ADCQ(Imm(0), RDX) +} + +func polyMulStage3() { + IMULQ(acc2, t3) + ADDQ(acc0, t2) + ADCQ(RDX, t3) +} + +func polyMulReduceStage() { + MOVQ(t0, acc0) + MOVQ(t1, acc1) + MOVQ(t2, acc2) + ANDQ(Imm(3), acc2) + MOVQ(t2, t0) + ANDQ(I8(-4), t0) + MOVQ(t3, t1) + SHRQ(Imm(2), t3, t2) + SHRQ(Imm(2), t3) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(0), acc2) + ADDQ(t2, acc0) + ADCQ(t3, acc1) + ADCQ(Imm(0), acc2) +} + +func polyMulStage1_AVX2() { + MOVQ(Mem{Base: BP}.Offset(0*8), RDX) + MOVQ(RDX, t2) + MULXQ(acc0, t0, t1) + IMULQ(acc2, t2) + MULXQ(acc1, RAX, RDX) + ADDQ(RAX, t1) + ADCQ(RDX, t2) +} + +func polyMulStage2_AVX2() { + MOVQ(Mem{Base: BP}.Offset(1*8), RDX) + MULXQ(acc0, acc0, RAX) + ADDQ(acc0, t1) + MULXQ(acc1, acc1, t3) + ADCQ(acc1, t2) + ADCQ(Imm(0), t3) +} + +func polyMulStage3_AVX2() { + IMULQ(acc2, RDX) + ADDQ(RAX, t2) + ADCQ(RDX, t3) +} + +func polyMul() { + polyMulStage1() + polyMulStage2() + polyMulStage3() + polyMulReduceStage() +} + +func polyMulAVX2() { + polyMulStage1_AVX2() + polyMulStage2_AVX2() + polyMulStage3_AVX2() + polyMulReduceStage() +} + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- + +func polyHashADInternal() { + Function("polyHashADInternal<>") + Attributes(NOSPLIT) + AllocLocal(0) + + Comment("Hack: Must declare #define macros inside of a function due to Avo constraints") + defineROL() + defineROL8() + defineROL16() + + // adp points to beginning of additional data + // itr2 holds ad length + XORQ(acc0, acc0) + XORQ(acc1, acc1) + XORQ(acc2, acc2) + CMPQ(itr2, Imm(13)) + JNE(LabelRef("hashADLoop")) + + openFastTLSAD() + hashADLoop() + hashADTail() + hashADTailLoop() + hashADTailFinish() + hashADDone() +} + +// Special treatment for the TLS case of 13 bytes +func openFastTLSAD() { + Label("openFastTLSAD") + MOVQ(Mem{Base: adp}, acc0) + MOVQ(Mem{Base: adp}.Offset(5), acc1) + SHRQ(Imm(24), acc1) + MOVQ(U32(1), acc2) + polyMul() + RET() +} + +// Hash in 16 byte chunks +func hashADLoop() { + Label("hashADLoop") + Comment("Hash in 16 byte chunks") + CMPQ(itr2, Imm(16)) + JB(LabelRef("hashADTail")) + polyAdd(Mem{Base: adp}.Offset(0)) + LEAQ(Mem{Base: adp}.Offset(1*16), adp) + SUBQ(Imm(16), itr2) + polyMul() + JMP(LabelRef("hashADLoop")) +} + +func hashADTail() { + Label("hashADTail") + CMPQ(itr2, Imm(0)) + JE(LabelRef("hashADDone")) + + Comment("Hash last < 16 byte tail") + XORQ(t0, t0) + XORQ(t1, t1) + XORQ(t2, t2) + ADDQ(itr2, adp) +} + +func hashADTailLoop() { + Label("hashADTailLoop") + SHLQ(Imm(8), t0, t1) + SHLQ(Imm(8), t0) + // Hack to get Avo to emit: + // MOVB -1(adp), t2 + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{Mem{Base: adp}.Offset(-1), t2}}) + XORQ(t2, t0) + DECQ(adp) + DECQ(itr2) + JNE(LabelRef("hashADTailLoop")) +} + +func hashADTailFinish() { + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() +} + +// Finished AD +func hashADDone() { + Label("hashADDone") + RET() +} + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- + +// Implements the following function fignature: +// +// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool +func chacha20Poly1305Open() { + Implement("chacha20Poly1305Open") + Attributes(0) + AllocLocal(288) + + Comment("For aligned stack access") + MOVQ(RSP, RBP) + ADDQ(Imm(32), RBP) + ANDQ(I8(-32), RBP) + + Load(Param("dst").Base(), oup) + Load(Param("key").Base(), keyp) + Load(Param("src").Base(), inp) + Load(Param("src").Len(), inl) + Load(Param("ad").Base(), adp) + + Comment("Check for AVX2 support") + CMPB(Mem{Symbol: Symbol{Name: ThatPeskyUnicodeDot + "useAVX2"}, Base: StaticBase}, Imm(1)) + JE(LabelRef("chacha20Poly1305Open_AVX2")) + + Comment("Special optimization, for very short buffers") + CMPQ(inl, Imm(128)) + JBE(LabelRef("openSSE128")) // About 16% faster + + Comment("For long buffers, prepare the poly key first") + chacha20Constants := chacha20Constants_DATA() + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + MOVO(D0, T1) + + Comment("Store state on stack for future use") + MOVO(B0, state1Store) + MOVO(C0, state2Store) + MOVO(D0, ctr3Store) + MOVQ(U32(10), itr2) + + openSSEPreparePolyKey() + openSSEMainLoop() + openSSEInternalLoop() + openSSEMainLoopDone() + openSSEFinalize() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 129 bytes + openSSE128() + openSSE128InnerCipherLoop() + openSSE128Open() + openSSETail16() + openSSETail16Store() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 64 bytes of ciphertext + openSSETail64() + openSSETail64LoopA() + openSSETail64LoopB() + openSSETail64DecLoop() + openSSETail64DecLoopDone() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of ciphertext + openSSETail128() + openSSETail128LoopA() + openSSETail128LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 192 bytes of ciphertext + openSSETail192() + openSSLTail192LoopA() + openSSLTail192LoopB() + openSSLTail192Store() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 256 bytes of ciphertext + openSSETail256() + openSSETail256Loop() + openSSETail256HashLoop() + + // ---------------------------------------------------------------------------- + // ------------------------- AVX2 Code ---------------------------------------- + chacha20Poly1305Open_AVX2() + openAVX2PreparePolyKey() + openAVX2InitialHash64() + openAVX2MainLoop() + openAVX2InternalLoop() + openAVX2MainLoopDone() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 193 bytes + openAVX2192() + openAVX2192InnerCipherLoop() + openAVX2ShortOpen() + openAVX2ShortOpenLoop() + openAVX2ShortTail32() + openAVX2ShortDone() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 321 bytes + openAVX2320() + openAVX2320InnerCipherLoop() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of ciphertext + openAVX2Tail128() + openAVX2Tail128LoopA() + openAVX2Tail128LoopB() + openAVX2TailLoop() + openAVX2Tail() + openAVX2TailDone() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 256 bytes of ciphertext + openAVX2Tail256() + openAVX2Tail256LoopA() + openAVX2Tail256LoopB() + openAVX2Tail256Hash() + openAVX2Tail256HashEnd() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 384 bytes of ciphertext + openAVX2Tail384() + openAVX2Tail384LoopB() + openAVX2Tail384LoopA() + openAVX2Tail384Hash() + openAVX2Tail384HashEnd() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 512 bytes of ciphertext + openAVX2Tail512() + openAVX2Tail512LoopB() + openAVX2Tail512LoopA() + openAVX2Tail512HashLoop() + openAVX2Tail512HashEnd() +} + +func openSSEPreparePolyKey() { + Label("openSSEPreparePolyKey") + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + DECQ(itr2) + JNE(LabelRef("openSSEPreparePolyKey")) + + Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded") + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(state1Store, B0) + + Comment("Clamp and store the key") + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVO(A0, rStore) + MOVO(B0, sStore) + + Comment("Hash AAD") + Load(Param("ad").Len(), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) +} + +func openSSEMainLoop() { + Label("openSSEMainLoop") + CMPQ(inl, U32(256)) + JB(LabelRef("openSSEMainLoopDone")) + + chacha20Constants := chacha20Constants_DATA() + sseIncMask := sseIncMask_DATA() + + Comment("Load state, increment counter blocks") + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + PADDL(sseIncMask, D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + + Comment("There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash") + Comment("2 blocks, and for the remaining 4 only 1 block - for a total of 16") + MOVQ(U32(4), itr1) + MOVQ(inp, itr2) +} + +func openSSEInternalLoop() { + Label("openSSEInternalLoop") + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyAdd(Mem{Base: itr2}.Offset(0)) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + polyMulStage1() + polyMulStage2() + LEAQ(Mem{Base: itr2}.Offset(2*8), itr2) + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + polyMulStage3() + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyMulReduceStage() + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + DECQ(itr1) + JGE(LabelRef("openSSEInternalLoop")) + + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMul() + LEAQ(Mem{Base: itr2}.Offset(2*8), itr2) + + CMPQ(itr1, I8(-6)) + JG(LabelRef("openSSEInternalLoop")) + + chacha20Constants := chacha20Constants_DATA() + Comment("Add in the state") + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C0) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr0Store, D0) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + + Comment("Load - xor - store") + MOVO(D3, tmpStore) + MOVOU(Mem{Base: inp}.Offset(0*16), D3) + PXOR(D3, A0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(Mem{Base: inp}.Offset(1*16), D3) + PXOR(D3, B0) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(Mem{Base: inp}.Offset(2*16), D3) + PXOR(D3, C0) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(Mem{Base: inp}.Offset(3*16), D3) + PXOR(D3, D0) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), D0) + PXOR(D0, A1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(Mem{Base: inp}.Offset(5*16), D0) + PXOR(D0, B1) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(Mem{Base: inp}.Offset(6*16), D0) + PXOR(D0, C1) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(D0, D1) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + MOVOU(Mem{Base: inp}.Offset(8*16), D0) + PXOR(D0, A2) + MOVOU(A2, Mem{Base: oup}.Offset(8*16)) + MOVOU(Mem{Base: inp}.Offset(9*16), D0) + PXOR(D0, B2) + MOVOU(B2, Mem{Base: oup}.Offset(9*16)) + MOVOU(Mem{Base: inp}.Offset(10*16), D0) + PXOR(D0, C2) + MOVOU(C2, Mem{Base: oup}.Offset(10*16)) + MOVOU(Mem{Base: inp}.Offset(11*16), D0) + PXOR(D0, D2) + MOVOU(D2, Mem{Base: oup}.Offset(11*16)) + MOVOU(Mem{Base: inp}.Offset(12*16), D0) + PXOR(D0, A3) + MOVOU(A3, Mem{Base: oup}.Offset(12*16)) + MOVOU(Mem{Base: inp}.Offset(13*16), D0) + PXOR(D0, B3) + MOVOU(B3, Mem{Base: oup}.Offset(13*16)) + MOVOU(Mem{Base: inp}.Offset(14*16), D0) + PXOR(D0, C3) + MOVOU(C3, Mem{Base: oup}.Offset(14*16)) + MOVOU(Mem{Base: inp}.Offset(15*16), D0) + PXOR(tmpStore, D0) + MOVOU(D0, Mem{Base: oup}.Offset(15*16)) + LEAQ(Mem{Base: inp}.Offset(256), inp) + LEAQ(Mem{Base: oup}.Offset(256), oup) + SUBQ(U32(256), inl) + JMP(LabelRef("openSSEMainLoop")) +} + +func openSSEMainLoopDone() { + Label("openSSEMainLoopDone") + Comment("Handle the various tail sizes efficiently") + TESTQ(inl, inl) + JE(LabelRef("openSSEFinalize")) + CMPQ(inl, Imm(64)) + JBE(LabelRef("openSSETail64")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("openSSETail128")) + CMPQ(inl, Imm(192)) + JBE(LabelRef("openSSETail192")) + JMP(LabelRef("openSSETail256")) +} + +func openSSEFinalize() { + Label("openSSEFinalize") + Comment("Hash in the PT, AAD lengths") + ADDQ(NewParamAddr("ad_len", 80), acc0) + ADCQ(NewParamAddr("src_len", 56), acc1) + ADCQ(Imm(1), acc2) + polyMul() + + Comment("Final reduce") + MOVQ(acc0, t0) + MOVQ(acc1, t1) + MOVQ(acc2, t2) + SUBQ(I8(-5), acc0) + SBBQ(I8(-1), acc1) + SBBQ(Imm(3), acc2) + CMOVQCS(t0, acc0) + CMOVQCS(t1, acc1) + CMOVQCS(t2, acc2) + + Comment("Add in the \"s\" part of the key") + ADDQ(sStore.Offset(0), acc0) + ADCQ(sStore.Offset(8), acc1) + + Comment("Finally, constant time compare to the tag at the end of the message") + XORQ(RAX, RAX) + MOVQ(U32(1), RDX) + XORQ(Mem{Base: inp}.Offset(0*8), acc0) + XORQ(Mem{Base: inp}.Offset(1*8), acc1) + ORQ(acc1, acc0) + CMOVQEQ(RDX, RAX) + + Comment("Return true iff tags are equal") + // Hack to get Avo to emit: + // MOVB AX, ret+96(FP) + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{AX, NewParamAddr("ret", 96)}}) + RET() +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 129 bytes + +// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks +func openSSE128() { + Label("openSSE128") + + chacha20Constants := chacha20Constants_DATA() + sseIncMask := sseIncMask_DATA() + + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(B0, T1) + MOVO(C0, T2) + MOVO(D1, T3) + MOVQ(U32(10), itr2) +} + +func openSSE128InnerCipherLoop() { + Label("openSSE128InnerCipherLoop") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + DECQ(itr2) + JNE(LabelRef("openSSE128InnerCipherLoop")) + + Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded") + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(T1, B0) + PADDL(T1, B1) + PADDL(T1, B2) + PADDL(T2, C1) + PADDL(T2, C2) + PADDL(T3, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, T3) + PADDL(T3, D2) + + Comment("Clamp and store the key") + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVOU(A0, rStore) + MOVOU(B0, sStore) + + Comment("Hash") + Load(Param("ad").Len(), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) +} + +func openSSE128Open() { + Label("openSSE128Open") + CMPQ(inl, Imm(16)) + JB(LabelRef("openSSETail16")) + SUBQ(Imm(16), inl) + + Comment("Load for hashing") + polyAdd(Mem{Base: inp}.Offset(0)) + + Comment("Load for decryption") + MOVOU(Mem{Base: inp}, T0) + PXOR(T0, A1) + MOVOU(A1, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + polyMul() + + Comment("Shift the stream \"left\"") + MOVO(B1, A1) + MOVO(C1, B1) + MOVO(D1, C1) + MOVO(A2, D1) + MOVO(B2, A2) + MOVO(C2, B2) + MOVO(D2, C2) + JMP(LabelRef("openSSE128Open")) +} + +func openSSETail16() { + Label("openSSETail16") + TESTQ(inl, inl) + JE(LabelRef("openSSEFinalize")) + + Comment("We can safely load the CT from the end, because it is padded with the MAC") + MOVQ(inl, itr2) + SHLQ(Imm(4), itr2) + andMask := andMask_DATA() + LEAQ(andMask, t0) + MOVOU(Mem{Base: inp}, T0) + ADDQ(inl, inp) + PAND(Mem{Base: t0, Index: itr2, Scale: 1}.Offset(-16), T0) + MOVO(T0, tmpStore.Offset(0)) + MOVQ(T0, t0) + MOVQ(tmpStore.Offset(8), t1) + PXOR(A1, T0) +} + +func openSSETail16Store() { + Comment("We can only store one byte at a time, since plaintext can be shorter than 16 bytes") + Label("openSSETail16Store") + MOVQ(T0, t3) + // Hack to get Avo to emit: + // MOVB t3, (oup) + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{t3, Mem{Base: oup}}}) + PSRLDQ(Imm(1), T0) + INCQ(oup) + DECQ(inl) + JNE(LabelRef("openSSETail16Store")) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() + JMP(LabelRef("openSSEFinalize")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of ciphertext + +// Need to decrypt up to 64 bytes - prepare single block +func openSSETail64() { + Label("openSSETail64") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(D0, ctr0Store) + XORQ(itr2, itr2) + MOVQ(inl, itr1) + CMPQ(itr1, Imm(16)) + JB(LabelRef("openSSETail64LoopB")) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openSSETail64LoopA() { + Label("openSSETail64LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() + SUBQ(Imm(16), itr1) +} + +func openSSETail64LoopB() { + Label("openSSETail64LoopB") + ADDQ(Imm(16), itr2) + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + + CMPQ(itr1, Imm(16)) + JAE(LabelRef("openSSETail64LoopA")) + + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openSSETail64LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(state1Store, B0) + PADDL(state2Store, C0) + PADDL(ctr0Store, D0) +} + +func openSSETail64DecLoop() { + Label("openSSETail64DecLoop") + CMPQ(inl, Imm(16)) + JB(LabelRef("openSSETail64DecLoopDone")) + SUBQ(Imm(16), inl) + MOVOU(Mem{Base: inp}, T0) + PXOR(T0, A0) + MOVOU(A0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(16), inp) + LEAQ(Mem{Base: oup}.Offset(16), oup) + MOVO(B0, A0) + MOVO(C0, B0) + MOVO(D0, C0) + JMP(LabelRef("openSSETail64DecLoop")) +} + +func openSSETail64DecLoopDone() { + Label("openSSETail64DecLoopDone") + MOVO(A0, A1) + JMP(LabelRef("openSSETail16")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext + +// Need to decrypt up to 128 bytes - prepare two blocks +func openSSETail128() { + Label("openSSETail128") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A1) + MOVO(state1Store, B1) + MOVO(state2Store, C1) + MOVO(ctr3Store, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(D1, ctr0Store) + MOVO(A1, A0) + MOVO(B1, B0) + MOVO(C1, C0) + MOVO(D1, D0) + PADDL(sseIncMask, D0) + MOVO(D0, ctr1Store) + XORQ(itr2, itr2) + MOVQ(inl, itr1) + ANDQ(I8(-16), itr1) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openSSETail128LoopA() { + Label("openSSETail128LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() +} + +func openSSETail128LoopB() { + Label("openSSETail128LoopB") + ADDQ(Imm(16), itr2) + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + + CMPQ(itr2, itr1) + JB(LabelRef("openSSETail128LoopA")) + + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openSSETail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(ctr1Store, D0) + PADDL(ctr0Store, D1) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A1) + PXOR(T1, B1) + PXOR(T2, C1) + PXOR(T3, D1) + MOVOU(A1, Mem{Base: oup}.Offset(0*16)) + MOVOU(B1, Mem{Base: oup}.Offset(1*16)) + MOVOU(C1, Mem{Base: oup}.Offset(2*16)) + MOVOU(D1, Mem{Base: oup}.Offset(3*16)) + + SUBQ(Imm(64), inl) + LEAQ(Mem{Base: inp}.Offset(64), inp) + LEAQ(Mem{Base: oup}.Offset(64), oup) + JMP(LabelRef("openSSETail64DecLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of ciphertext + +// Need to decrypt up to 192 bytes - prepare three blocks +func openSSETail192() { + Label("openSSETail192") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A2) + MOVO(state1Store, B2) + MOVO(state2Store, C2) + MOVO(ctr3Store, D2) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D2) + MOVO(D2, ctr0Store) + MOVO(A2, A1) + MOVO(B2, B1) + MOVO(C2, C1) + MOVO(D2, D1) + PADDL(sseIncMask, D1) + MOVO(D1, ctr1Store) + MOVO(A1, A0) + MOVO(B1, B0) + MOVO(C1, C0) + MOVO(D1, D0) + PADDL(sseIncMask, D0) + MOVO(D0, ctr2Store) + + MOVQ(inl, itr1) + MOVQ(U32(160), itr2) + CMPQ(itr1, Imm(160)) + CMOVQGT(itr2, itr1) + ANDQ(I8(-16), itr1) + XORQ(itr2, itr2) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openSSLTail192LoopA() { + Label("openSSLTail192LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() +} + +func openSSLTail192LoopB() { + Label("openSSLTail192LoopB") + ADDQ(Imm(16), itr2) + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + shiftB2Left() + shiftC2Left() + shiftD2Left() + + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + shiftB2Right() + shiftC2Right() + shiftD2Right() + + CMPQ(itr2, itr1) + JB(LabelRef("openSSLTail192LoopA")) + + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openSSLTail192LoopB")) + + CMPQ(inl, Imm(176)) + JB(LabelRef("openSSLTail192Store")) + + polyAdd(Mem{Base: inp}.Offset(160)) + polyMul() + + CMPQ(inl, Imm(192)) + JB(LabelRef("openSSLTail192Store")) + + polyAdd(Mem{Base: inp}.Offset(176)) + polyMul() +} + +func openSSLTail192Store() { + Label("openSSLTail192Store") + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state1Store, B2) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(state2Store, C2) + PADDL(ctr2Store, D0) + PADDL(ctr1Store, D1) + PADDL(ctr0Store, D2) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A2) + PXOR(T1, B2) + PXOR(T2, C2) + PXOR(T3, D2) + MOVOU(A2, Mem{Base: oup}.Offset(0*16)) + MOVOU(B2, Mem{Base: oup}.Offset(1*16)) + MOVOU(C2, Mem{Base: oup}.Offset(2*16)) + MOVOU(D2, Mem{Base: oup}.Offset(3*16)) + + MOVOU(Mem{Base: inp}.Offset(4*16), T0) + MOVOU(Mem{Base: inp}.Offset(5*16), T1) + MOVOU(Mem{Base: inp}.Offset(6*16), T2) + MOVOU(Mem{Base: inp}.Offset(7*16), T3) + PXOR(T0, A1) + PXOR(T1, B1) + PXOR(T2, C1) + PXOR(T3, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + + SUBQ(Imm(128), inl) + LEAQ(Mem{Base: inp}.Offset(128), inp) + LEAQ(Mem{Base: oup}.Offset(128), oup) + JMP(LabelRef("openSSETail64DecLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext + +// Need to decrypt up to 256 bytes - prepare four blocks +func openSSETail256() { + Label("openSSETail256") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + XORQ(itr2, itr2) +} + +// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication +func openSSETail256Loop() { + Label("openSSETail256Loop") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + polyMulStage1() + polyMulStage2() + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyMulStage3() + polyMulReduceStage() + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + ADDQ(Imm(2*8), itr2) + CMPQ(itr2, Imm(160)) + JB(LabelRef("openSSETail256Loop")) + MOVQ(inl, itr1) + ANDQ(I8(-16), itr1) +} + +func openSSETail256HashLoop() { + Label("openSSETail256HashLoop") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() + ADDQ(Imm(2*8), itr2) + CMPQ(itr2, itr1) + JB(LabelRef("openSSETail256HashLoop")) + + Comment("Add in the state") + chacha20Constants := chacha20Constants_DATA() + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C0) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr0Store, D0) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + MOVO(D3, tmpStore) + + Comment("Load - xor - store") + MOVOU(Mem{Base: inp}.Offset(0*16), D3) + PXOR(D3, A0) + MOVOU(Mem{Base: inp}.Offset(1*16), D3) + PXOR(D3, B0) + MOVOU(Mem{Base: inp}.Offset(2*16), D3) + PXOR(D3, C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D3) + PXOR(D3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), A0) + MOVOU(Mem{Base: inp}.Offset(5*16), B0) + MOVOU(Mem{Base: inp}.Offset(6*16), C0) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(A0, A1) + PXOR(B0, B1) + PXOR(C0, C1) + PXOR(D0, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + MOVOU(Mem{Base: inp}.Offset(8*16), A0) + MOVOU(Mem{Base: inp}.Offset(9*16), B0) + MOVOU(Mem{Base: inp}.Offset(10*16), C0) + MOVOU(Mem{Base: inp}.Offset(11*16), D0) + PXOR(A0, A2) + PXOR(B0, B2) + PXOR(C0, C2) + PXOR(D0, D2) + MOVOU(A2, Mem{Base: oup}.Offset(8*16)) + MOVOU(B2, Mem{Base: oup}.Offset(9*16)) + MOVOU(C2, Mem{Base: oup}.Offset(10*16)) + MOVOU(D2, Mem{Base: oup}.Offset(11*16)) + LEAQ(Mem{Base: inp}.Offset(192), inp) + LEAQ(Mem{Base: oup}.Offset(192), oup) + SUBQ(Imm(192), inl) + MOVO(A3, A0) + MOVO(B3, B0) + MOVO(C3, C0) + MOVO(tmpStore, D0) + + JMP(LabelRef("openSSETail64DecLoop")) +} + +// Functions to emit AVX instructions via BYTE directive + +// broadcasti128 16(r8), ymm14 +func VBROADCASTI128_16_R8_YMM14() { + BYTE(U8(0xc4)) + BYTE(U8(0x42)) + BYTE(U8(0x7d)) + BYTE(U8(0x5a)) + BYTE(U8(0x70)) + BYTE(U8(0x10)) +} + +// broadcasti128 32(r8), ymm12 +func VBROADCASTI128_32_R8_YMM12() { + BYTE(U8(0xc4)) + BYTE(U8(0x42)) + BYTE(U8(0x7d)) + BYTE(U8(0x5a)) + BYTE(U8(0x60)) + BYTE(U8(0x20)) +} + +// broadcasti128 48(r8), ymm4 +func VBROADCASTI128_48_R8_YMM4() { + BYTE(U8(0xc4)) + BYTE(U8(0xc2)) + BYTE(U8(0x7d)) + BYTE(U8(0x5a)) + BYTE(U8(0x60)) + BYTE(U8(0x30)) +} + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- + +func chacha20Poly1305Open_AVX2() { + Label("chacha20Poly1305Open_AVX2") + VZEROUPPER() + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VBROADCASTI128_16_R8_YMM14() + VBROADCASTI128_32_R8_YMM12() + VBROADCASTI128_48_R8_YMM4() + avx2InitMask := avx2InitMask_DATA() + VPADDD(avx2InitMask, DD0, DD0) + + Comment("Special optimization, for very short buffers") + CMPQ(inl, Imm(192)) + JBE(LabelRef("openAVX2192")) + CMPQ(inl, U32(320)) + JBE(LabelRef("openAVX2320")) + + Comment("For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream") + VMOVDQA(BB0, state1StoreAVX2) + VMOVDQA(CC0, state2StoreAVX2) + VMOVDQA(DD0, ctr3StoreAVX2) + MOVQ(U32(10), itr2) +} + +func openAVX2PreparePolyKey() { + Label("openAVX2PreparePolyKey") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + DECQ(itr2) + JNE(LabelRef("openAVX2PreparePolyKey")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(ctr3StoreAVX2, DD0, DD0) + + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for the first 64 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + + Comment("Hash AD + first 64 bytes") + // MOVQ ad_len+80(FP), itr2 + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + XORQ(itr1, itr1) +} + +func openAVX2InitialHash64() { + Label("openAVX2InitialHash64") + // polyAdd(0(inp)(itr1*1)) + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(0)) + polyMulAVX2() + ADDQ(Imm(16), itr1) + CMPQ(itr1, Imm(64)) + JNE(LabelRef("openAVX2InitialHash64")) + + Comment("Decrypt the first 64 bytes") + VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(1*32), BB0, BB0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(1*32)) + LEAQ(Mem{Base: inp}.Offset(2*32), inp) + LEAQ(Mem{Base: oup}.Offset(2*32), oup) + SUBQ(Imm(64), inl) +} + +func openAVX2MainLoop() { + Label("openAVX2MainLoop") + CMPQ(inl, U32(512)) + JB(LabelRef("openAVX2MainLoopDone")) + + Comment("Load state, increment counter blocks, store the incremented counters") + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + XORQ(itr1, itr1) +} + +// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications +// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext +func openAVX2InternalLoop() { + Label("openAVX2InternalLoop") + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(0 * 8)) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage1_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulStage2_AVX2() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyMulStage3_AVX2() + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(2 * 8)) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage1_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage2_AVX2() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage3_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulReduceStage() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(4 * 8)) + LEAQ(Mem{Base: itr1}.Offset(6*8), itr1) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage1_AVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + polyMulStage2_AVX2() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage3_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + CMPQ(itr1, U32(480)) + JNE(LabelRef("openAVX2InternalLoop")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + + Comment("We only hashed 480 of the 512 bytes available - hash the remaining 32 here") + polyAdd(Mem{Base: inp}.Offset(480)) + polyMulAVX2() + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPERM2I128(Imm(0x13), AA0, BB0, BB0) + VPERM2I128(Imm(0x02), CC0, DD0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + + Comment("and here") + polyAdd(Mem{Base: inp}.Offset(496)) + polyMulAVX2() + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + VPXOR(Mem{Base: inp}.Offset(12*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(13*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(14*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(15*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(12*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(13*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(14*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(15*32)) + LEAQ(Mem{Base: inp}.Offset(32*16), inp) + LEAQ(Mem{Base: oup}.Offset(32*16), oup) + SUBQ(U32(32*16), inl) + JMP(LabelRef("openAVX2MainLoop")) +} + +// Handle the various tail sizes efficiently +func openAVX2MainLoopDone() { + Label("openAVX2MainLoopDone") + Comment("Handle the various tail sizes efficiently") + TESTQ(inl, inl) + JE(LabelRef("openSSEFinalize")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("openAVX2Tail128")) + CMPQ(inl, U32(256)) + JBE(LabelRef("openAVX2Tail256")) + CMPQ(inl, U32(384)) + JBE(LabelRef("openAVX2Tail384")) + JMP(LabelRef("openAVX2Tail512")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes + +// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks +func openAVX2192() { + Label("openAVX2192") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VMOVDQA(DD0, DD2) + VMOVDQA(DD1, TT3) + MOVQ(U32(10), itr2) +} + +func openAVX2192InnerCipherLoop() { + Label("openAVX2192InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + DECQ(itr2) + JNE(LabelRef("openAVX2192InnerCipherLoop")) + VPADDD(AA2, AA0, AA0) + VPADDD(AA2, AA1, AA1) + VPADDD(BB2, BB0, BB0) + VPADDD(BB2, BB1, BB1) + VPADDD(CC2, CC0, CC0) + VPADDD(CC2, CC1, CC1) + VPADDD(DD2, DD0, DD0) + VPADDD(TT3, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 192 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) +} + +func openAVX2ShortOpen() { + Label("openAVX2ShortOpen") + Comment("Hash") + Load(Param("ad").Len(), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) +} + +func openAVX2ShortOpenLoop() { + Label("openAVX2ShortOpenLoop") + CMPQ(inl, Imm(32)) + JB(LabelRef("openAVX2ShortTail32")) + SUBQ(Imm(32), inl) + + Comment("Load for hashing") + polyAdd(Mem{Base: inp}.Offset(0 * 8)) + polyMulAVX2() + polyAdd(Mem{Base: inp}.Offset(2 * 8)) + polyMulAVX2() + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, AA0, AA0) + VMOVDQU(AA0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*32), inp) + LEAQ(Mem{Base: oup}.Offset(1*32), oup) + + Comment("Shift stream left") + VMOVDQA(BB0, AA0) + VMOVDQA(CC0, BB0) + VMOVDQA(DD0, CC0) + VMOVDQA(AA1, DD0) + VMOVDQA(BB1, AA1) + VMOVDQA(CC1, BB1) + VMOVDQA(DD1, CC1) + VMOVDQA(AA2, DD1) + VMOVDQA(BB2, AA2) + JMP(LabelRef("openAVX2ShortOpenLoop")) +} + +func openAVX2ShortTail32() { + Label("openAVX2ShortTail32") + CMPQ(inl, Imm(16)) + VMOVDQA(A0, A1) + JB(LabelRef("openAVX2ShortDone")) + + SUBQ(Imm(16), inl) + + Comment("Load for hashing") + polyAdd(Mem{Base: inp}.Offset(0 * 8)) + polyMulAVX2() + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, A0, T0) + VMOVDQU(T0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + VPERM2I128(Imm(0x11), AA0, AA0, AA0) + VMOVDQA(A0, A1) +} + +func openAVX2ShortDone() { + Label("openAVX2ShortDone") + VZEROUPPER() + JMP(LabelRef("openSSETail16")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes + +// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks +func openAVX2320() { + Label("openAVX2320") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(BB0, TT1) + VMOVDQA(CC0, TT2) + VMOVDQA(DD0, TT3) + MOVQ(U32(10), itr2) +} + +func openAVX2320InnerCipherLoop() { + Label("openAVX2320InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + DECQ(itr2) + JNE(LabelRef("openAVX2320InnerCipherLoop")) + + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, TT0) + VPADDD(TT0, AA0, AA0) + VPADDD(TT0, AA1, AA1) + VPADDD(TT0, AA2, AA2) + VPADDD(TT1, BB0, BB0) + VPADDD(TT1, BB1, BB1) + VPADDD(TT1, BB2, BB2) + VPADDD(TT2, CC0, CC0) + VPADDD(TT2, CC1, CC1) + VPADDD(TT2, CC2, CC2) + avx2IncMask := avx2IncMask_DATA() + VMOVDQA(avx2IncMask, TT0) + VPADDD(TT3, DD0, DD0) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD1, DD1) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD2, DD2) + + Comment("Clamp and store poly key") + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 320 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) + VPERM2I128(Imm(0x02), AA2, BB2, CC1) + VPERM2I128(Imm(0x02), CC2, DD2, DD1) + VPERM2I128(Imm(0x13), AA2, BB2, AA2) + VPERM2I128(Imm(0x13), CC2, DD2, BB2) + JMP(LabelRef("openAVX2ShortOpen")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext + +// Need to decrypt up to 128 bytes - prepare two blocks +func openAVX2Tail128() { + Label("openAVX2Tail128") + Comment("Need to decrypt up to 128 bytes - prepare two blocks") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA1) + VMOVDQA(state1StoreAVX2, BB1) + VMOVDQA(state2StoreAVX2, CC1) + VMOVDQA(ctr3StoreAVX2, DD1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD1, DD1) + VMOVDQA(DD1, DD0) + + XORQ(itr2, itr2) + MOVQ(inl, itr1) + ANDQ(I8(-16), itr1) + TESTQ(itr1, itr1) + JE(LabelRef("openAVX2Tail128LoopB")) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openAVX2Tail128LoopA() { + Label("openAVX2Tail128LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMulAVX2() +} + +func openAVX2Tail128LoopB() { + Label("openAVX2Tail128LoopB") + ADDQ(Imm(16), itr2) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD1, DD1, DD1) + CMPQ(itr2, itr1) + JB(LabelRef("openAVX2Tail128LoopA")) + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openAVX2Tail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(DD0, DD1, DD1) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) +} + +func openAVX2TailLoop() { + Label("openAVX2TailLoop") + CMPQ(inl, Imm(32)) + JB(LabelRef("openAVX2Tail")) + SUBQ(Imm(32), inl) + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, AA0, AA0) + VMOVDQU(AA0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*32), inp) + LEAQ(Mem{Base: oup}.Offset(1*32), oup) + VMOVDQA(BB0, AA0) + VMOVDQA(CC0, BB0) + VMOVDQA(DD0, CC0) + JMP(LabelRef("openAVX2TailLoop")) +} + +func openAVX2Tail() { + Label("openAVX2Tail") + CMPQ(inl, Imm(16)) + VMOVDQA(A0, A1) + JB(LabelRef("openAVX2TailDone")) + SUBQ(Imm(16), inl) + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, A0, T0) + VMOVDQU(T0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + VPERM2I128(Imm(0x11), AA0, AA0, AA0) + VMOVDQA(A0, A1) +} + +func openAVX2TailDone() { + Label("openAVX2TailDone") + VZEROUPPER() + JMP(LabelRef("openSSETail16")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext + +// Need to decrypt up to 256 bytes - prepare four blocks +func openAVX2Tail256() { + Label("openAVX2Tail256") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(DD0, TT1) + VMOVDQA(DD1, TT2) + + Comment("Compute the number of iterations that will hash data") + MOVQ(inl, tmpStoreAVX2) + MOVQ(inl, itr1) + SUBQ(Imm(128), itr1) + SHRQ(Imm(4), itr1) + MOVQ(U32(10), itr2) + CMPQ(itr1, Imm(10)) + CMOVQGT(itr2, itr1) + MOVQ(inp, inl) + XORQ(itr2, itr2) +} + +func openAVX2Tail256LoopA() { + Label("openAVX2Tail256LoopA") + polyAdd(Mem{Base: inl}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: inl}.Offset(16), inl) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openAVX2Tail256LoopB() { + Label("openAVX2Tail256LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + INCQ(itr2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + CMPQ(itr2, itr1) + JB(LabelRef("openAVX2Tail256LoopA")) + + CMPQ(itr2, Imm(10)) + JNE(LabelRef("openAVX2Tail256LoopB")) + + MOVQ(inl, itr2) + SUBQ(inp, inl) + MOVQ(inl, itr1) + MOVQ(tmpStoreAVX2, inl) +} + +// Hash the remainder of data (if any) +func openAVX2Tail256Hash() { + Label("openAVX2Tail256Hash") + ADDQ(Imm(16), itr1) + CMPQ(itr1, inl) + JGT(LabelRef("openAVX2Tail256HashEnd")) + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(16), itr2) + JMP(LabelRef("openAVX2Tail256Hash")) +} + +// Store 128 bytes safely, then go to store loop +func openAVX2Tail256HashEnd() { + Label("openAVX2Tail256HashEnd") + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(TT1, DD0, DD0) + VPADDD(TT2, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, AA2) + VPERM2I128(Imm(0x02), CC0, DD0, BB2) + VPERM2I128(Imm(0x13), AA0, BB0, CC2) + VPERM2I128(Imm(0x13), CC0, DD0, DD2) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + + VPXOR(Mem{Base: inp}.Offset(0*32), AA2, AA2) + VPXOR(Mem{Base: inp}.Offset(1*32), BB2, BB2) + VPXOR(Mem{Base: inp}.Offset(2*32), CC2, CC2) + VPXOR(Mem{Base: inp}.Offset(3*32), DD2, DD2) + VMOVDQU(AA2, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(BB2, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(CC2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(DD2, Mem{Base: oup}.Offset(3*32)) + LEAQ(Mem{Base: inp}.Offset(4*32), inp) + LEAQ(Mem{Base: oup}.Offset(4*32), oup) + SUBQ(Imm(4*32), inl) + + JMP(LabelRef("openAVX2TailLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext + +// Need to decrypt up to 384 bytes - prepare six blocks +func openAVX2Tail384() { + Label("openAVX2Tail384") + Comment("Need to decrypt up to 384 bytes - prepare six blocks") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + + Comment("Compute the number of iterations that will hash two blocks of data") + MOVQ(inl, tmpStoreAVX2) + MOVQ(inl, itr1) + SUBQ(U32(256), itr1) + SHRQ(Imm(4), itr1) + ADDQ(Imm(6), itr1) + MOVQ(U32(10), itr2) + CMPQ(itr1, Imm(10)) + CMOVQGT(itr2, itr1) + MOVQ(inp, inl) + XORQ(itr2, itr2) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openAVX2Tail384LoopB() { + Label("openAVX2Tail384LoopB") + polyAdd(Mem{Base: inl}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: inl}.Offset(16), inl) +} + +func openAVX2Tail384LoopA() { + Label("openAVX2Tail384LoopA") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + polyAdd(Mem{Base: inl}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: inl}.Offset(16), inl) + INCQ(itr2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + + CMPQ(itr2, itr1) + JB(LabelRef("openAVX2Tail384LoopB")) + + CMPQ(itr2, Imm(10)) + JNE(LabelRef("openAVX2Tail384LoopA")) + + MOVQ(inl, itr2) + SUBQ(inp, inl) + MOVQ(inl, itr1) + MOVQ(tmpStoreAVX2, inl) +} + +func openAVX2Tail384Hash() { + Label("openAVX2Tail384Hash") + ADDQ(Imm(16), itr1) + CMPQ(itr1, inl) + JGT(LabelRef("openAVX2Tail384HashEnd")) + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(16), itr2) + JMP(LabelRef("openAVX2Tail384Hash")) +} + +// Store 256 bytes safely, then go to store loop +func openAVX2Tail384HashEnd() { + Label("openAVX2Tail384HashEnd") + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + VPERM2I128(Imm(0x02), CC0, DD0, TT1) + VPERM2I128(Imm(0x13), AA0, BB0, TT2) + VPERM2I128(Imm(0x13), CC0, DD0, TT3) + VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, TT0) + VPERM2I128(Imm(0x02), CC1, DD1, TT1) + VPERM2I128(Imm(0x13), AA1, BB1, TT2) + VPERM2I128(Imm(0x13), CC1, DD1, TT3) + VPXOR(Mem{Base: inp}.Offset(4*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(5*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(6*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(7*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(7*32)) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + LEAQ(Mem{Base: inp}.Offset(8*32), inp) + LEAQ(Mem{Base: oup}.Offset(8*32), oup) + SUBQ(U32(8*32), inl) + JMP(LabelRef("openAVX2TailLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext + +func openAVX2Tail512() { + Label("openAVX2Tail512") + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + XORQ(itr1, itr1) + MOVQ(inp, itr2) +} + +func openAVX2Tail512LoopB() { + Label("openAVX2Tail512LoopB") + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(2*8), itr2) +} + +func openAVX2Tail512LoopA() { + Label("openAVX2Tail512LoopA") + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyAdd(Mem{Base: itr2}.Offset(0 * 8)) + polyMulAVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: itr2}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(4*8), itr2) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + INCQ(itr1) + CMPQ(itr1, Imm(4)) + JLT(LabelRef("openAVX2Tail512LoopB")) + + CMPQ(itr1, Imm(10)) + JNE(LabelRef("openAVX2Tail512LoopA")) + + MOVQ(inl, itr1) + SUBQ(U32(384), itr1) + ANDQ(I8(-16), itr1) +} + +func openAVX2Tail512HashLoop() { + Label("openAVX2Tail512HashLoop") + TESTQ(itr1, itr1) + JE(LabelRef("openAVX2Tail512HashEnd")) + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(16), itr2) + SUBQ(Imm(16), itr1) + JMP(LabelRef("openAVX2Tail512HashLoop")) +} + +func openAVX2Tail512HashEnd() { + Label("openAVX2Tail512HashEnd") + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPERM2I128(Imm(0x13), AA0, BB0, BB0) + VPERM2I128(Imm(0x02), CC0, DD0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + + LEAQ(Mem{Base: inp}.Offset(12*32), inp) + LEAQ(Mem{Base: oup}.Offset(12*32), oup) + SUBQ(U32(12*32), inl) + + JMP(LabelRef("openAVX2TailLoop")) +} + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- + +// Implements the following function fignature: +// +// func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) +func chacha20Poly1305Seal() { + Implement("chacha20Poly1305Seal") + Attributes(0) + AllocLocal(288) + + MOVQ(RSP, RBP) + ADDQ(Imm(32), RBP) + ANDQ(I32(-32), RBP) + Load(Param("dst").Base(), oup) + Load(Param("key").Base(), keyp) + Load(Param("src").Base(), inp) + Load(Param("src").Len(), inl) + Load(Param("ad").Base(), adp) + + CMPB(Mem{Symbol: Symbol{Name: ThatPeskyUnicodeDot + "useAVX2"}, Base: StaticBase}, Imm(1)) + JE(LabelRef("chacha20Poly1305Seal_AVX2")) + + Comment("Special optimization, for very short buffers") + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealSSE128")) + + Comment("In the seal case - prepare the poly key + 3 blocks of stream in the first iteration") + chacha20Constants := chacha20Constants_DATA() + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + + Comment("Store state on stack for future use") + MOVO(B0, state1Store) + MOVO(C0, state2Store) + + Comment("Load state, increment counter blocks") + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + MOVQ(U32(10), itr2) + + sealSSEIntroLoop() + sealSSEMainLoop() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 64 bytes of plaintext + sealSSETail64() + sealSSETail64LoopA() + sealSSETail64LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of plaintext + sealSSETail128() + sealSSETail128LoopA() + sealSSETail128LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 192 bytes of plaintext + sealSSETail192() + sealSSETail192LoopA() + sealSSETail192LoopB() + + // ---------------------------------------------------------------------------- + // Special seal optimization for buffers smaller than 129 bytes + sealSSE128() + sealSSE128SealHash() + sealSSE128Seal() + sealSSETail() + sealSSETailLoadLoop() + sealSSEFinalize() + + // ---------------------------------------------------------------------------- + // ------------------------- AVX2 Code ---------------------------------------- + chacha20Poly1305Seal_AVX2() + sealAVX2IntroLoop() + sealAVX2MainLoop() + sealAVX2InternalLoop() + sealAVX2InternalLoopStart() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 193 bytes + seal192AVX2() + sealAVX2192InnerCipherLoop() + sealAVX2ShortSeal() + sealAVX2SealHash() + sealAVX2ShortSealLoop() + sealAVX2ShortTail32() + sealAVX2ShortDone() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 321 bytes + seal320AVX2() + sealAVX2320InnerCipherLoop() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of ciphertext + sealAVX2Tail128() + sealAVX2Tail128LoopA() + sealAVX2Tail128LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 256 bytes of ciphertext + sealAVX2Tail256() + sealAVX2Tail256LoopA() + sealAVX2Tail256LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 384 bytes of ciphertext + sealAVX2Tail384() + sealAVX2Tail384LoopA() + sealAVX2Tail384LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 512 bytes of ciphertext + sealAVX2Tail512() + sealAVX2Tail512LoopA() + sealAVX2Tail512LoopB() +} + +func sealSSEIntroLoop() { + Label("sealSSEIntroLoop") + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + DECQ(itr2) + JNE(LabelRef("sealSSEIntroLoop")) + + Comment("Add in the state") + chacha20Constants := chacha20Constants_DATA() + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + + Comment("Clamp and store the key") + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVO(A0, rStore) + MOVO(B0, sStore) + + Comment("Hash AAD") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + + MOVOU(Mem{Base: inp}.Offset(0*16), A0) + MOVOU(Mem{Base: inp}.Offset(1*16), B0) + MOVOU(Mem{Base: inp}.Offset(2*16), C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D0) + PXOR(A0, A1) + PXOR(B0, B1) + PXOR(C0, C1) + PXOR(D0, D1) + MOVOU(A1, Mem{Base: oup}.Offset(0*16)) + MOVOU(B1, Mem{Base: oup}.Offset(1*16)) + MOVOU(C1, Mem{Base: oup}.Offset(2*16)) + MOVOU(D1, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), A0) + MOVOU(Mem{Base: inp}.Offset(5*16), B0) + MOVOU(Mem{Base: inp}.Offset(6*16), C0) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(A0, A2) + PXOR(B0, B2) + PXOR(C0, C2) + PXOR(D0, D2) + MOVOU(A2, Mem{Base: oup}.Offset(4*16)) + MOVOU(B2, Mem{Base: oup}.Offset(5*16)) + MOVOU(C2, Mem{Base: oup}.Offset(6*16)) + MOVOU(D2, Mem{Base: oup}.Offset(7*16)) + + MOVQ(U32(128), itr1) + SUBQ(Imm(128), inl) + LEAQ(Mem{Base: inp}.Offset(128), inp) + + MOVO(A3, A1) + MOVO(B3, B1) + MOVO(C3, C1) + MOVO(D3, D1) + + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSE128SealHash")) + + MOVOU(Mem{Base: inp}.Offset(0*16), A0) + MOVOU(Mem{Base: inp}.Offset(1*16), B0) + MOVOU(Mem{Base: inp}.Offset(2*16), C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D0) + PXOR(A0, A3) + PXOR(B0, B3) + PXOR(C0, C3) + PXOR(D0, D3) + MOVOU(A3, Mem{Base: oup}.Offset(8*16)) + MOVOU(B3, Mem{Base: oup}.Offset(9*16)) + MOVOU(C3, Mem{Base: oup}.Offset(10*16)) + MOVOU(D3, Mem{Base: oup}.Offset(11*16)) + + ADDQ(Imm(64), itr1) + SUBQ(Imm(64), inl) + LEAQ(Mem{Base: inp}.Offset(64), inp) + + MOVQ(U32(2), itr1) + MOVQ(U32(8), itr2) + + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSETail64")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealSSETail128")) + CMPQ(inl, Imm(192)) + JBE(LabelRef("sealSSETail192")) +} + +func sealSSEMainLoop() { + Label("sealSSEMainLoop") + Comment("Load state, increment counter blocks") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + + Label("sealSSEInnerLoop") + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyAdd(Mem{Base: oup}.Offset(0)) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + polyMulStage1() + polyMulStage2() + LEAQ(Mem{Base: oup}.Offset(2*8), oup) + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + polyMulStage3() + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyMulReduceStage() + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + DECQ(itr2) + JGE(LabelRef("sealSSEInnerLoop")) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(2*8), oup) + DECQ(itr1) + JG(LabelRef("sealSSEInnerLoop")) + + Comment("Add in the state") + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C0) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr0Store, D0) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + MOVO(D3, tmpStore) + + Comment("Load - xor - store") + MOVOU(Mem{Base: inp}.Offset(0*16), D3) + PXOR(D3, A0) + MOVOU(Mem{Base: inp}.Offset(1*16), D3) + PXOR(D3, B0) + MOVOU(Mem{Base: inp}.Offset(2*16), D3) + PXOR(D3, C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D3) + PXOR(D3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVO(tmpStore, D3) + + MOVOU(Mem{Base: inp}.Offset(4*16), A0) + MOVOU(Mem{Base: inp}.Offset(5*16), B0) + MOVOU(Mem{Base: inp}.Offset(6*16), C0) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(A0, A1) + PXOR(B0, B1) + PXOR(C0, C1) + PXOR(D0, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + MOVOU(Mem{Base: inp}.Offset(8*16), A0) + MOVOU(Mem{Base: inp}.Offset(9*16), B0) + MOVOU(Mem{Base: inp}.Offset(10*16), C0) + MOVOU(Mem{Base: inp}.Offset(11*16), D0) + PXOR(A0, A2) + PXOR(B0, B2) + PXOR(C0, C2) + PXOR(D0, D2) + MOVOU(A2, Mem{Base: oup}.Offset(8*16)) + MOVOU(B2, Mem{Base: oup}.Offset(9*16)) + MOVOU(C2, Mem{Base: oup}.Offset(10*16)) + MOVOU(D2, Mem{Base: oup}.Offset(11*16)) + ADDQ(Imm(192), inp) + MOVQ(U32(192), itr1) + SUBQ(Imm(192), inl) + MOVO(A3, A1) + MOVO(B3, B1) + MOVO(C3, C1) + MOVO(D3, D1) + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSE128SealHash")) + MOVOU(Mem{Base: inp}.Offset(0*16), A0) + MOVOU(Mem{Base: inp}.Offset(1*16), B0) + MOVOU(Mem{Base: inp}.Offset(2*16), C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D0) + PXOR(A0, A3) + PXOR(B0, B3) + PXOR(C0, C3) + PXOR(D0, D3) + MOVOU(A3, Mem{Base: oup}.Offset(12*16)) + MOVOU(B3, Mem{Base: oup}.Offset(13*16)) + MOVOU(C3, Mem{Base: oup}.Offset(14*16)) + MOVOU(D3, Mem{Base: oup}.Offset(15*16)) + LEAQ(Mem{Base: inp}.Offset(64), inp) + SUBQ(Imm(64), inl) + MOVQ(U32(6), itr1) + MOVQ(U32(4), itr2) + CMPQ(inl, Imm(192)) + JG(LabelRef("sealSSEMainLoop")) + + MOVQ(inl, itr1) + TESTQ(inl, inl) + JE(LabelRef("sealSSE128SealHash")) + MOVQ(U32(6), itr1) + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSETail64")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealSSETail128")) + JMP(LabelRef("sealSSETail192")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of plaintext + +// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes +func sealSSETail64() { + Label("sealSSETail64") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A1) + MOVO(state1Store, B1) + MOVO(state2Store, C1) + MOVO(ctr3Store, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(D1, ctr0Store) +} + +// Perform ChaCha rounds, while hashing the previously encrypted ciphertext +func sealSSETail64LoopA() { + Label("sealSSETail64LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealSSETail64LoopB() { + Label("sealSSETail64LoopB") + chachaQR(A1, B1, C1, D1, T1) + shiftB1Left() + shiftC1Left() + shiftD1Left() + chachaQR(A1, B1, C1, D1, T1) + shiftB1Right() + shiftC1Right() + shiftD1Right() + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) + + DECQ(itr1) + JG(LabelRef("sealSSETail64LoopA")) + + DECQ(itr2) + JGE(LabelRef("sealSSETail64LoopB")) + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A1) + PADDL(state1Store, B1) + PADDL(state2Store, C1) + PADDL(ctr0Store, D1) + + JMP(LabelRef("sealSSE128Seal")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of plaintext + +// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes +func sealSSETail128() { + Label("sealSSETail128") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(D0, ctr0Store) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(D1, ctr1Store) +} + +// Perform ChaCha rounds, while hashing the previously encrypted ciphertext +func sealSSETail128LoopA() { + Label("sealSSETail128LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealSSETail128LoopB() { + Label("sealSSETail128LoopB") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + + DECQ(itr1) + JG(LabelRef("sealSSETail128LoopA")) + + DECQ(itr2) + JGE(LabelRef("sealSSETail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(ctr0Store, D0) + PADDL(ctr1Store, D1) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A0) + PXOR(T1, B0) + PXOR(T2, C0) + PXOR(T3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + + MOVQ(U32(64), itr1) + LEAQ(Mem{Base: inp}.Offset(64), inp) + SUBQ(Imm(64), inl) + + JMP(LabelRef("sealSSE128SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of plaintext + +// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes +func sealSSETail192() { + Label("sealSSETail192") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(D0, ctr0Store) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(D1, ctr1Store) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(D2, ctr2Store) +} + +// Perform ChaCha rounds, while hashing the previously encrypted ciphertext +func sealSSETail192LoopA() { + Label("sealSSETail192LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealSSETail192LoopB() { + Label("sealSSETail192LoopB") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + shiftB2Left() + shiftC2Left() + shiftD2Left() + + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) + + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + shiftB2Right() + shiftC2Right() + shiftD2Right() + + DECQ(itr1) + JG(LabelRef("sealSSETail192LoopA")) + + DECQ(itr2) + JGE(LabelRef("sealSSETail192LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state1Store, B2) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(state2Store, C2) + PADDL(ctr0Store, D0) + PADDL(ctr1Store, D1) + PADDL(ctr2Store, D2) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A0) + PXOR(T1, B0) + PXOR(T2, C0) + PXOR(T3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), T0) + MOVOU(Mem{Base: inp}.Offset(5*16), T1) + MOVOU(Mem{Base: inp}.Offset(6*16), T2) + MOVOU(Mem{Base: inp}.Offset(7*16), T3) + PXOR(T0, A1) + PXOR(T1, B1) + PXOR(T2, C1) + PXOR(T3, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + + MOVO(A2, A1) + MOVO(B2, B1) + MOVO(C2, C1) + MOVO(D2, D1) + MOVQ(U32(128), itr1) + LEAQ(Mem{Base: inp}.Offset(128), inp) + SUBQ(Imm(128), inl) + + JMP(LabelRef("sealSSE128SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special seal optimization for buffers smaller than 129 bytes + +// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks +func sealSSE128() { + Label("sealSSE128") + chacha20Constants := chacha20Constants_DATA() + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(B0, T1) + MOVO(C0, T2) + MOVO(D1, T3) + MOVQ(U32(10), itr2) + + Label("sealSSE128InnerCipherLoop") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + DECQ(itr2) + JNE(LabelRef("sealSSE128InnerCipherLoop")) + + Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded") + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(T1, B0) + PADDL(T1, B1) + PADDL(T1, B2) + PADDL(T2, C1) + PADDL(T2, C2) + PADDL(T3, D1) + PADDL(sseIncMask, T3) + PADDL(T3, D2) + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVOU(A0, rStore) + MOVOU(B0, sStore) + + Comment("Hash") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + XORQ(itr1, itr1) +} + +// itr1 holds the number of bytes encrypted but not yet hashed +func sealSSE128SealHash() { + Label("sealSSE128SealHash") + CMPQ(itr1, Imm(16)) + JB(LabelRef("sealSSE128Seal")) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + + SUBQ(Imm(16), itr1) + ADDQ(Imm(16), oup) + + JMP(LabelRef("sealSSE128SealHash")) +} + +func sealSSE128Seal() { + Label("sealSSE128Seal") + CMPQ(inl, Imm(16)) + JB(LabelRef("sealSSETail")) + SUBQ(Imm(16), inl) + + Comment("Load for decryption") + MOVOU(Mem{Base: inp}, T0) + PXOR(T0, A1) + MOVOU(A1, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + + Comment("Extract for hashing") + MOVQ(A1, t0) + PSRLDQ(Imm(8), A1) + MOVQ(A1, t1) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() + + Comment("Shift the stream \"left\"") + MOVO(B1, A1) + MOVO(C1, B1) + MOVO(D1, C1) + MOVO(A2, D1) + MOVO(B2, A2) + MOVO(C2, B2) + MOVO(D2, C2) + JMP(LabelRef("sealSSE128Seal")) +} + +func sealSSETail() { + Label("sealSSETail") + TESTQ(inl, inl) + JE(LabelRef("sealSSEFinalize")) + + Comment("We can only load the PT one byte at a time to avoid read after end of buffer") + MOVQ(inl, itr2) + SHLQ(Imm(4), itr2) + andMask := andMask_DATA() + LEAQ(andMask, t0) + MOVQ(inl, itr1) + LEAQ(Mem{Base: inp, Index: inl, Scale: 1}.Offset(-1), inp) + XORQ(t2, t2) + XORQ(t3, t3) + XORQ(RAX, RAX) +} + +func sealSSETailLoadLoop() { + Label("sealSSETailLoadLoop") + SHLQ(Imm(8), t2, t3) + SHLQ(Imm(8), t2) + // Hack to get Avo to emit: + // MOVB (inp), AX + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{Mem{Base: inp}, AX}}) + XORQ(RAX, t2) + LEAQ(Mem{Base: inp}.Offset(-1), inp) + DECQ(itr1) + JNE(LabelRef("sealSSETailLoadLoop")) + MOVQ(t2, tmpStore.Offset(0)) + MOVQ(t3, tmpStore.Offset(8)) + PXOR(tmpStore.Offset(0), A1) + MOVOU(A1, Mem{Base: oup}) + MOVOU(Mem{Base: t0, Index: itr2, Scale: 1}.Offset(-16), T0) + PAND(T0, A1) + MOVQ(A1, t0) + PSRLDQ(Imm(8), A1) + MOVQ(A1, t1) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() + + ADDQ(inl, oup) +} + +func sealSSEFinalize() { + Label("sealSSEFinalize") + Comment("Hash in the buffer lengths") + ADDQ(NewParamAddr("ad_len", 80), acc0) + ADCQ(NewParamAddr("src_len", 56), acc1) + ADCQ(Imm(1), acc2) + polyMul() + + Comment("Final reduce") + MOVQ(acc0, t0) + MOVQ(acc1, t1) + MOVQ(acc2, t2) + SUBQ(I8(-5), acc0) + SBBQ(I8(-1), acc1) + SBBQ(Imm(3), acc2) + CMOVQCS(t0, acc0) + CMOVQCS(t1, acc1) + CMOVQCS(t2, acc2) + + Comment("Add in the \"s\" part of the key") + ADDQ(sStore.Offset(0), acc0) + ADCQ(sStore.Offset(8), acc1) + + Comment("Finally store the tag at the end of the message") + MOVQ(acc0, Mem{Base: oup}.Offset(0*8)) + MOVQ(acc1, Mem{Base: oup}.Offset(1*8)) + RET() +} + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- + +func chacha20Poly1305Seal_AVX2() { + Label("chacha20Poly1305Seal_AVX2") + VZEROUPPER() + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VBROADCASTI128_16_R8_YMM14() + VBROADCASTI128_32_R8_YMM12() + VBROADCASTI128_48_R8_YMM4() + avx2InitMask := avx2InitMask_DATA() + VPADDD(avx2InitMask, DD0, DD0) + + Comment("Special optimizations, for very short buffers") + CMPQ(inl, U32(192)) + JBE(LabelRef("seal192AVX2")) + CMPQ(inl, U32(320)) + JBE(LabelRef("seal320AVX2")) + + Comment("For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream") + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(BB0, state1StoreAVX2) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(CC0, state2StoreAVX2) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(DD0, ctr0StoreAVX2) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(DD1, ctr1StoreAVX2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + MOVQ(U32(10), itr2) +} + +func sealAVX2IntroLoop() { + Label("sealAVX2IntroLoop") + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD3, DD3, DD3) + + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD3, DD3, DD3) + DECQ(itr2) + JNE(LabelRef("sealAVX2IntroLoop")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPERM2I128(Imm(0x02), AA0, BB0, DD0) + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, DD0, DD0) + VMOVDQA(DD0, rsStoreAVX2) + + Comment("Hash AD") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + + Comment("Can store at least 320 bytes") + VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(1*32), CC0, CC0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(1*32)) + + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(2*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(3*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(4*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(5*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(3*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(5*32)) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(6*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(7*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(8*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(9*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(7*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(9*32)) + + MOVQ(U32(320), itr1) + SUBQ(U32(320), inl) + LEAQ(Mem{Base: inp}.Offset(320), inp) + + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), CC3, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), CC3, DD3, DD0) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealAVX2SealHash")) + + VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(1*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(2*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(3*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(11*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(12*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(13*32)) + SUBQ(Imm(128), inl) + LEAQ(Mem{Base: inp}.Offset(128), inp) + + MOVQ(U32(8), itr1) + MOVQ(U32(2), itr2) + + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealAVX2Tail128")) + CMPQ(inl, U32(256)) + JBE(LabelRef("sealAVX2Tail256")) + CMPQ(inl, U32(384)) + JBE(LabelRef("sealAVX2Tail384")) + CMPQ(inl, U32(512)) + JBE(LabelRef("sealAVX2Tail512")) + + Comment("We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop") + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD3, DD3, DD3) + + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + + SUBQ(Imm(16), oup) // Adjust the pointer + MOVQ(U32(9), itr1) + JMP(LabelRef("sealAVX2InternalLoopStart")) +} + +// Load state, increment counter blocks, store the incremented counters +func sealAVX2MainLoop() { + Label("sealAVX2MainLoop") + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + MOVQ(U32(10), itr1) +} + +func sealAVX2InternalLoop() { + Label("sealAVX2InternalLoop") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage1_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulStage2_AVX2() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyMulStage3_AVX2() + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() +} + +func sealAVX2InternalLoopStart() { + Label("sealAVX2InternalLoopStart") + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage1_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage2_AVX2() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage3_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulReduceStage() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: oup}.Offset(4 * 8)) + LEAQ(Mem{Base: oup}.Offset(6*8), oup) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage1_AVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + polyMulStage2_AVX2() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage3_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + DECQ(itr1) + JNE(LabelRef("sealAVX2InternalLoop")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + + Comment("We only hashed 480 of the 512 bytes available - hash the remaining 32 here") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(4*8), oup) + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPERM2I128(Imm(0x13), AA0, BB0, BB0) + VPERM2I128(Imm(0x02), CC0, DD0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + + Comment("and here") + polyAdd(Mem{Base: oup}.Offset(-2 * 8)) + polyMulAVX2() + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + VPXOR(Mem{Base: inp}.Offset(12*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(13*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(14*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(15*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(12*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(13*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(14*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(15*32)) + LEAQ(Mem{Base: inp}.Offset(32*16), inp) + SUBQ(U32(32*16), inl) + CMPQ(inl, U32(512)) + JG(LabelRef("sealAVX2MainLoop")) + + Comment("Tail can only hash 480 bytes") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(32), oup) + + MOVQ(U32(10), itr1) + MOVQ(U32(0), itr2) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealAVX2Tail128")) + CMPQ(inl, U32(256)) + JBE(LabelRef("sealAVX2Tail256")) + CMPQ(inl, U32(384)) + JBE(LabelRef("sealAVX2Tail384")) + JMP(LabelRef("sealAVX2Tail512")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes + +// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks +func seal192AVX2() { + Label("seal192AVX2") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VMOVDQA(DD0, DD2) + VMOVDQA(DD1, TT3) + MOVQ(U32(10), itr2) +} + +func sealAVX2192InnerCipherLoop() { + Label("sealAVX2192InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + DECQ(itr2) + JNE(LabelRef("sealAVX2192InnerCipherLoop")) + VPADDD(AA2, AA0, AA0) + VPADDD(AA2, AA1, AA1) + VPADDD(BB2, BB0, BB0) + VPADDD(BB2, BB1, BB1) + VPADDD(CC2, CC0, CC0) + VPADDD(CC2, CC1, CC1) + VPADDD(DD2, DD0, DD0) + VPADDD(TT3, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 192 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) +} + +func sealAVX2ShortSeal() { + Label("sealAVX2ShortSeal") + Comment("Hash aad") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + XORQ(itr1, itr1) +} + +func sealAVX2SealHash() { + Label("sealAVX2SealHash") + Comment("itr1 holds the number of bytes encrypted but not yet hashed") + CMPQ(itr1, Imm(16)) + JB(LabelRef("sealAVX2ShortSealLoop")) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + SUBQ(Imm(16), itr1) + ADDQ(Imm(16), oup) + JMP(LabelRef("sealAVX2SealHash")) +} + +func sealAVX2ShortSealLoop() { + Label("sealAVX2ShortSealLoop") + CMPQ(inl, Imm(32)) + JB(LabelRef("sealAVX2ShortTail32")) + SUBQ(Imm(32), inl) + + Comment("Load for encryption") + VPXOR(Mem{Base: inp}, AA0, AA0) + VMOVDQU(AA0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*32), inp) + + Comment("Now can hash") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(1*32), oup) + + Comment("Shift stream left") + VMOVDQA(BB0, AA0) + VMOVDQA(CC0, BB0) + VMOVDQA(DD0, CC0) + VMOVDQA(AA1, DD0) + VMOVDQA(BB1, AA1) + VMOVDQA(CC1, BB1) + VMOVDQA(DD1, CC1) + VMOVDQA(AA2, DD1) + VMOVDQA(BB2, AA2) + JMP(LabelRef("sealAVX2ShortSealLoop")) +} + +func sealAVX2ShortTail32() { + Label("sealAVX2ShortTail32") + CMPQ(inl, Imm(16)) + VMOVDQA(A0, A1) + JB(LabelRef("sealAVX2ShortDone")) + + SUBQ(Imm(16), inl) + + Comment("Load for encryption") + VPXOR(Mem{Base: inp}, A0, T0) + VMOVDQU(T0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + + Comment("Hash") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + VPERM2I128(Imm(0x11), AA0, AA0, AA0) + VMOVDQA(A0, A1) +} + +func sealAVX2ShortDone() { + Label("sealAVX2ShortDone") + VZEROUPPER() + JMP(LabelRef("sealSSETail")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes + +// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks +func seal320AVX2() { + Label("seal320AVX2") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(BB0, TT1) + VMOVDQA(CC0, TT2) + VMOVDQA(DD0, TT3) + MOVQ(U32(10), itr2) +} + +func sealAVX2320InnerCipherLoop() { + Label("sealAVX2320InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + DECQ(itr2) + JNE(LabelRef("sealAVX2320InnerCipherLoop")) + + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, TT0) + VPADDD(TT0, AA0, AA0) + VPADDD(TT0, AA1, AA1) + VPADDD(TT0, AA2, AA2) + VPADDD(TT1, BB0, BB0) + VPADDD(TT1, BB1, BB1) + VPADDD(TT1, BB2, BB2) + VPADDD(TT2, CC0, CC0) + VPADDD(TT2, CC1, CC1) + VPADDD(TT2, CC2, CC2) + avx2IncMask := avx2IncMask_DATA() + VMOVDQA(avx2IncMask, TT0) + VPADDD(TT3, DD0, DD0) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD1, DD1) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD2, DD2) + + Comment("Clamp and store poly key") + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 320 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) + VPERM2I128(Imm(0x02), AA2, BB2, CC1) + VPERM2I128(Imm(0x02), CC2, DD2, DD1) + VPERM2I128(Imm(0x13), AA2, BB2, AA2) + VPERM2I128(Imm(0x13), CC2, DD2, BB2) + JMP(LabelRef("sealAVX2ShortSeal")) +} + +// Need to decrypt up to 128 bytes - prepare two blocks: +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed. +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed. +func sealAVX2Tail128() { + Label("sealAVX2Tail128") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VMOVDQA(DD0, DD1) +} + +func sealAVX2Tail128LoopA() { + Label("sealAVX2Tail128LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealAVX2Tail128LoopB() { + Label("sealAVX2Tail128LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(Mem{Base: oup}.Offset(16)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(32), oup) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + DECQ(itr1) + JG(LabelRef("sealAVX2Tail128LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA1) + VPADDD(state1StoreAVX2, BB0, BB1) + VPADDD(state2StoreAVX2, CC0, CC1) + VPADDD(DD1, DD0, DD1) + + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + JMP(LabelRef("sealAVX2ShortSealLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext + +// Need to decrypt up to 256 bytes - prepare two blocks +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed +func sealAVX2Tail256() { + Label("sealAVX2Tail256") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(chacha20Constants, AA1) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(state1StoreAVX2, BB1) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(state2StoreAVX2, CC1) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(DD0, TT1) + VMOVDQA(DD1, TT2) +} + +func sealAVX2Tail256LoopA() { + Label("sealAVX2Tail256LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +// LIne 2493 +func sealAVX2Tail256LoopB() { + Label("sealAVX2Tail256LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(Mem{Base: oup}.Offset(16)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(32), oup) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + DECQ(itr1) + JG(LabelRef("sealAVX2Tail256LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail256LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(TT1, DD0, DD0) + VPADDD(TT2, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + VPERM2I128(Imm(0x02), CC0, DD0, TT1) + VPERM2I128(Imm(0x13), AA0, BB0, TT2) + VPERM2I128(Imm(0x13), CC0, DD0, TT3) + VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32)) + MOVQ(U32(128), itr1) + LEAQ(Mem{Base: inp}.Offset(128), inp) + SUBQ(Imm(128), inl) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + + JMP(LabelRef("sealAVX2SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext + +// Need to decrypt up to 384 bytes - prepare two blocks +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed +func sealAVX2Tail384() { + Label("sealAVX2Tail384") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(DD0, TT1) + VMOVDQA(DD1, TT2) + VMOVDQA(DD2, TT3) +} + +func sealAVX2Tail384LoopA() { + Label("sealAVX2Tail384LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealAVX2Tail384LoopB() { + Label("sealAVX2Tail384LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(Mem{Base: oup}.Offset(16)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(32), oup) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + DECQ(itr1) + JG(LabelRef("sealAVX2Tail384LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail384LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(TT1, DD0, DD0) + VPADDD(TT2, DD1, DD1) + VPADDD(TT3, DD2, DD2) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + VPERM2I128(Imm(0x02), CC0, DD0, TT1) + VPERM2I128(Imm(0x13), AA0, BB0, TT2) + VPERM2I128(Imm(0x13), CC0, DD0, TT3) + VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, TT0) + VPERM2I128(Imm(0x02), CC1, DD1, TT1) + VPERM2I128(Imm(0x13), AA1, BB1, TT2) + VPERM2I128(Imm(0x13), CC1, DD1, TT3) + VPXOR(Mem{Base: inp}.Offset(4*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(5*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(6*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(7*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(7*32)) + MOVQ(U32(256), itr1) + LEAQ(Mem{Base: inp}.Offset(256), inp) + SUBQ(U32(256), inl) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + + JMP(LabelRef("sealAVX2SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext + +// Need to decrypt up to 512 bytes - prepare two blocks +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed +func sealAVX2Tail512() { + Label("sealAVX2Tail512") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) +} + +func sealAVX2Tail512LoopA() { + Label("sealAVX2Tail512LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealAVX2Tail512LoopB() { + Label("sealAVX2Tail512LoopB") + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(4*8), oup) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + + DECQ(itr1) + JG(LabelRef("sealAVX2Tail512LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail512LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VPERM2I128(Imm(0x02), CC0, DD0, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(1*32)) + VPERM2I128(Imm(0x13), AA0, BB0, CC3) + VPXOR(Mem{Base: inp}.Offset(2*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(2*32)) + VPERM2I128(Imm(0x13), CC0, DD0, CC3) + VPXOR(Mem{Base: inp}.Offset(3*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(3*32)) + + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + + MOVQ(U32(384), itr1) + LEAQ(Mem{Base: inp}.Offset(384), inp) + SUBQ(U32(384), inl) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + + JMP(LabelRef("sealAVX2SealHash")) +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +var ( + // Pointers for memoizing DATA section symbols + chacha20Constants_DATA_ptr, + rol16_DATA_ptr, + rol8_DATA_ptr, + sseIncMask_DATA_ptr, + avx2IncMask_DATA_ptr, + avx2InitMask_DATA_ptr, + polyClampMask_DATA_ptr, + andMask_DATA_ptr *Mem +) + +var nothingUpMySleeve = [8]uint32{ + 0x61707865, + 0x3320646e, + 0x79622d32, + 0x6b206574, + 0x61707865, + 0x3320646e, + 0x79622d32, + 0x6b206574, +} + +// ChaCha20 constants +func chacha20Constants_DATA() Mem { + if chacha20Constants_DATA_ptr != nil { + return *chacha20Constants_DATA_ptr + } + + chacha20Constants := GLOBL(ThatPeskyUnicodeDot+"chacha20Constants", NOPTR|RODATA) + chacha20Constants_DATA_ptr = &chacha20Constants + for i, v := range nothingUpMySleeve { + DATA(i*4, U32(v)) + } + return chacha20Constants +} + +var rol16Consts = [4]uint64{ + 0x0504070601000302, + 0x0D0C0F0E09080B0A, + 0x0504070601000302, + 0x0D0C0F0E09080B0A, +} + +// <<< 16 with PSHUFB +func rol16_DATA() Mem { + if rol16_DATA_ptr != nil { + return *rol16_DATA_ptr + } + + rol16 := GLOBL(ThatPeskyUnicodeDot+"rol16", NOPTR|RODATA) + rol16_DATA_ptr = &rol16 + for i, v := range rol16Consts { + DATA(i*8, U64(v)) + } + return rol16 +} + +var rol8Consts = [4]uint64{ + 0x0605040702010003, + 0x0E0D0C0F0A09080B, + 0x0605040702010003, + 0x0E0D0C0F0A09080B, +} + +// <<< 8 with PSHUFB +func rol8_DATA() Mem { + if rol8_DATA_ptr != nil { + return *rol8_DATA_ptr + } + + rol8 := GLOBL(ThatPeskyUnicodeDot+"rol8", NOPTR|RODATA) + rol8_DATA_ptr = &rol8 + for i, v := range rol8Consts { + DATA(i*8, U64(v)) + } + return rol8 +} + +var avx2InitMaskConsts = [4]uint64{ + 0x0, + 0x0, + 0x1, + 0x0, +} + +func avx2InitMask_DATA() Mem { + if avx2InitMask_DATA_ptr != nil { + return *avx2InitMask_DATA_ptr + } + + avx2InitMask := GLOBL(ThatPeskyUnicodeDot+"avx2InitMask", NOPTR|RODATA) + avx2InitMask_DATA_ptr = &avx2InitMask + for i, v := range avx2InitMaskConsts { + DATA(i*8, U64(v)) + } + return avx2InitMask +} + +var avx2IncMaskConsts = [4]uint64{ + 0x2, + 0x0, + 0x2, + 0x0, +} + +func avx2IncMask_DATA() Mem { + if avx2IncMask_DATA_ptr != nil { + return *avx2IncMask_DATA_ptr + } + + avx2IncMask := GLOBL(ThatPeskyUnicodeDot+"avx2IncMask", NOPTR|RODATA) + avx2IncMask_DATA_ptr = &avx2IncMask + for i, v := range avx2IncMaskConsts { + DATA(i*8, U64(v)) + } + return avx2IncMask +} + +var polyClampMaskConsts = [4]uint64{ + 0x0FFFFFFC0FFFFFFF, + 0x0FFFFFFC0FFFFFFC, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, +} + +// Poly1305 key clamp +func polyClampMask_DATA() Mem { + if polyClampMask_DATA_ptr != nil { + return *polyClampMask_DATA_ptr + } + + polyClampMask := GLOBL(ThatPeskyUnicodeDot+"polyClampMask", NOPTR|RODATA) + polyClampMask_DATA_ptr = &polyClampMask + for i, v := range polyClampMaskConsts { + DATA(i*8, U64(v)) + } + return polyClampMask +} + +var sseIncMaskConsts = [2]uint64{ + 0x1, + 0x0, +} + +func sseIncMask_DATA() Mem { + if sseIncMask_DATA_ptr != nil { + return *sseIncMask_DATA_ptr + } + + sseIncMask := GLOBL(ThatPeskyUnicodeDot+"sseIncMask", NOPTR|RODATA) + sseIncMask_DATA_ptr = &sseIncMask + for i, v := range sseIncMaskConsts { + DATA(i*8, U64(v)) + } + return sseIncMask +} + +var andMaskConsts = [30]uint64{ + 0x00000000000000ff, + 0x0000000000000000, + 0x000000000000ffff, + 0x0000000000000000, + 0x0000000000ffffff, + 0x0000000000000000, + 0x00000000ffffffff, + 0x0000000000000000, + 0x000000ffffffffff, + 0x0000000000000000, + 0x0000ffffffffffff, + 0x0000000000000000, + 0x00ffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x00000000000000ff, + 0xffffffffffffffff, + 0x000000000000ffff, + 0xffffffffffffffff, + 0x0000000000ffffff, + 0xffffffffffffffff, + 0x00000000ffffffff, + 0xffffffffffffffff, + 0x000000ffffffffff, + 0xffffffffffffffff, + 0x0000ffffffffffff, + 0xffffffffffffffff, + 0x00ffffffffffffff, +} + +func andMask_DATA() Mem { + if andMask_DATA_ptr != nil { + return *andMask_DATA_ptr + } + + andMask := GLOBL(ThatPeskyUnicodeDot+"andMask", NOPTR|RODATA) + andMask_DATA_ptr = &andMask + for i, v := range andMaskConsts { + DATA(i*8, U64(v)) + } + return andMask +} + +// removePeskyUnicodeDot strips the dot from the relevant TEXT directives such that they +// can exist as internal assembly functions +// +// Avo v0.6.0 does not support the generation of internal assembly functions. Go's unicode +// dot tells the compiler to link a TEXT symbol to a function in the current Go package +// (or another package if specified). Avo unconditionally prepends the unicode dot to all +// TEXT symbols, making it impossible to emit an internal function without this hack. +// +// There is a pending PR to add internal functions to Avo: +// https://github.com/mmcloughlin/avo/pull/443 +// +// If merged it should allow the usage of InternalFunction("NAME") for the specified functions +func removePeskyUnicodeDot(internalFunctions []string, target string) { + bytes, err := os.ReadFile(target) + if err != nil { + panic(err) + } + + content := string(bytes) + + for _, from := range internalFunctions { + to := strings.ReplaceAll(from, ThatPeskyUnicodeDot, "") + content = strings.ReplaceAll(content, from, to) + } + + err = os.WriteFile(target, []byte(content), 0644) + if err != nil { + panic(err) + } +} diff --git a/chacha20poly1305/_asm/go.mod b/chacha20poly1305/_asm/go.mod new file mode 100644 index 0000000000..957baf2a64 --- /dev/null +++ b/chacha20poly1305/_asm/go.mod @@ -0,0 +1,15 @@ +module chacha20poly1305/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/chacha20poly1305/_asm/go.sum b/chacha20poly1305/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/chacha20poly1305/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/chacha20poly1305/chacha20poly1305_amd64.s b/chacha20poly1305/chacha20poly1305_amd64.s index 731d2ac6db..fd5ee845f9 100644 --- a/chacha20poly1305/chacha20poly1305_amd64.s +++ b/chacha20poly1305/chacha20poly1305_amd64.s @@ -1,2715 +1,9762 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. +// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT. //go:build gc && !purego #include "textflag.h" -// General register allocation -#define oup DI -#define inp SI -#define inl BX -#define adp CX // free to reuse, after we hash the additional data -#define keyp R8 // free to reuse, when we copy the key to stack -#define itr2 R9 // general iterator -#define itr1 CX // general iterator -#define acc0 R10 -#define acc1 R11 -#define acc2 R12 -#define t0 R13 -#define t1 R14 -#define t2 R15 -#define t3 R8 -// Register and stack allocation for the SSE code -#define rStore (0*16)(BP) -#define sStore (1*16)(BP) -#define state1Store (2*16)(BP) -#define state2Store (3*16)(BP) -#define tmpStore (4*16)(BP) -#define ctr0Store (5*16)(BP) -#define ctr1Store (6*16)(BP) -#define ctr2Store (7*16)(BP) -#define ctr3Store (8*16)(BP) -#define A0 X0 -#define A1 X1 -#define A2 X2 -#define B0 X3 -#define B1 X4 -#define B2 X5 -#define C0 X6 -#define C1 X7 -#define C2 X8 -#define D0 X9 -#define D1 X10 -#define D2 X11 -#define T0 X12 -#define T1 X13 -#define T2 X14 -#define T3 X15 -#define A3 T0 -#define B3 T1 -#define C3 T2 -#define D3 T3 -// Register and stack allocation for the AVX2 code -#define rsStoreAVX2 (0*32)(BP) -#define state1StoreAVX2 (1*32)(BP) -#define state2StoreAVX2 (2*32)(BP) -#define ctr0StoreAVX2 (3*32)(BP) -#define ctr1StoreAVX2 (4*32)(BP) -#define ctr2StoreAVX2 (5*32)(BP) -#define ctr3StoreAVX2 (6*32)(BP) -#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack -#define AA0 Y0 -#define AA1 Y5 -#define AA2 Y6 -#define AA3 Y7 -#define BB0 Y14 -#define BB1 Y9 -#define BB2 Y10 -#define BB3 Y11 -#define CC0 Y12 -#define CC1 Y13 -#define CC2 Y8 -#define CC3 Y15 -#define DD0 Y4 -#define DD1 Y1 -#define DD2 Y2 -#define DD3 Y3 -#define TT0 DD3 -#define TT1 AA3 -#define TT2 BB3 -#define TT3 CC3 -// ChaCha20 constants -DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 -DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 -// <<< 16 with PSHUFB -DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A -// <<< 8 with PSHUFB -DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B - -DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 -DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 - -DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 -DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 -// Poly1305 key clamp -DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF - -DATA ·sseIncMask<>+0x00(SB)/8, $0x1 -DATA ·sseIncMask<>+0x08(SB)/8, $0x0 -// To load/store the last < 16 bytes in a buffer -DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff -DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff - -GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 -GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 -GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 -// No PALIGNR in Go ASM yet (but VPALIGNR is present). -#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 -#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 -#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 -#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 -#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 -#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 -#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 -#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 -#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 -#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 -#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 -#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 -#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 -#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 -#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 -#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 -#define shiftC0Right shiftC0Left -#define shiftC1Right shiftC1Left -#define shiftC2Right shiftC2Left -#define shiftC3Right shiftC3Left -#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 -#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 -#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 -#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 - -// Some macros - -// ROL rotates the uint32s in register R left by N bits, using temporary T. -#define ROL(N, R, T) \ - MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R - -// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL16(R, T) PSHUFB ·rol16<>(SB), R -#else -#define ROL16(R, T) ROL(16, R, T) -#endif - -// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL8(R, T) PSHUFB ·rol8<>(SB), R -#else -#define ROL8(R, T) ROL(8, R, T) -#endif - -#define chachaQR(A, B, C, D, T) \ - PADDD B, A; PXOR A, D; ROL16(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ - PADDD B, A; PXOR A, D; ROL8(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B - -#define chachaQR_AVX2(A, B, C, D, T) \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B - -#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 -#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX -#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 -#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 - -#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 -#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 - -#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage -#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage -// ---------------------------------------------------------------------------- + +// func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 - // adp points to beginning of additional data - // itr2 holds ad length - XORQ acc0, acc0 - XORQ acc1, acc1 - XORQ acc2, acc2 - CMPQ itr2, $13 - JNE hashADLoop - -openFastTLSAD: - // Special treatment for the TLS case of 13 bytes - MOVQ (adp), acc0 - MOVQ 5(adp), acc1 - SHRQ $24, acc1 - MOVQ $1, acc2 - polyMul + // Hack: Must declare #define macros inside of a function due to Avo constraints + // ROL rotates the uint32s in register R left by N bits, using temporary T. + #define ROL(N, R, T) \ + MOVO R, T; \ + PSLLL $(N), T; \ + PSRLL $(32-(N)), R; \ + PXOR T, R + + // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL8(R, T) PSHUFB ·rol8<>(SB), R + #else + #define ROL8(R, T) ROL(8, R, T) + #endif + + // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL16(R, T) PSHUFB ·rol16<>(SB), R + #else + #define ROL16(R, T) ROL(16, R, T) + #endif + XORQ R10, R10 + XORQ R11, R11 + XORQ R12, R12 + CMPQ R9, $0x0d + JNE hashADLoop + MOVQ (CX), R10 + MOVQ 5(CX), R11 + SHRQ $0x18, R11 + MOVQ $0x00000001, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 RET hashADLoop: // Hash in 16 byte chunks - CMPQ itr2, $16 - JB hashADTail - polyAdd(0(adp)) - LEAQ (1*16)(adp), adp - SUBQ $16, itr2 - polyMul - JMP hashADLoop + CMPQ R9, $0x10 + JB hashADTail + ADDQ (CX), R10 + ADCQ 8(CX), R11 + ADCQ $0x01, R12 + LEAQ 16(CX), CX + SUBQ $0x10, R9 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + JMP hashADLoop hashADTail: - CMPQ itr2, $0 + CMPQ R9, $0x00 JE hashADDone // Hash last < 16 byte tail - XORQ t0, t0 - XORQ t1, t1 - XORQ t2, t2 - ADDQ itr2, adp + XORQ R13, R13 + XORQ R14, R14 + XORQ R15, R15 + ADDQ R9, CX hashADTailLoop: - SHLQ $8, t0, t1 - SHLQ $8, t0 - MOVB -1(adp), t2 - XORQ t2, t0 - DECQ adp - DECQ itr2 - JNE hashADTailLoop - -hashADTailFinish: - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - // Finished AD + SHLQ $0x08, R13, R14 + SHLQ $0x08, R13 + MOVB -1(CX), R15 + XORQ R15, R13 + DECQ CX + DECQ R9 + JNE hashADTailLoop + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + hashADDone: RET -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Open(dst, key, src, ad []byte) bool -TEXT ·chacha20Poly1305Open(SB), 0, $288-97 +// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX // Check for AVX2 support - CMPB ·useAVX2(SB), $1 + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Open_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE openSSE128 // About 16% faster + CMPQ BX, $0x80 + JBE openSSE128 // For long buffers, prepare the poly key first - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 - MOVO D0, T1 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X9, X13 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store - MOVO D0, ctr3Store - MOVQ $10, itr2 + MOVO X3, 32(BP) + MOVO X6, 48(BP) + MOVO X9, 128(BP) + MOVQ $0x0000000a, R9 openSSEPreparePolyKey: - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - DECQ itr2 - JNE openSSEPreparePolyKey + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + DECQ R9 + JNE openSSEPreparePolyKey // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore; MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSEMainLoop: - CMPQ inl, $256 + CMPQ BX, $0x00000100 JB openSSEMainLoopDone // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) - // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 - MOVQ $4, itr1 - MOVQ inp, itr2 + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash + // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $0x00000004, CX + MOVQ SI, R9 openSSEInternalLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(itr2)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(itr2), itr2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr1 - JGE openSSEInternalLoop - - polyAdd(0(itr2)) - polyMul - LEAQ (2*8)(itr2), itr2 - - CMPQ itr1, $-6 - JG openSSEInternalLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(R9), R9 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ CX + JGE openSSEInternalLoop + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + CMPQ CX, $-6 + JG openSSEInternalLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Load - xor - store - MOVO D3, tmpStore - MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) - MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) - MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) - MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) - MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) - MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) - MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) - MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) - MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) - MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) - MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) - MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) - MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) - MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) - LEAQ 256(inp), inp - LEAQ 256(oup), oup - SUBQ $256, inl + MOVO X15, 64(BP) + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU X0, (DI) + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU X3, 16(DI) + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU X6, 32(DI) + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X9, 48(DI) + MOVOU 64(SI), X9 + PXOR X9, X1 + MOVOU X1, 64(DI) + MOVOU 80(SI), X9 + PXOR X9, X4 + MOVOU X4, 80(DI) + MOVOU 96(SI), X9 + PXOR X9, X7 + MOVOU X7, 96(DI) + MOVOU 112(SI), X9 + PXOR X9, X10 + MOVOU X10, 112(DI) + MOVOU 128(SI), X9 + PXOR X9, X2 + MOVOU X2, 128(DI) + MOVOU 144(SI), X9 + PXOR X9, X5 + MOVOU X5, 144(DI) + MOVOU 160(SI), X9 + PXOR X9, X8 + MOVOU X8, 160(DI) + MOVOU 176(SI), X9 + PXOR X9, X11 + MOVOU X11, 176(DI) + MOVOU 192(SI), X9 + PXOR X9, X12 + MOVOU X12, 192(DI) + MOVOU 208(SI), X9 + PXOR X9, X13 + MOVOU X13, 208(DI) + MOVOU 224(SI), X9 + PXOR X9, X14 + MOVOU X14, 224(DI) + MOVOU 240(SI), X9 + PXOR 64(BP), X9 + MOVOU X9, 240(DI) + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openSSEMainLoop openSSEMainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $64 + CMPQ BX, $0x40 JBE openSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openSSETail128 - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openSSETail192 JMP openSSETail256 openSSEFinalize: // Hash in the PT, AAD lengths - ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally, constant time compare to the tag at the end of the message XORQ AX, AX - MOVQ $1, DX - XORQ (0*8)(inp), acc0 - XORQ (1*8)(inp), acc1 - ORQ acc1, acc0 + MOVQ $0x00000001, DX + XORQ (SI), R10 + XORQ 8(SI), R11 + ORQ R11, R10 CMOVQEQ DX, AX // Return true iff tags are equal MOVB AX, ret+96(FP) RET -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 129 bytes openSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 openSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE openSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE openSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore; MOVOU B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSE128Open: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail16 - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0(inp)) + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 // Load for decryption - MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - polyMul + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP openSSE128Open openSSETail16: - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize // We can safely load the CT from the end, because it is padded with the MAC - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVOU (inp), T0 - ADDQ inl, inp - PAND -16(t0)(itr2*1), T0 - MOVO T0, 0+tmpStore - MOVQ T0, t0 - MOVQ 8+tmpStore, t1 - PXOR A1, T0 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVOU (SI), X12 + ADDQ BX, SI + PAND -16(R13)(R9*1), X12 + MOVO X12, 64(BP) + MOVQ X12, R13 + MOVQ 72(BP), R14 + PXOR X1, X12 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes openSSETail16Store: - MOVQ T0, t3 - MOVB t3, (oup) - PSRLDQ $1, T0 - INCQ oup - DECQ inl + MOVQ X12, R8 + MOVB R8, (DI) + PSRLDQ $0x01, X12 + INCQ DI + DECQ BX JNE openSSETail16Store - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 JMP openSSEFinalize -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of ciphertext openSSETail64: - // Need to decrypt up to 64 bytes - prepare single block - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - XORQ itr2, itr2 - MOVQ inl, itr1 - CMPQ itr1, $16 - JB openSSETail64LoopB + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + XORQ R9, R9 + MOVQ BX, CX + CMPQ CX, $0x10 + JB openSSETail64LoopB openSSETail64LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul - SUBQ $16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX openSSETail64LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - - CMPQ itr1, $16 - JAE openSSETail64LoopA - - CMPQ itr2, $160 - JNE openSSETail64LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + CMPQ CX, $0x10 + JAE openSSETail64LoopA + CMPQ R9, $0xa0 + JNE openSSETail64LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 + PADDL 48(BP), X6 + PADDL 80(BP), X9 openSSETail64DecLoop: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail64DecLoopDone - SUBQ $16, inl - MOVOU (inp), T0 - PXOR T0, A0 - MOVOU A0, (oup) - LEAQ 16(inp), inp - LEAQ 16(oup), oup - MOVO B0, A0 - MOVO C0, B0 - MOVO D0, C0 + SUBQ $0x10, BX + MOVOU (SI), X12 + PXOR X12, X0 + MOVOU X0, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVO X3, X0 + MOVO X6, X3 + MOVO X9, X6 JMP openSSETail64DecLoop openSSETail64DecLoopDone: - MOVO A0, A1 + MOVO X0, X1 JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openSSETail128: - // Need to decrypt up to 128 bytes - prepare two blocks - MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 96(BP) + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX openSSETail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSETail128LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - CMPQ itr2, itr1 - JB openSSETail128LoopA - - CMPQ itr2, $160 - JNE openSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr1Store, D0; PADDL ctr0Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - - SUBQ $64, inl - LEAQ 64(inp), inp - LEAQ 64(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of ciphertext + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + CMPQ R9, CX + JB openSSETail128LoopA + CMPQ R9, $0xa0 + JNE openSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 96(BP), X9 + PADDL 80(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + SUBQ $0x40, BX + LEAQ 64(SI), SI + LEAQ 64(DI), DI + JMP openSSETail64DecLoop + openSSETail192: - // Need to decrypt up to 192 bytes - prepare three blocks - MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store - MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store - - MOVQ inl, itr1 - MOVQ $160, itr2 - CMPQ itr1, $160 - CMOVQGT itr2, itr1 - ANDQ $-16, itr1 - XORQ itr2, itr2 + MOVO ·chacha20Constants<>+0(SB), X2 + MOVO 32(BP), X5 + MOVO 48(BP), X8 + MOVO 128(BP), X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 80(BP) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 112(BP) + MOVQ BX, CX + MOVQ $0x000000a0, R9 + CMPQ CX, $0xa0 + CMOVQGT R9, CX + ANDQ $-16, CX + XORQ R9, R9 openSSLTail192LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - CMPQ itr2, itr1 - JB openSSLTail192LoopA - - CMPQ itr2, $160 - JNE openSSLTail192LoopB - - CMPQ inl, $176 - JB openSSLTail192Store - - polyAdd(160(inp)) - polyMul - - CMPQ inl, $192 - JB openSSLTail192Store - - polyAdd(176(inp)) - polyMul + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + CMPQ R9, CX + JB openSSLTail192LoopA + CMPQ R9, $0xa0 + JNE openSSLTail192LoopB + CMPQ BX, $0xb0 + JB openSSLTail192Store + ADDQ 160(SI), R10 + ADCQ 168(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + CMPQ BX, $0xc0 + JB openSSLTail192Store + ADDQ 176(SI), R10 + ADCQ 184(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192Store: - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 - MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) - - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - SUBQ $128, inl - LEAQ 128(inp), inp - LEAQ 128(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 112(BP), X9 + PADDL 96(BP), X10 + PADDL 80(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X2 + PXOR X13, X5 + PXOR X14, X8 + PXOR X15, X11 + MOVOU X2, (DI) + MOVOU X5, 16(DI) + MOVOU X8, 32(DI) + MOVOU X11, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + LEAQ 128(DI), DI + JMP openSSETail64DecLoop + openSSETail256: - // Need to decrypt up to 256 bytes - prepare four blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - XORQ itr2, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + XORQ R9, R9 openSSETail256Loop: - // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication - polyAdd(0(inp)(itr2*1)) - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulStage3 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - ADDQ $2*8, itr2 - CMPQ itr2, $160 - JB openSSETail256Loop - MOVQ inl, itr1 - ANDQ $-16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + ADDQ $0x10, R9 + CMPQ R9, $0xa0 + JB openSSETail256Loop + MOVQ BX, CX + ANDQ $-16, CX openSSETail256HashLoop: - polyAdd(0(inp)(itr2*1)) - polyMul - ADDQ $2*8, itr2 - CMPQ itr2, itr1 - JB openSSETail256HashLoop + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, R9 + CMPQ R9, CX + JB openSSETail256HashLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - LEAQ 192(inp), inp - LEAQ 192(oup), oup - SUBQ $192, inl - MOVO A3, A0 - MOVO B3, B0 - MOVO C3, C0 - MOVO tmpStore, D0 - - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + LEAQ 192(SI), SI + LEAQ 192(DI), DI + SUBQ $0xc0, BX + MOVO X12, X0 + MOVO X13, X3 + MOVO X14, X6 + MOVO 64(BP), X9 + JMP openSSETail64DecLoop + chacha20Poly1305Open_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openAVX2192 - CMPQ inl, $320 + CMPQ BX, $0x00000140 JBE openAVX2320 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, state2StoreAVX2 - VMOVDQA DD0, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, 64(BP) + VMOVDQA Y4, 192(BP) + MOVQ $0x0000000a, R9 openAVX2PreparePolyKey: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - DECQ itr2 - JNE openAVX2PreparePolyKey - - VPADDD ·chacha20Constants<>(SB), AA0, AA0 - VPADDD state1StoreAVX2, BB0, BB0 - VPADDD state2StoreAVX2, CC0, CC0 - VPADDD ctr3StoreAVX2, DD0, DD0 - - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ R9 + JNE openAVX2PreparePolyKey + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD 32(BP), Y14, Y14 + VPADDD 64(BP), Y12, Y12 + VPADDD 192(BP), Y4, Y4 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for the first 64 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 // Hash AD + first 64 bytes - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX openAVX2InitialHash64: - polyAdd(0(inp)(itr1*1)) - polyMulAVX2 - ADDQ $16, itr1 - CMPQ itr1, $64 - JNE openAVX2InitialHash64 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, CX + CMPQ CX, $0x40 + JNE openAVX2InitialHash64 // Decrypt the first 64 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), BB0, BB0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU BB0, (1*32)(oup) - LEAQ (2*32)(inp), inp - LEAQ (2*32)(oup), oup - SUBQ $64, inl + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VMOVDQU Y0, (DI) + VMOVDQU Y14, 32(DI) + LEAQ 64(SI), SI + LEAQ 64(DI), DI + SUBQ $0x40, BX openAVX2MainLoop: - CMPQ inl, $512 + CMPQ BX, $0x00000200 JB openAVX2MainLoopDone // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX openAVX2InternalLoop: - // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications - // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext - polyAdd(0*8(inp)(itr1*1)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(inp)(itr1*1)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(inp)(itr1*1)) - LEAQ (6*8)(itr1), itr1 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - CMPQ itr1, $480 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(SI)(CX*1), R10 + ADCQ 24(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(SI)(CX*1), R10 + ADCQ 40(SI)(CX*1), R11 + ADCQ $0x01, R12 + LEAQ 48(CX), CX + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + CMPQ CX, $0x000001e0 JNE openAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(480(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ 480(SI), R10 + ADCQ 488(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(496(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - LEAQ (32*16)(oup), oup - SUBQ $(32*16), inl + ADDQ 496(SI), R10 + ADCQ 504(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + LEAQ 512(DI), DI + SUBQ $0x00000200, BX JMP openAVX2MainLoop openAVX2MainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openAVX2Tail128 - CMPQ inl, $256 + CMPQ BX, $0x00000100 JBE openAVX2Tail256 - CMPQ inl, $384 + CMPQ BX, $0x00000180 JBE openAVX2Tail384 JMP openAVX2Tail512 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes openAVX2192: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 openAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE openAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 openAVX2ShortOpen: // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openAVX2ShortOpenLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 - polyAdd(2*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(SI), R10 + ADCQ 24(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP openAVX2ShortOpenLoop openAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2ShortDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes openAVX2320: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 openAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE openAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP openAVX2ShortOpen -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD1 - VMOVDQA DD1, DD0 - - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 - TESTQ itr1, itr1 - JE openAVX2Tail128LoopB + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 + VMOVDQA Y1, Y4 + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX + TESTQ CX, CX + JE openAVX2Tail128LoopB openAVX2Tail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMulAVX2 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openAVX2Tail128LoopB: - ADDQ $16, itr2 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 - JB openAVX2Tail128LoopA - CMPQ itr2, $160 - JNE openAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC1, CC1 - VPADDD DD0, DD1, DD1 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + ADDQ $0x10, R9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail128LoopA + CMPQ R9, $0xa0 + JNE openAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y13, Y13 + VPADDD Y4, Y1, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 openAVX2TailLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2Tail - SUBQ $32, inl + SUBQ $0x20, BX // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 JMP openAVX2TailLoop openAVX2Tail: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2TailDone - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2TailDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext openAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare four blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 // Compute the number of iterations that will hash data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $128, itr1 - SHRQ $4, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x80, CX + SHRQ $0x04, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 openAVX2Tail256LoopA: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX - // Perform ChaCha rounds, while hashing the remaining input openAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX JB openAVX2Tail256LoopA + CMPQ R9, $0x0a + JNE openAVX2Tail256LoopB + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX - CMPQ itr2, $10 - JNE openAVX2Tail256LoopB - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl - - // Hash the remainder of data (if any) openAVX2Tail256Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail256HashEnd - polyAdd (0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail256Hash - -// Store 128 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail256HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail256Hash + openAVX2Tail256HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - - VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 - VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) - LEAQ (4*32)(inp), inp - LEAQ (4*32)(oup), oup - SUBQ $4*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y6 + VPERM2I128 $0x02, Y12, Y4, Y10 + VPERM2I128 $0x13, Y0, Y14, Y8 + VPERM2I128 $0x13, Y12, Y4, Y2 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR (SI), Y6, Y6 + VPXOR 32(SI), Y10, Y10 + VPXOR 64(SI), Y8, Y8 + VPXOR 96(SI), Y2, Y2 + VMOVDQU Y6, (DI) + VMOVDQU Y10, 32(DI) + VMOVDQU Y8, 64(DI) + VMOVDQU Y2, 96(DI) + LEAQ 128(SI), SI + LEAQ 128(DI), DI + SUBQ $0x80, BX + JMP openAVX2TailLoop + openAVX2Tail384: // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, ctr0StoreAVX2 - VMOVDQA DD1, ctr1StoreAVX2 - VMOVDQA DD2, ctr2StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) // Compute the number of iterations that will hash two blocks of data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $256, itr1 - SHRQ $4, itr1 - ADDQ $6, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 - - // Perform ChaCha rounds, while hashing the remaining input + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x00000100, CX + SHRQ $0x04, CX + ADDQ $0x06, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + openAVX2Tail384LoopB: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX openAVX2Tail384LoopA: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - - CMPQ itr2, itr1 - JB openAVX2Tail384LoopB - - CMPQ itr2, $10 - JNE openAVX2Tail384LoopA - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + CMPQ R9, CX + JB openAVX2Tail384LoopB + CMPQ R9, $0x0a + JNE openAVX2Tail384LoopA + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX openAVX2Tail384Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail384HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail384Hash - -// Store 256 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail384HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail384Hash + openAVX2Tail384HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - LEAQ (8*32)(inp), inp - LEAQ (8*32)(oup), oup - SUBQ $8*32, inl + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openAVX2TailLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext openAVX2Tail512: - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 - MOVQ inp, itr2 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX + MOVQ SI, R9 openAVX2Tail512LoopB: - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ (2*8)(itr2), itr2 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 openAVX2Tail512LoopA: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(itr2)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(itr2)) - polyMulAVX2 - LEAQ (4*8)(itr2), itr2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - INCQ itr1 - CMPQ itr1, $4 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(R9), R10 + ADCQ 24(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(R9), R9 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + INCQ CX + CMPQ CX, $0x04 JLT openAVX2Tail512LoopB - - CMPQ itr1, $10 - JNE openAVX2Tail512LoopA - - MOVQ inl, itr1 - SUBQ $384, itr1 - ANDQ $-16, itr1 + CMPQ CX, $0x0a + JNE openAVX2Tail512LoopA + MOVQ BX, CX + SUBQ $0x00000180, CX + ANDQ $-16, CX openAVX2Tail512HashLoop: - TESTQ itr1, itr1 + TESTQ CX, CX JE openAVX2Tail512HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - SUBQ $16, itr1 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + SUBQ $0x10, CX JMP openAVX2Tail512HashLoop openAVX2Tail512HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - LEAQ (12*32)(inp), inp - LEAQ (12*32)(oup), oup - SUBQ $12*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Seal(dst, key, src, ad []byte) -TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 - // For aligned stack access + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + LEAQ 384(SI), SI + LEAQ 384(DI), DI + SUBQ $0x00000180, BX + JMP openAVX2TailLoop + +DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 +GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 + +DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff +DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc +DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff +DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff +GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 + +DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 +DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 +GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 + +DATA ·andMask<>+0(SB)/8, $0x00000000000000ff +DATA ·andMask<>+8(SB)/8, $0x0000000000000000 +DATA ·andMask<>+16(SB)/8, $0x000000000000ffff +DATA ·andMask<>+24(SB)/8, $0x0000000000000000 +DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+40(SB)/8, $0x0000000000000000 +DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+56(SB)/8, $0x0000000000000000 +DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+72(SB)/8, $0x0000000000000000 +DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+88(SB)/8, $0x0000000000000000 +DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+104(SB)/8, $0x0000000000000000 +DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+120(SB)/8, $0x0000000000000000 +DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+136(SB)/8, $0x00000000000000ff +DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+152(SB)/8, $0x000000000000ffff +DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 + +DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 +DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 + +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +DATA ·rol16<>+16(SB)/8, $0x0504070601000302 +DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a +GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 + +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b +DATA ·rol8<>+16(SB)/8, $0x0605040702010003 +DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b +GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 + +DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + +// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Seal(SB), $288-96 MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp - - CMPB ·useAVX2(SB), $1 + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Seal_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE sealSSE128 // About 15% faster + CMPQ BX, $0x80 + JBE sealSSE128 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store + MOVO X3, 32(BP) + MOVO X6, 48(BP) // Load state, increment counter blocks - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - MOVQ $10, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + MOVQ $0x0000000a, R9 sealSSEIntroLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JNE sealSSEIntroLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JNE sealSSEIntroLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore - MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 - CALL polyHashADInternal<>(SB) - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) - - MOVQ $128, itr1 - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 - - CMPQ inl, $64 - JBE sealSSE128SealHash - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) - - ADDQ $64, itr1 - SUBQ $64, inl - LEAQ 64(inp), inp - - MOVQ $2, itr1 - MOVQ $8, itr2 - - CMPQ inl, $64 - JBE sealSSETail64 - CMPQ inl, $128 - JBE sealSSETail128 - CMPQ inl, $192 - JBE sealSSETail192 + MOVQ ad_len+80(FP), R9 + CALL polyHashADInternal<>(SB) + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 64(DI) + MOVOU X5, 80(DI) + MOVOU X8, 96(DI) + MOVOU X11, 112(DI) + MOVQ $0x00000080, CX + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 + JBE sealSSE128SealHash + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 128(DI) + MOVOU X13, 144(DI) + MOVOU X14, 160(DI) + MOVOU X15, 176(DI) + ADDQ $0x40, CX + SUBQ $0x40, BX + LEAQ 64(SI), SI + MOVQ $0x00000002, CX + MOVQ $0x00000008, R9 + CMPQ BX, $0x40 + JBE sealSSETail64 + CMPQ BX, $0x80 + JBE sealSSETail128 + CMPQ BX, $0xc0 + JBE sealSSETail192 sealSSEMainLoop: // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) sealSSEInnerLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(oup)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(oup), oup - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JGE sealSSEInnerLoop - polyAdd(0(oup)) - polyMul - LEAQ (2*8)(oup), oup - DECQ itr1 - JG sealSSEInnerLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(DI), DI + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JGE sealSSEInnerLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSEInnerLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVO tmpStore, D3 - - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - ADDQ $192, inp - MOVQ $192, itr1 - SUBQ $192, inl - MOVO A3, A1 - MOVO B3, B1 - MOVO C3, C1 - MOVO D3, D1 - CMPQ inl, $64 + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVO 64(BP), X15 + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + ADDQ $0xc0, SI + MOVQ $0x000000c0, CX + SUBQ $0xc0, BX + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 JBE sealSSE128SealHash - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) - LEAQ 64(inp), inp - SUBQ $64, inl - MOVQ $6, itr1 - MOVQ $4, itr2 - CMPQ inl, $192 + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + LEAQ 64(SI), SI + SUBQ $0x40, BX + MOVQ $0x00000006, CX + MOVQ $0x00000004, R9 + CMPQ BX, $0xc0 JG sealSSEMainLoop - - MOVQ inl, itr1 - TESTQ inl, inl + MOVQ BX, CX + TESTQ BX, BX JE sealSSE128SealHash - MOVQ $6, itr1 - CMPQ inl, $64 + MOVQ $0x00000006, CX + CMPQ BX, $0x40 JBE sealSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE sealSSETail128 JMP sealSSETail192 -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of plaintext sealSSETail64: - // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A1 - MOVO state1Store, B1 - MOVO state2Store, C1 - MOVO ctr3Store, D1 - PADDL ·sseIncMask<>(SB), D1 - MOVO D1, ctr0Store + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) sealSSETail64LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail64LoopB: - chachaQR(A1, B1, C1, D1, T1) - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A1, B1, C1, D1, T1) - shiftB1Right; shiftC1Right; shiftD1Right - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - DECQ itr1 - JG sealSSETail64LoopA - - DECQ itr2 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSETail64LoopA + DECQ R9 JGE sealSSETail64LoopB - PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B1 - PADDL state2Store, C1 - PADDL ctr0Store, D1 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X4 + PADDL 48(BP), X7 + PADDL 80(BP), X10 + JMP sealSSE128Seal - JMP sealSSE128Seal - -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of plaintext sealSSETail128: - // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) sealSSETail128LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail128LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - DECQ itr1 - JG sealSSETail128LoopA - - DECQ itr2 - JGE sealSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr0Store, D0; PADDL ctr1Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - - MOVQ $64, itr1 - LEAQ 64(inp), inp - SUBQ $64, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of plaintext + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + DECQ CX + JG sealSSETail128LoopA + DECQ R9 + JGE sealSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVQ $0x00000040, CX + LEAQ 64(SI), SI + SUBQ $0x40, BX + JMP sealSSE128SealHash + sealSSETail192: - // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 112(BP) sealSSETail192LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail192LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - DECQ itr1 - JG sealSSETail192LoopA - - DECQ itr2 - JGE sealSSETail192LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - MOVO A2, A1 - MOVO B2, B1 - MOVO C2, C1 - MOVO D2, D1 - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special seal optimization for buffers smaller than 129 bytes + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ CX + JG sealSSETail192LoopA + DECQ R9 + JGE sealSSETail192LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + PADDL 112(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + JMP sealSSE128SealHash + sealSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 sealSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE sealSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE sealSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore - MOVOU B0, sStore + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealSSE128SealHash: - // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealSSE128Seal - polyAdd(0(oup)) - polyMul - - SUBQ $16, itr1 - ADDQ $16, oup - - JMP sealSSE128SealHash + CMPQ CX, $0x10 + JB sealSSE128Seal + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealSSE128SealHash sealSSE128Seal: - CMPQ inl, $16 + CMPQ BX, $0x10 JB sealSSETail - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - MOVOU (inp), T0 - PXOR T0, A1 - MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI // Extract for hashing - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP sealSSE128Seal sealSSETail: - TESTQ inl, inl + TESTQ BX, BX JE sealSSEFinalize // We can only load the PT one byte at a time to avoid read after end of buffer - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVQ inl, itr1 - LEAQ -1(inp)(inl*1), inp - XORQ t2, t2 - XORQ t3, t3 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVQ BX, CX + LEAQ -1(SI)(BX*1), SI + XORQ R15, R15 + XORQ R8, R8 XORQ AX, AX sealSSETailLoadLoop: - SHLQ $8, t2, t3 - SHLQ $8, t2 - MOVB (inp), AX - XORQ AX, t2 - LEAQ -1(inp), inp - DECQ itr1 + SHLQ $0x08, R15, R8 + SHLQ $0x08, R15 + MOVB (SI), AX + XORQ AX, R15 + LEAQ -1(SI), SI + DECQ CX JNE sealSSETailLoadLoop - MOVQ t2, 0+tmpStore - MOVQ t3, 8+tmpStore - PXOR 0+tmpStore, A1 - MOVOU A1, (oup) - MOVOU -16(t0)(itr2*1), T0 - PAND T0, A1 - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - ADDQ inl, oup + MOVQ R15, 64(BP) + MOVQ R8, 72(BP) + PXOR 64(BP), X1 + MOVOU X1, (DI) + MOVOU -16(R13)(R9*1), X12 + PAND X12, X1 + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ BX, DI sealSSEFinalize: // Hash in the buffer lengths - ADDQ ad_len+80(FP), acc0 - ADCQ src_len+56(FP), acc1 - ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally store the tag at the end of the message - MOVQ acc0, (0*8)(oup) - MOVQ acc1, (1*8)(oup) + MOVQ R10, (DI) + MOVQ R11, 8(DI) RET -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- chacha20Poly1305Seal_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers - CMPQ inl, $192 - JBE seal192AVX2 // 33% faster - CMPQ inl, $320 - JBE seal320AVX2 // 17% faster + CMPQ BX, $0x000000c0 + JBE seal192AVX2 + CMPQ BX, $0x00000140 + JBE seal320AVX2 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 - VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA Y12, 64(BP) + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, 96(BP) + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y1, 128(BP) + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, R9 sealAVX2IntroLoop: - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr2 - JNE sealAVX2IntroLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - - VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 - VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key - VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ R9 + JNE sealAVX2IntroLoop + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPERM2I128 $0x02, Y0, Y14, Y4 + VPERM2I128 $0x13, Y0, Y14, Y0 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), DD0, DD0 - VMOVDQA DD0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, (BP) // Hash AD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) // Can store at least 320 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), CC0, CC0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU CC0, (1*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 - VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 - VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) - - MOVQ $320, itr1 - SUBQ $320, inl - LEAQ 320(inp), inp - - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 - CMPQ inl, $128 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y12, Y12 + VMOVDQU Y0, (DI) + VMOVDQU Y12, 32(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 64(SI), Y0, Y0 + VPXOR 96(SI), Y14, Y14 + VPXOR 128(SI), Y12, Y12 + VPXOR 160(SI), Y4, Y4 + VMOVDQU Y0, 64(DI) + VMOVDQU Y14, 96(DI) + VMOVDQU Y12, 128(DI) + VMOVDQU Y4, 160(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 192(SI), Y0, Y0 + VPXOR 224(SI), Y14, Y14 + VPXOR 256(SI), Y12, Y12 + VPXOR 288(SI), Y4, Y4 + VMOVDQU Y0, 192(DI) + VMOVDQU Y14, 224(DI) + VMOVDQU Y12, 256(DI) + VMOVDQU Y4, 288(DI) + MOVQ $0x00000140, CX + SUBQ $0x00000140, BX + LEAQ 320(SI), SI + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, Y15, Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, Y15, Y3, Y4 + CMPQ BX, $0x80 JBE sealAVX2SealHash - - VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 - VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVQ $8, itr1 - MOVQ $2, itr2 - - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - CMPQ inl, $512 - JBE sealAVX2Tail512 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VPXOR 64(SI), Y12, Y12 + VPXOR 96(SI), Y4, Y4 + VMOVDQU Y0, 320(DI) + VMOVDQU Y14, 352(DI) + VMOVDQU Y12, 384(DI) + VMOVDQU Y4, 416(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVQ $0x00000008, CX + MOVQ $0x00000002, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + CMPQ BX, $0x00000200 + JBE sealAVX2Tail512 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - - SUBQ $16, oup // Adjust the pointer - MOVQ $9, itr1 - JMP sealAVX2InternalLoopStart + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + SUBQ $0x10, DI + MOVQ $0x00000009, CX + JMP sealAVX2InternalLoopStart sealAVX2MainLoop: - // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, CX sealAVX2InternalLoop: - polyAdd(0*8(oup)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 sealAVX2InternalLoopStart: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(oup)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(oup)) - LEAQ (6*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr1 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(DI), R10 + ADCQ 40(DI), R11 + ADCQ $0x01, R12 + LEAQ 48(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX JNE sealAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(-2*8(oup)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - SUBQ $(32*16), inl - CMPQ inl, $512 + ADDQ -16(DI), R10 + ADCQ -8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + SUBQ $0x00000200, BX + CMPQ BX, $0x00000200 JG sealAVX2MainLoop // Tail can only hash 480 bytes - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ 32(oup), oup - - MOVQ $10, itr1 - MOVQ $0, itr2 - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - JMP sealAVX2Tail512 - -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + MOVQ $0x0000000a, CX + MOVQ $0x00000000, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + seal192AVX2: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 sealAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE sealAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 sealAVX2ShortSeal: // Hash aad - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealAVX2SealHash: // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealAVX2ShortSealLoop - polyAdd(0(oup)) - polyMul - SUBQ $16, itr1 - ADDQ $16, oup - JMP sealAVX2SealHash + CMPQ CX, $0x10 + JB sealAVX2ShortSealLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealAVX2SealHash sealAVX2ShortSealLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB sealAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for encryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI // Now can hash - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (1*32)(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP sealAVX2ShortSealLoop sealAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB sealAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for encryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI // Hash - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 sealAVX2ShortDone: VZEROUPPER JMP sealSSETail -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes seal320AVX2: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 sealAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE sealAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP sealAVX2ShortSeal -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext sealAVX2Tail128: - // Need to decrypt up to 128 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0 - VMOVDQA state1StoreAVX2, BB0 - VMOVDQA state2StoreAVX2, CC0 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VMOVDQA DD0, DD1 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA 32(BP), Y14 + VMOVDQA 64(BP), Y12 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, Y1 sealAVX2Tail128LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail128LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $4, DD0, DD0, DD0 - DECQ itr1 - JG sealAVX2Tail128LoopA - DECQ itr2 - JGE sealAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA1 - VPADDD state1StoreAVX2, BB0, BB1 - VPADDD state2StoreAVX2, CC0, CC1 - VPADDD DD1, DD0, DD1 - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ CX + JG sealAVX2Tail128LoopA + DECQ R9 + JGE sealAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y5 + VPADDD 32(BP), Y14, Y9 + VPADDD 64(BP), Y12, Y13 + VPADDD Y1, Y4, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 JMP sealAVX2ShortSealLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext sealAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 sealAVX2Tail256LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr1 - JG sealAVX2Tail256LoopA - DECQ itr2 - JGE sealAVX2Tail256LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ CX + JG sealAVX2Tail256LoopA + DECQ R9 + JGE sealAVX2Tail256LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + JMP sealAVX2SealHash + sealAVX2Tail384: - // Need to decrypt up to 384 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 + VMOVDQA Y2, Y15 sealAVX2Tail384LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail384LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr1 - JG sealAVX2Tail384LoopA - DECQ itr2 - JGE sealAVX2Tail384LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0 - VPERM2I128 $0x02, CC1, DD1, TT1 - VPERM2I128 $0x13, AA1, BB1, TT2 - VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - MOVQ $256, itr1 - LEAQ 256(inp), inp - SUBQ $256, inl - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ CX + JG sealAVX2Tail384LoopA + DECQ R9 + JGE sealAVX2Tail384LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPADDD Y15, Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + MOVQ $0x00000100, CX + LEAQ 256(SI), SI + SUBQ $0x00000100, BX + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + JMP sealAVX2SealHash + sealAVX2Tail512: - // Need to decrypt up to 512 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) sealAVX2Tail512LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail512LoopB: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(oup)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - - DECQ itr1 - JG sealAVX2Tail512LoopA - DECQ itr2 - JGE sealAVX2Tail512LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3 - VPXOR (0*32)(inp), CC3, CC3 - VMOVDQU CC3, (0*32)(oup) - VPERM2I128 $0x02, CC0, DD0, CC3 - VPXOR (1*32)(inp), CC3, CC3 - VMOVDQU CC3, (1*32)(oup) - VPERM2I128 $0x13, AA0, BB0, CC3 - VPXOR (2*32)(inp), CC3, CC3 - VMOVDQU CC3, (2*32)(oup) - VPERM2I128 $0x13, CC0, DD0, CC3 - VPXOR (3*32)(inp), CC3, CC3 - VMOVDQU CC3, (3*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - - MOVQ $384, itr1 - LEAQ 384(inp), inp - SUBQ $384, inl - VPERM2I128 $0x02, AA3, BB3, AA0 - VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 - VPERM2I128 $0x13, AA3, BB3, CC0 - VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - JMP sealAVX2SealHash + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX + JG sealAVX2Tail512LoopA + DECQ R9 + JGE sealAVX2Tail512LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPXOR (SI), Y15, Y15 + VMOVDQU Y15, (DI) + VPERM2I128 $0x02, Y12, Y4, Y15 + VPXOR 32(SI), Y15, Y15 + VMOVDQU Y15, 32(DI) + VPERM2I128 $0x13, Y0, Y14, Y15 + VPXOR 64(SI), Y15, Y15 + VMOVDQU Y15, 64(DI) + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + MOVQ $0x00000180, CX + LEAQ 384(SI), SI + SUBQ $0x00000180, BX + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + JMP sealAVX2SealHash From bcb0f91bbceb3486cc7f10102ff046661fb4d364 Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Sun, 21 Jul 2024 14:23:57 -0400 Subject: [PATCH 38/57] internal/poly1305: Port sum_amd64.s to Avo This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="internal/poly1305/sum_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: I80212c95d1b05335d7f6b73a3030b6f812f6105b Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600035 Reviewed-by: Roland Shoemaker Reviewed-by: Filippo Valsorda Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI --- internal/poly1305/_asm/go.mod | 15 +++ internal/poly1305/_asm/go.sum | 12 +++ internal/poly1305/_asm/sum_amd64_asm.go | 126 ++++++++++++++++++++++ internal/poly1305/sum_amd64.s | 133 +++++++++++------------- 4 files changed, 212 insertions(+), 74 deletions(-) create mode 100644 internal/poly1305/_asm/go.mod create mode 100644 internal/poly1305/_asm/go.sum create mode 100644 internal/poly1305/_asm/sum_amd64_asm.go diff --git a/internal/poly1305/_asm/go.mod b/internal/poly1305/_asm/go.mod new file mode 100644 index 0000000000..47f2b758ef --- /dev/null +++ b/internal/poly1305/_asm/go.mod @@ -0,0 +1,15 @@ +module internal/poly1305/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/internal/poly1305/_asm/go.sum b/internal/poly1305/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/internal/poly1305/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/internal/poly1305/_asm/sum_amd64_asm.go b/internal/poly1305/_asm/sum_amd64_asm.go new file mode 100644 index 0000000000..a445c68f01 --- /dev/null +++ b/internal/poly1305/_asm/sum_amd64_asm.go @@ -0,0 +1,126 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/sha3" +) + +//go:generate go run . -out ../sum_amd64.s -pkg poly1305 + +func main() { + Package("golang.org/x/crypto/internal/poly1305") + ConstraintExpr("gc,!purego") + update() + Generate() +} + +func update() { + Implement("update") + + Load(Param("state"), RDI) + MOVQ(NewParamAddr("msg_base", 8), RSI) + MOVQ(NewParamAddr("msg_len", 16), R15) + + MOVQ(Mem{Base: DI}.Offset(0), R8) // h0 + MOVQ(Mem{Base: DI}.Offset(8), R9) // h1 + MOVQ(Mem{Base: DI}.Offset(16), R10) // h2 + MOVQ(Mem{Base: DI}.Offset(24), R11) // r0 + MOVQ(Mem{Base: DI}.Offset(32), R12) // r1 + + CMPQ(R15, Imm(16)) + JB(LabelRef("bytes_between_0_and_15")) + + Label("loop") + POLY1305_ADD(RSI, R8, R9, R10) + + Label("multiply") + POLY1305_MUL(R8, R9, R10, R11, R12, RBX, RCX, R13, R14) + SUBQ(Imm(16), R15) + CMPQ(R15, Imm(16)) + JAE(LabelRef("loop")) + + Label("bytes_between_0_and_15") + TESTQ(R15, R15) + JZ(LabelRef("done")) + MOVQ(U32(1), RBX) + XORQ(RCX, RCX) + XORQ(R13, R13) + ADDQ(R15, RSI) + + Label("flush_buffer") + SHLQ(Imm(8), RBX, RCX) + SHLQ(Imm(8), RBX) + MOVB(Mem{Base: SI}.Offset(-1), R13B) + XORQ(R13, RBX) + DECQ(RSI) + DECQ(R15) + JNZ(LabelRef("flush_buffer")) + + ADDQ(RBX, R8) + ADCQ(RCX, R9) + ADCQ(Imm(0), R10) + MOVQ(U32(16), R15) + JMP(LabelRef("multiply")) + + Label("done") + MOVQ(R8, Mem{Base: DI}.Offset(0)) + MOVQ(R9, Mem{Base: DI}.Offset(8)) + MOVQ(R10, Mem{Base: DI}.Offset(16)) + RET() +} + +func POLY1305_ADD(msg, h0, h1, h2 GPPhysical) { + ADDQ(Mem{Base: msg}.Offset(0), h0) + ADCQ(Mem{Base: msg}.Offset(8), h1) + ADCQ(Imm(1), h2) + LEAQ(Mem{Base: msg}.Offset(16), msg) +} + +func POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3 GPPhysical) { + MOVQ(r0, RAX) + MULQ(h0) + MOVQ(RAX, t0) + MOVQ(RDX, t1) + MOVQ(r0, RAX) + MULQ(h1) + ADDQ(RAX, t1) + ADCQ(Imm(0), RDX) + MOVQ(r0, t2) + IMULQ(h2, t2) + ADDQ(RDX, t2) + + MOVQ(r1, RAX) + MULQ(h0) + ADDQ(RAX, t1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, h0) + MOVQ(r1, t3) + IMULQ(h2, t3) + MOVQ(r1, RAX) + MULQ(h1) + ADDQ(RAX, t2) + ADCQ(RDX, t3) + ADDQ(h0, t2) + ADCQ(Imm(0), t3) + + MOVQ(t0, h0) + MOVQ(t1, h1) + MOVQ(t2, h2) + ANDQ(Imm(3), h2) + MOVQ(t2, t0) + ANDQ(I32(-4), t0) + ADDQ(t0, h0) + ADCQ(t3, h1) + ADCQ(Imm(0), h2) + SHRQ(Imm(2), t3, t2) + SHRQ(Imm(2), t3) + ADDQ(t2, h0) + ADCQ(t3, h1) + ADCQ(Imm(0), h2) +} diff --git a/internal/poly1305/sum_amd64.s b/internal/poly1305/sum_amd64.s index e0d3c64756..133757384b 100644 --- a/internal/poly1305/sum_amd64.s +++ b/internal/poly1305/sum_amd64.s @@ -1,108 +1,93 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT. //go:build gc && !purego -#include "textflag.h" - -#define POLY1305_ADD(msg, h0, h1, h2) \ - ADDQ 0(msg), h0; \ - ADCQ 8(msg), h1; \ - ADCQ $1, h2; \ - LEAQ 16(msg), msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ - MOVQ r0, AX; \ - MULQ h0; \ - MOVQ AX, t0; \ - MOVQ DX, t1; \ - MOVQ r0, AX; \ - MULQ h1; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ r0, t2; \ - IMULQ h2, t2; \ - ADDQ DX, t2; \ - \ - MOVQ r1, AX; \ - MULQ h0; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ DX, h0; \ - MOVQ r1, t3; \ - IMULQ h2, t3; \ - MOVQ r1, AX; \ - MULQ h1; \ - ADDQ AX, t2; \ - ADCQ DX, t3; \ - ADDQ h0, t2; \ - ADCQ $0, t3; \ - \ - MOVQ t0, h0; \ - MOVQ t1, h1; \ - MOVQ t2, h2; \ - ANDQ $3, h2; \ - MOVQ t2, t0; \ - ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ - ADDQ t0, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2; \ - SHRQ $2, t3, t2; \ - SHRQ $2, t3; \ - ADDQ t2, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2 - -// func update(state *[7]uint64, msg []byte) +// func update(state *macState, msg []byte) TEXT ·update(SB), $0-32 MOVQ state+0(FP), DI MOVQ msg_base+8(FP), SI MOVQ msg_len+16(FP), R15 - - MOVQ 0(DI), R8 // h0 - MOVQ 8(DI), R9 // h1 - MOVQ 16(DI), R10 // h2 - MOVQ 24(DI), R11 // r0 - MOVQ 32(DI), R12 // r1 - - CMPQ R15, $16 + MOVQ (DI), R8 + MOVQ 8(DI), R9 + MOVQ 16(DI), R10 + MOVQ 24(DI), R11 + MOVQ 32(DI), R12 + CMPQ R15, $0x10 JB bytes_between_0_and_15 loop: - POLY1305_ADD(SI, R8, R9, R10) + ADDQ (SI), R8 + ADCQ 8(SI), R9 + ADCQ $0x01, R10 + LEAQ 16(SI), SI multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) - SUBQ $16, R15 - CMPQ R15, $16 - JAE loop + MOVQ R11, AX + MULQ R8 + MOVQ AX, BX + MOVQ DX, CX + MOVQ R11, AX + MULQ R9 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ R11, R13 + IMULQ R10, R13 + ADDQ DX, R13 + MOVQ R12, AX + MULQ R8 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ DX, R8 + MOVQ R12, R14 + IMULQ R10, R14 + MOVQ R12, AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R14 + ADDQ R8, R13 + ADCQ $0x00, R14 + MOVQ BX, R8 + MOVQ CX, R9 + MOVQ R13, R10 + ANDQ $0x03, R10 + MOVQ R13, BX + ANDQ $-4, BX + ADDQ BX, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SHRQ $0x02, R14, R13 + SHRQ $0x02, R14 + ADDQ R13, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SUBQ $0x10, R15 + CMPQ R15, $0x10 + JAE loop bytes_between_0_and_15: TESTQ R15, R15 JZ done - MOVQ $1, BX + MOVQ $0x00000001, BX XORQ CX, CX XORQ R13, R13 ADDQ R15, SI flush_buffer: - SHLQ $8, BX, CX - SHLQ $8, BX + SHLQ $0x08, BX, CX + SHLQ $0x08, BX MOVB -1(SI), R13 XORQ R13, BX DECQ SI DECQ R15 JNZ flush_buffer - ADDQ BX, R8 ADCQ CX, R9 - ADCQ $0, R10 - MOVQ $16, R15 + ADCQ $0x00, R10 + MOVQ $0x00000010, R15 JMP multiply done: - MOVQ R8, 0(DI) + MOVQ R8, (DI) MOVQ R9, 8(DI) MOVQ R10, 16(DI) RET From b35ab4fde0e27d900fc800ae12370c858b58ba41 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Wed, 4 Sep 2024 15:05:55 +0000 Subject: [PATCH 39/57] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I94bb1c6a4bb08aff8c146e84a9d4b3e353f098c2 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/610638 LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Michael Pratt Auto-Submit: Gopher Robot --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index e206587663..d3527d40d7 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.23.0 - golang.org/x/term v0.23.0 + golang.org/x/sys v0.25.0 + golang.org/x/term v0.24.0 ) -require golang.org/x/text v0.17.0 // indirect +require golang.org/x/text v0.18.0 // indirect diff --git a/go.sum b/go.sum index c7231381e6..b347167687 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= -golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= -golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= -golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= -golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM= +golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= From c9da6b9a4008902aae7c754e8f01d42e2d2cf205 Mon Sep 17 00:00:00 2001 From: Dmitri Shuralyov Date: Wed, 4 Sep 2024 17:15:43 -0400 Subject: [PATCH 40/57] all: fix printf(var) mistakes detected by latest printf checker These were problematic but previously easy to miss. They're now easy to spot thanks to build failures at Go tip as of CL 610736. For golang/go#68796. Change-Id: I167f2cce2376b4070460389c673d973e4521d3dc Reviewed-on: https://go-review.googlesource.com/c/crypto/+/610797 Auto-Submit: Dmitri Shuralyov Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Reviewed-by: Alan Donovan --- acme/autocert/internal/acmetest/ca.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/acme/autocert/internal/acmetest/ca.go b/acme/autocert/internal/acmetest/ca.go index 0a5ebe7ab7..504a9a0e07 100644 --- a/acme/autocert/internal/acmetest/ca.go +++ b/acme/autocert/internal/acmetest/ca.go @@ -308,7 +308,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { } if err := decodePayload(&req, r.Body); err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } @@ -328,7 +328,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { Identifiers []struct{ Value string } } if err := decodePayload(&req, r.Body); err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } ca.mu.Lock() @@ -352,7 +352,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { defer ca.mu.Unlock() o, err := ca.storedOrder(strings.TrimPrefix(r.URL.Path, "/orders/")) if err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } if err := json.NewEncoder(w).Encode(o); err != nil { @@ -412,7 +412,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { orderID := strings.TrimPrefix(r.URL.Path, "/new-cert/") o, err := ca.storedOrder(orderID) if err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } if o.Status != acme.StatusReady { @@ -427,7 +427,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { b, _ := base64.RawURLEncoding.DecodeString(req.CSR) csr, err := x509.ParseCertificateRequest(b) if err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } // Issue the certificate. @@ -449,7 +449,7 @@ func (ca *CAServer) handle(w http.ResponseWriter, r *http.Request) { defer ca.mu.Unlock() o, err := ca.storedOrder(strings.TrimPrefix(r.URL.Path, "/issued-cert/")) if err != nil { - ca.httpErrorf(w, http.StatusBadRequest, err.Error()) + ca.httpErrorf(w, http.StatusBadRequest, "%v", err) return } if o.Status != acme.StatusValid { From 9e92970a1eb41e446822e037016aa89d24c0ce7a Mon Sep 17 00:00:00 2001 From: cuishuang Date: Mon, 9 Sep 2024 14:22:39 +0800 Subject: [PATCH 41/57] bn256: add missing symbols in comment Change-Id: Ibd48a070bd8ce35ef5795a8b73bc4ecac43a993e Reviewed-on: https://go-review.googlesource.com/c/crypto/+/611735 Run-TryBot: shuang cui Commit-Queue: Ian Lance Taylor LUCI-TryBot-Result: Go LUCI Auto-Submit: Ian Lance Taylor Reviewed-by: Roland Shoemaker TryBot-Result: Gopher Robot Reviewed-by: Ian Lance Taylor --- bn256/gfp12.go | 2 +- bn256/gfp2.go | 2 +- bn256/gfp6.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bn256/gfp12.go b/bn256/gfp12.go index 2b0151ebcc..b05a8b727f 100644 --- a/bn256/gfp12.go +++ b/bn256/gfp12.go @@ -5,7 +5,7 @@ package bn256 // For details of the algorithms used, see "Multiplication and Squaring on -// Pairing-Friendly Fields, Devegili et al. +// Pairing-Friendly Fields", Devegili et al. // http://eprint.iacr.org/2006/471.pdf. import ( diff --git a/bn256/gfp2.go b/bn256/gfp2.go index 97f3f1f3fa..aa39a3043b 100644 --- a/bn256/gfp2.go +++ b/bn256/gfp2.go @@ -5,7 +5,7 @@ package bn256 // For details of the algorithms used, see "Multiplication and Squaring on -// Pairing-Friendly Fields, Devegili et al. +// Pairing-Friendly Fields", Devegili et al. // http://eprint.iacr.org/2006/471.pdf. import ( diff --git a/bn256/gfp6.go b/bn256/gfp6.go index f98ae782cc..7dec5eabd6 100644 --- a/bn256/gfp6.go +++ b/bn256/gfp6.go @@ -5,7 +5,7 @@ package bn256 // For details of the algorithms used, see "Multiplication and Squaring on -// Pairing-Friendly Fields, Devegili et al. +// Pairing-Friendly Fields", Devegili et al. // http://eprint.iacr.org/2006/471.pdf. import ( From 42ee18b963777d907bbef3e59665cf80968d57e6 Mon Sep 17 00:00:00 2001 From: Nicola Murino Date: Sun, 25 Feb 2024 16:26:56 +0100 Subject: [PATCH 42/57] ssh: return ServerAuthError after too many auth failures if a client is disconnected due to too many authentication attempts we should return a ServerAuthError instead of a generic error. Some users check the error returned by NewServerConn to determine whether or not a client attempted to authenticate. Fixes golang/go#69191 Change-Id: If68fcecdefd6c810fe9df8256b1216e320d8a916 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/566398 Reviewed-by: Filippo Valsorda Reviewed-by: Tim King Auto-Submit: Nicola Murino LUCI-TryBot-Result: Go LUCI Reviewed-by: Carlos Amedee --- ssh/client_auth_test.go | 25 ++++++++++++++++++------- ssh/server.go | 4 ++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/ssh/client_auth_test.go b/ssh/client_auth_test.go index bf0aa1fe23..e981cc49a6 100644 --- a/ssh/client_auth_test.go +++ b/ssh/client_auth_test.go @@ -641,17 +641,28 @@ func TestClientAuthMaxAuthTries(t *testing.T) { defer c1.Close() defer c2.Close() - go newServer(c1, serverConfig) - _, _, _, err = NewClientConn(c2, "", clientConfig) - if tries > 2 { - if err == nil { + errCh := make(chan error, 1) + + go func() { + _, err := newServer(c1, serverConfig) + errCh <- err + }() + _, _, _, cliErr := NewClientConn(c2, "", clientConfig) + srvErr := <-errCh + + if tries > serverConfig.MaxAuthTries { + if cliErr == nil { t.Fatalf("client: got no error, want %s", expectedErr) - } else if err.Error() != expectedErr.Error() { + } else if cliErr.Error() != expectedErr.Error() { t.Fatalf("client: got %s, want %s", err, expectedErr) } + var authErr *ServerAuthError + if !errors.As(srvErr, &authErr) { + t.Errorf("expected ServerAuthError, got: %v", srvErr) + } } else { - if err != nil { - t.Fatalf("client: got %s, want no error", err) + if cliErr != nil { + t.Fatalf("client: got %s, want no error", cliErr) } } } diff --git a/ssh/server.go b/ssh/server.go index 3ca9e89e22..c0d1c29e6f 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -510,8 +510,8 @@ userAuthLoop: if err := s.transport.writePacket(Marshal(discMsg)); err != nil { return nil, err } - - return nil, discMsg + authErrs = append(authErrs, discMsg) + return nil, &ServerAuthError{Errors: authErrs} } var userAuthReq userAuthRequestMsg From a0819fbb0244af70857f03b6984e1d4f93e6cabf Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Mon, 11 Mar 2024 23:44:23 +0000 Subject: [PATCH 43/57] sha3: fix cSHAKE initialization for extremely large N and or S While both impractical and unlikely, the multiplication could overflow on 32-bit architectures. The 64-bit architecture case is unaffected by both the maximum length of Go slices being too small to trigger the overflow (everything except s390), and it being safe to assume no machine has more than 2 EiB of memory. Fixes golang/go#66232 Change-Id: I19c15d42d2d6af35e296697159d43d02f513e614 GitHub-Last-Rev: 503e180debfdc93ab99977172af2b64290cb80e8 GitHub-Pull-Request: golang/crypto#286 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/570876 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase Reviewed-by: Filippo Valsorda Auto-Submit: Filippo Valsorda Reviewed-by: Michael Knyszek --- sha3/shake.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sha3/shake.go b/sha3/shake.go index 1ea9275b8b..a01ef43577 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -85,9 +85,9 @@ func newCShake(N, S []byte, rate, outputLen int, dsbyte byte) ShakeHash { // leftEncode returns max 9 bytes c.initBlock = make([]byte, 0, 9*2+len(N)+len(S)) - c.initBlock = append(c.initBlock, leftEncode(uint64(len(N)*8))...) + c.initBlock = append(c.initBlock, leftEncode(uint64(len(N))*8)...) c.initBlock = append(c.initBlock, N...) - c.initBlock = append(c.initBlock, leftEncode(uint64(len(S)*8))...) + c.initBlock = append(c.initBlock, leftEncode(uint64(len(S))*8)...) c.initBlock = append(c.initBlock, S...) c.Write(bytepad(c.initBlock, c.rate)) return &c From adef4cc1a8c2ca4da1b1f4e6c976b59ca22dbfb8 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Fri, 4 Oct 2024 15:23:27 +0000 Subject: [PATCH 44/57] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: Id321d3b5909ecb66c0311ba86008509c7895863b Reviewed-on: https://go-review.googlesource.com/c/crypto/+/617958 Auto-Submit: Gopher Robot Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index d3527d40d7..ecf61b3e18 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.25.0 - golang.org/x/term v0.24.0 + golang.org/x/sys v0.26.0 + golang.org/x/term v0.25.0 ) -require golang.org/x/text v0.18.0 // indirect +require golang.org/x/text v0.19.0 // indirect diff --git a/go.sum b/go.sum index b347167687..5b12ab8eee 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= -golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM= -golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8= -golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= -golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= From 6c2174895805a9c4273cf0052c9ff61ba3e75812 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Wed, 18 Sep 2024 20:36:04 +0000 Subject: [PATCH 45/57] internal/poly1305: extend ppc64le support to ppc64 The cipher needs to load the stream in LE order. Use the byte reversing loads on BE. Also, remove the unused variable poly1305Mask in the PPC64 asm file too. Change-Id: Ie90fe7bb0ea7a3bcb76583e0cf9c1e4133499541 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/614298 Reviewed-by: Michael Knyszek Reviewed-by: Archana Ravindar LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase --- internal/poly1305/mac_noasm.go | 2 +- .../{sum_ppc64le.go => sum_ppc64x.go} | 2 +- .../poly1305/{sum_ppc64le.s => sum_ppc64x.s} | 30 ++++++++++++------- 3 files changed, 21 insertions(+), 13 deletions(-) rename internal/poly1305/{sum_ppc64le.go => sum_ppc64x.go} (95%) rename internal/poly1305/{sum_ppc64le.s => sum_ppc64x.s} (89%) diff --git a/internal/poly1305/mac_noasm.go b/internal/poly1305/mac_noasm.go index 333da285b3..bd896bdc76 100644 --- a/internal/poly1305/mac_noasm.go +++ b/internal/poly1305/mac_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego +//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego package poly1305 diff --git a/internal/poly1305/sum_ppc64le.go b/internal/poly1305/sum_ppc64x.go similarity index 95% rename from internal/poly1305/sum_ppc64le.go rename to internal/poly1305/sum_ppc64x.go index 4aec4874b5..1a1679aaad 100644 --- a/internal/poly1305/sum_ppc64le.go +++ b/internal/poly1305/sum_ppc64x.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && (ppc64 || ppc64le) package poly1305 diff --git a/internal/poly1305/sum_ppc64le.s b/internal/poly1305/sum_ppc64x.s similarity index 89% rename from internal/poly1305/sum_ppc64le.s rename to internal/poly1305/sum_ppc64x.s index b3c1699bff..6899a1dabc 100644 --- a/internal/poly1305/sum_ppc64le.s +++ b/internal/poly1305/sum_ppc64x.s @@ -2,15 +2,25 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && (ppc64 || ppc64le) #include "textflag.h" // This was ported from the amd64 implementation. +#ifdef GOARCH_ppc64le +#define LE_MOVD MOVD +#define LE_MOVWZ MOVWZ +#define LE_MOVHZ MOVHZ +#else +#define LE_MOVD MOVDBR +#define LE_MOVWZ MOVWBR +#define LE_MOVHZ MOVHBR +#endif + #define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \ - MOVD (msg), t0; \ - MOVD 8(msg), t1; \ + LE_MOVD (msg)( R0), t0; \ + LE_MOVD (msg)(R24), t1; \ MOVD $1, t2; \ ADDC t0, h0, h0; \ ADDE t1, h1, h1; \ @@ -50,10 +60,6 @@ ADDE t3, h1, h1; \ ADDZE h2 -DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -GLOBL ·poly1305Mask<>(SB), RODATA, $16 - // func update(state *[7]uint64, msg []byte) TEXT ·update(SB), $0-32 MOVD state+0(FP), R3 @@ -66,6 +72,8 @@ TEXT ·update(SB), $0-32 MOVD 24(R3), R11 // r0 MOVD 32(R3), R12 // r1 + MOVD $8, R24 + CMP R5, $16 BLT bytes_between_0_and_15 @@ -94,7 +102,7 @@ flush_buffer: // Greater than 8 -- load the rightmost remaining bytes in msg // and put into R17 (h1) - MOVD (R4)(R21), R17 + LE_MOVD (R4)(R21), R17 MOVD $16, R22 // Find the offset to those bytes @@ -118,7 +126,7 @@ just1: BLT less8 // Exactly 8 - MOVD (R4), R16 + LE_MOVD (R4), R16 CMP R17, $0 @@ -133,7 +141,7 @@ less8: MOVD $0, R22 // shift count CMP R5, $4 BLT less4 - MOVWZ (R4), R16 + LE_MOVWZ (R4), R16 ADD $4, R4 ADD $-4, R5 MOVD $32, R22 @@ -141,7 +149,7 @@ less8: less4: CMP R5, $2 BLT less2 - MOVHZ (R4), R21 + LE_MOVHZ (R4), R21 SLD R22, R21, R21 OR R16, R21, R16 ADD $16, R22 From b61b08db44b86a0fb8510036a4655fc4a3d37cd3 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Fri, 13 Sep 2024 19:58:45 +0000 Subject: [PATCH 46/57] chacha20: extend ppc64le support to ppc64 This requires fixing an incorrect save of the counter. It is a word value. It happens to work on LE because length is limited to u32. Refactor the constant table to load correctly independent of byte ordering. Add byte order swapping where output needs converted to LE ordering for storage. Change-Id: Ic7e09bd1c769bb77dd6e817f5a8639ba765f4c0f Reviewed-on: https://go-review.googlesource.com/c/crypto/+/614297 Reviewed-by: Cherry Mui Reviewed-by: Michael Knyszek Reviewed-by: Archana Ravindar LUCI-TryBot-Result: Go LUCI --- chacha20/chacha_noasm.go | 2 +- .../{chacha_ppc64le.go => chacha_ppc64x.go} | 2 +- .../{chacha_ppc64le.s => chacha_ppc64x.s} | 114 +++++++++++++----- 3 files changed, 88 insertions(+), 30 deletions(-) rename chacha20/{chacha_ppc64le.go => chacha_ppc64x.go} (89%) rename chacha20/{chacha_ppc64le.s => chacha_ppc64x.s} (76%) diff --git a/chacha20/chacha_noasm.go b/chacha20/chacha_noasm.go index db42e6676a..c709b72847 100644 --- a/chacha20/chacha_noasm.go +++ b/chacha20/chacha_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego +//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego package chacha20 diff --git a/chacha20/chacha_ppc64le.go b/chacha20/chacha_ppc64x.go similarity index 89% rename from chacha20/chacha_ppc64le.go rename to chacha20/chacha_ppc64x.go index 3a4287f990..bd183d9ba1 100644 --- a/chacha20/chacha_ppc64le.go +++ b/chacha20/chacha_ppc64x.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build gc && !purego +//go:build gc && !purego && (ppc64 || ppc64le) package chacha20 diff --git a/chacha20/chacha_ppc64le.s b/chacha20/chacha_ppc64x.s similarity index 76% rename from chacha20/chacha_ppc64le.s rename to chacha20/chacha_ppc64x.s index c672ccf698..a660b4112f 100644 --- a/chacha20/chacha_ppc64le.s +++ b/chacha20/chacha_ppc64x.s @@ -19,7 +19,7 @@ // The differences in this and the original implementation are // due to the calling conventions and initialization of constants. -//go:build gc && !purego +//go:build gc && !purego && (ppc64 || ppc64le) #include "textflag.h" @@ -36,32 +36,68 @@ // for VPERMXOR #define MASK R18 -DATA consts<>+0x00(SB)/8, $0x3320646e61707865 -DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 -DATA consts<>+0x10(SB)/8, $0x0000000000000001 -DATA consts<>+0x18(SB)/8, $0x0000000000000000 -DATA consts<>+0x20(SB)/8, $0x0000000000000004 -DATA consts<>+0x28(SB)/8, $0x0000000000000000 -DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d -DATA consts<>+0x38(SB)/8, $0x0203000106070405 -DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c -DATA consts<>+0x48(SB)/8, $0x0102030005060704 -DATA consts<>+0x50(SB)/8, $0x6170786561707865 -DATA consts<>+0x58(SB)/8, $0x6170786561707865 -DATA consts<>+0x60(SB)/8, $0x3320646e3320646e -DATA consts<>+0x68(SB)/8, $0x3320646e3320646e -DATA consts<>+0x70(SB)/8, $0x79622d3279622d32 -DATA consts<>+0x78(SB)/8, $0x79622d3279622d32 -DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 -DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 -DATA consts<>+0x90(SB)/8, $0x0000000100000000 -DATA consts<>+0x98(SB)/8, $0x0000000300000002 -DATA consts<>+0xa0(SB)/8, $0x5566774411223300 -DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88 -DATA consts<>+0xb0(SB)/8, $0x6677445522330011 -DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899 +DATA consts<>+0x00(SB)/4, $0x61707865 +DATA consts<>+0x04(SB)/4, $0x3320646e +DATA consts<>+0x08(SB)/4, $0x79622d32 +DATA consts<>+0x0c(SB)/4, $0x6b206574 +DATA consts<>+0x10(SB)/4, $0x00000001 +DATA consts<>+0x14(SB)/4, $0x00000000 +DATA consts<>+0x18(SB)/4, $0x00000000 +DATA consts<>+0x1c(SB)/4, $0x00000000 +DATA consts<>+0x20(SB)/4, $0x00000004 +DATA consts<>+0x24(SB)/4, $0x00000000 +DATA consts<>+0x28(SB)/4, $0x00000000 +DATA consts<>+0x2c(SB)/4, $0x00000000 +DATA consts<>+0x30(SB)/4, $0x0e0f0c0d +DATA consts<>+0x34(SB)/4, $0x0a0b0809 +DATA consts<>+0x38(SB)/4, $0x06070405 +DATA consts<>+0x3c(SB)/4, $0x02030001 +DATA consts<>+0x40(SB)/4, $0x0d0e0f0c +DATA consts<>+0x44(SB)/4, $0x090a0b08 +DATA consts<>+0x48(SB)/4, $0x05060704 +DATA consts<>+0x4c(SB)/4, $0x01020300 +DATA consts<>+0x50(SB)/4, $0x61707865 +DATA consts<>+0x54(SB)/4, $0x61707865 +DATA consts<>+0x58(SB)/4, $0x61707865 +DATA consts<>+0x5c(SB)/4, $0x61707865 +DATA consts<>+0x60(SB)/4, $0x3320646e +DATA consts<>+0x64(SB)/4, $0x3320646e +DATA consts<>+0x68(SB)/4, $0x3320646e +DATA consts<>+0x6c(SB)/4, $0x3320646e +DATA consts<>+0x70(SB)/4, $0x79622d32 +DATA consts<>+0x74(SB)/4, $0x79622d32 +DATA consts<>+0x78(SB)/4, $0x79622d32 +DATA consts<>+0x7c(SB)/4, $0x79622d32 +DATA consts<>+0x80(SB)/4, $0x6b206574 +DATA consts<>+0x84(SB)/4, $0x6b206574 +DATA consts<>+0x88(SB)/4, $0x6b206574 +DATA consts<>+0x8c(SB)/4, $0x6b206574 +DATA consts<>+0x90(SB)/4, $0x00000000 +DATA consts<>+0x94(SB)/4, $0x00000001 +DATA consts<>+0x98(SB)/4, $0x00000002 +DATA consts<>+0x9c(SB)/4, $0x00000003 +DATA consts<>+0xa0(SB)/4, $0x11223300 +DATA consts<>+0xa4(SB)/4, $0x55667744 +DATA consts<>+0xa8(SB)/4, $0x99aabb88 +DATA consts<>+0xac(SB)/4, $0xddeeffcc +DATA consts<>+0xb0(SB)/4, $0x22330011 +DATA consts<>+0xb4(SB)/4, $0x66774455 +DATA consts<>+0xb8(SB)/4, $0xaabb8899 +DATA consts<>+0xbc(SB)/4, $0xeeffccdd GLOBL consts<>(SB), RODATA, $0xc0 +#ifdef GOARCH_ppc64 +#define BE_XXBRW_INIT() \ + LVSL (R0)(R0), V24 \ + VSPLTISB $3, V25 \ + VXOR V24, V25, V24 \ + +#define BE_XXBRW(vr) VPERM vr, vr, V24, vr +#else +#define BE_XXBRW_INIT() +#define BE_XXBRW(vr) +#endif + //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD out+0(FP), OUT @@ -94,6 +130,8 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 // Clear V27 VXOR V27, V27, V27 + BE_XXBRW_INIT() + // V28 LXVW4X (CONSTBASE)(R11), VS60 @@ -299,6 +337,11 @@ loop_vsx: VADDUWM V8, V18, V8 VADDUWM V12, V19, V12 + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + CMPU LEN, $64 BLT tail_vsx @@ -327,6 +370,11 @@ loop_vsx: VADDUWM V9, V18, V8 VADDUWM V13, V19, V12 + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + CMPU LEN, $64 BLT tail_vsx @@ -334,8 +382,8 @@ loop_vsx: LXVW4X (INP)(R8), VS60 LXVW4X (INP)(R9), VS61 LXVW4X (INP)(R10), VS62 - VXOR V27, V0, V27 + VXOR V27, V0, V27 VXOR V28, V4, V28 VXOR V29, V8, V29 VXOR V30, V12, V30 @@ -354,6 +402,11 @@ loop_vsx: VADDUWM V10, V18, V8 VADDUWM V14, V19, V12 + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + CMPU LEN, $64 BLT tail_vsx @@ -381,6 +434,11 @@ loop_vsx: VADDUWM V11, V18, V8 VADDUWM V15, V19, V12 + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + CMPU LEN, $64 BLT tail_vsx @@ -408,9 +466,9 @@ loop_vsx: done_vsx: // Increment counter by number of 64 byte blocks - MOVD (CNT), R14 + MOVWZ (CNT), R14 ADD BLOCKS, R14 - MOVD R14, (CNT) + MOVWZ R14, (CNT) RET tail_vsx: From 7cfb9161e8d828fd6d9f34560e78460435b63503 Mon Sep 17 00:00:00 2001 From: samiponkanen Date: Wed, 16 Oct 2024 01:53:41 +0000 Subject: [PATCH 47/57] ssh: return unexpected msg error when server fails keyboard-interactive auth early Seems the OpenSSH server running on windows fails keyboard-interactive auth this way without sending any prompt to client. In such case the golang ssh client should not retry keyboard-interactive auth when the auth method is wrapped in a RetryableAuthMethod(). Rather the auth method should be immediately marked as tried&failed and the client auth process should move on to next available and acceptable auth method. Fixes golang/go#67855 Change-Id: I6c64ae58ff8325774e37af716601b112f8833d8f GitHub-Last-Rev: 7fafc4d1c81284b31000d7d6ccadd934dda26d24 GitHub-Pull-Request: golang/crypto#297 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/590956 LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Ian Lance Taylor Auto-Submit: Nicola Murino Reviewed-by: Nicola Murino --- ssh/client_auth.go | 5 +++ ssh/client_auth_test.go | 89 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/ssh/client_auth.go b/ssh/client_auth.go index b93961010d..b86dde151d 100644 --- a/ssh/client_auth.go +++ b/ssh/client_auth.go @@ -555,6 +555,7 @@ func (cb KeyboardInteractiveChallenge) auth(session []byte, user string, c packe } gotMsgExtInfo := false + gotUserAuthInfoRequest := false for { packet, err := c.readPacket() if err != nil { @@ -585,6 +586,9 @@ func (cb KeyboardInteractiveChallenge) auth(session []byte, user string, c packe if msg.PartialSuccess { return authPartialSuccess, msg.Methods, nil } + if !gotUserAuthInfoRequest { + return authFailure, msg.Methods, unexpectedMessageError(msgUserAuthInfoRequest, packet[0]) + } return authFailure, msg.Methods, nil case msgUserAuthSuccess: return authSuccess, nil, nil @@ -596,6 +600,7 @@ func (cb KeyboardInteractiveChallenge) auth(session []byte, user string, c packe if err := Unmarshal(packet, &msg); err != nil { return authFailure, nil, err } + gotUserAuthInfoRequest = true // Manually unpack the prompt/echo pairs. rest := msg.Prompts diff --git a/ssh/client_auth_test.go b/ssh/client_auth_test.go index e981cc49a6..f11eeb590b 100644 --- a/ssh/client_auth_test.go +++ b/ssh/client_auth_test.go @@ -1293,3 +1293,92 @@ func TestCertAuthOpenSSHCompat(t *testing.T) { t.Fatalf("unable to dial remote side: %s", err) } } + +func TestKeyboardInteractiveAuthEarlyFail(t *testing.T) { + const maxAuthTries = 2 + + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + + // Start testserver + serverConfig := &ServerConfig{ + MaxAuthTries: maxAuthTries, + KeyboardInteractiveCallback: func(c ConnMetadata, + client KeyboardInteractiveChallenge) (*Permissions, error) { + // Fail keyboard-interactive authentication early before + // any prompt is sent to client. + return nil, errors.New("keyboard-interactive auth failed") + }, + PasswordCallback: func(c ConnMetadata, + pass []byte) (*Permissions, error) { + if string(pass) == clientPassword { + return nil, nil + } + return nil, errors.New("password auth failed") + }, + } + serverConfig.AddHostKey(testSigners["rsa"]) + + serverDone := make(chan struct{}) + go func() { + defer func() { serverDone <- struct{}{} }() + conn, chans, reqs, err := NewServerConn(c2, serverConfig) + if err != nil { + return + } + _ = conn.Close() + + discarderDone := make(chan struct{}) + go func() { + defer func() { discarderDone <- struct{}{} }() + DiscardRequests(reqs) + }() + for newChannel := range chans { + newChannel.Reject(Prohibited, + "testserver not accepting requests") + } + + <-discarderDone + }() + + // Connect to testserver, expect KeyboardInteractive() to be not called, + // PasswordCallback() to be called and connection to succeed. + passwordCallbackCalled := false + clientConfig := &ClientConfig{ + User: "testuser", + Auth: []AuthMethod{ + RetryableAuthMethod(KeyboardInteractive(func(name, + instruction string, questions []string, + echos []bool) ([]string, error) { + t.Errorf("unexpected call to KeyboardInteractive()") + return []string{clientPassword}, nil + }), maxAuthTries), + RetryableAuthMethod(PasswordCallback(func() (secret string, + err error) { + t.Logf("PasswordCallback()") + passwordCallbackCalled = true + return clientPassword, nil + }), maxAuthTries), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + conn, _, _, err := NewClientConn(c1, "", clientConfig) + if err != nil { + t.Errorf("unexpected NewClientConn() error: %v", err) + } + if conn != nil { + conn.Close() + } + + // Wait for server to finish. + <-serverDone + + if !passwordCallbackCalled { + t.Errorf("expected PasswordCallback() to be called") + } +} From c17aa50fbd32393e5d52fa65ca51cbfff0a75aea Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 29 Sep 2024 23:27:16 +0200 Subject: [PATCH 48/57] sha3: avoid buffer copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the package worked by copying the input (or the output) into a buffer, and then XOR'ing (or copying) it into (or out of) the state. (Except for an input fast path.) There's no need for that! We can XOR straight into the state, and copy straight out of it, at least on little endian machines. This is a bit faster, almost halves the state size, and will make it easier to implement marshaling, but most importantly look at how much simpler it makes the code! go: go1.23.0 goos: linux goarch: amd64 pkg: golang.org/x/crypto/sha3 cpu: AMD Ryzen 7 PRO 8700GE w/ Radeon 780M Graphics │ v0.27.0-2-g42ee18b9637 │ v0.27.0-2-g42ee18b9637-dirty │ │ sec/op │ sec/op vs base │ PermutationFunction-8 270.8n ± 0% 270.4n ± 0% ~ (p=0.099 n=10) Sha3_512_MTU-8 5.762µ ± 0% 5.658µ ± 0% -1.80% (p=0.000 n=10) Sha3_384_MTU-8 4.179µ ± 0% 4.070µ ± 0% -2.60% (p=0.000 n=10) Sha3_256_MTU-8 3.316µ ± 0% 3.214µ ± 0% -3.08% (p=0.000 n=10) Sha3_224_MTU-8 3.175µ ± 0% 3.061µ ± 0% -3.61% (p=0.000 n=10) Shake128_MTU-8 2.779µ ± 0% 2.681µ ± 0% -3.51% (p=0.000 n=10) Shake256_MTU-8 2.947µ ± 0% 2.957µ ± 0% +0.32% (p=0.000 n=10) Shake256_16x-8 44.15µ ± 0% 44.45µ ± 0% +0.67% (p=0.000 n=10) Shake256_1MiB-8 2.319m ± 0% 2.274m ± 0% -1.93% (p=0.000 n=10) Sha3_512_1MiB-8 4.204m ± 0% 4.219m ± 0% +0.34% (p=0.000 n=10) geomean 13.75µ 13.54µ -1.55% │ v0.27.0-2-g42ee18b9637 │ v0.27.0-2-g42ee18b9637-dirty │ │ B/s │ B/s vs base │ PermutationFunction-8 704.3Mi ± 0% 705.4Mi ± 0% ~ (p=0.105 n=10) Sha3_512_MTU-8 223.5Mi ± 0% 227.6Mi ± 0% +1.83% (p=0.000 n=10) Sha3_384_MTU-8 308.1Mi ± 0% 316.4Mi ± 0% +2.67% (p=0.000 n=10) Sha3_256_MTU-8 388.2Mi ± 0% 400.5Mi ± 0% +3.17% (p=0.000 n=10) Sha3_224_MTU-8 405.5Mi ± 0% 420.7Mi ± 0% +3.73% (p=0.000 n=10) Shake128_MTU-8 463.4Mi ± 0% 480.2Mi ± 0% +3.64% (p=0.000 n=10) Shake256_MTU-8 436.9Mi ± 0% 435.5Mi ± 0% -0.32% (p=0.000 n=10) Shake256_16x-8 353.9Mi ± 0% 351.5Mi ± 0% -0.66% (p=0.000 n=10) Shake256_1MiB-8 431.2Mi ± 0% 439.7Mi ± 0% +1.97% (p=0.000 n=10) Sha3_512_1MiB-8 237.8Mi ± 0% 237.1Mi ± 0% -0.33% (p=0.000 n=10) geomean 375.7Mi 381.6Mi +1.57% Even stronger effect when patched on top of CL 616555 (forced on). go: go1.23.0 goos: darwin goarch: arm64 pkg: golang.org/x/crypto/sha3 cpu: Apple M2 │ old │ new │ │ sec/op │ sec/op vs base │ PermutationFunction-8 154.7n ± 2% 153.8n ± 1% ~ (p=0.469 n=10) Sha3_512_MTU-8 3.260µ ± 2% 3.143µ ± 2% -3.60% (p=0.000 n=10) Sha3_384_MTU-8 2.389µ ± 2% 2.244µ ± 2% -6.07% (p=0.000 n=10) Sha3_256_MTU-8 1.950µ ± 2% 1.758µ ± 1% -9.87% (p=0.000 n=10) Sha3_224_MTU-8 1.874µ ± 2% 1.686µ ± 1% -10.06% (p=0.000 n=10) Shake128_MTU-8 1.827µ ± 3% 1.447µ ± 1% -20.80% (p=0.000 n=10) Shake256_MTU-8 1.665µ ± 3% 1.604µ ± 3% -3.63% (p=0.003 n=10) Shake256_16x-8 25.14µ ± 1% 25.23µ ± 2% ~ (p=0.912 n=10) Shake256_1MiB-8 1.236m ± 2% 1.243m ± 2% ~ (p=0.631 n=10) Sha3_512_1MiB-8 2.296m ± 2% 2.305m ± 1% ~ (p=0.315 n=10) geomean 7.906µ 7.467µ -5.56% │ old │ new │ │ B/op │ B/op vs base │ PermutationFunction-8 1.204Gi ± 2% 1.212Gi ± 1% ~ (p=0.529 n=10) Sha3_512_MTU-8 394.9Mi ± 2% 409.7Mi ± 2% +3.73% (p=0.000 n=10) Sha3_384_MTU-8 539.0Mi ± 2% 573.8Mi ± 2% +6.45% (p=0.000 n=10) Sha3_256_MTU-8 660.3Mi ± 2% 732.6Mi ± 1% +10.95% (p=0.000 n=10) Sha3_224_MTU-8 687.1Mi ± 2% 763.9Mi ± 1% +11.17% (p=0.000 n=10) Shake128_MTU-8 704.7Mi ± 2% 889.6Mi ± 2% +26.24% (p=0.000 n=10) Shake256_MTU-8 773.4Mi ± 3% 802.5Mi ± 3% +3.76% (p=0.004 n=10) Shake256_16x-8 621.6Mi ± 1% 619.3Mi ± 2% ~ (p=0.912 n=10) Shake256_1MiB-8 809.1Mi ± 2% 804.7Mi ± 2% ~ (p=0.631 n=10) Sha3_512_1MiB-8 435.6Mi ± 2% 433.9Mi ± 1% ~ (p=0.315 n=10) geomean 653.6Mi 692.0Mi +5.88% Change-Id: I33a0a1ddf305c395f99bf17f81473e2f42c5ce42 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/616575 Reviewed-by: Daniel McCarney Reviewed-by: Michael Pratt Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Auto-Submit: Filippo Valsorda Reviewed-by: Andrew Ekstedt --- sha3/sha3.go | 113 +++++++++++++++++++++++---------------------------- sha3/xor.go | 40 ------------------ 2 files changed, 50 insertions(+), 103 deletions(-) delete mode 100644 sha3/xor.go diff --git a/sha3/sha3.go b/sha3/sha3.go index afedde5abf..bda574e9d6 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -4,6 +4,14 @@ package sha3 +import ( + "crypto/subtle" + "encoding/binary" + "unsafe" + + "golang.org/x/sys/cpu" +) + // spongeDirection indicates the direction bytes are flowing through the sponge. type spongeDirection int @@ -14,16 +22,13 @@ const ( spongeSqueezing ) -const ( - // maxRate is the maximum size of the internal buffer. SHAKE-256 - // currently needs the largest buffer. - maxRate = 168 -) - type state struct { - // Generic sponge components. - a [25]uint64 // main state of the hash - rate int // the number of bytes of state to use + a [1600 / 8]byte // main state of the hash + + // a[n:rate] is the buffer. If absorbing, it's the remaining space to XOR + // into before running the permutation. If squeezing, it's the remaining + // output to produce before running the permutation. + n, rate int // dsbyte contains the "domain separation" bits and the first bit of // the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the @@ -39,10 +44,6 @@ type state struct { // Extendable-Output Functions (May 2014)" dsbyte byte - i, n int // storage[i:n] is the buffer, i is only used while squeezing - storage [maxRate]byte - - // Specific to SHA-3 and SHAKE. outputLen int // the default output size in bytes state spongeDirection // whether the sponge is absorbing or squeezing } @@ -61,7 +62,7 @@ func (d *state) Reset() { d.a[i] = 0 } d.state = spongeAbsorbing - d.i, d.n = 0, 0 + d.n = 0 } func (d *state) clone() *state { @@ -69,22 +70,25 @@ func (d *state) clone() *state { return &ret } -// permute applies the KeccakF-1600 permutation. It handles -// any input-output buffering. +// permute applies the KeccakF-1600 permutation. func (d *state) permute() { - switch d.state { - case spongeAbsorbing: - // If we're absorbing, we need to xor the input into the state - // before applying the permutation. - xorIn(d, d.storage[:d.rate]) - d.n = 0 - keccakF1600(&d.a) - case spongeSqueezing: - // If we're squeezing, we need to apply the permutation before - // copying more output. - keccakF1600(&d.a) - d.i = 0 - copyOut(d, d.storage[:d.rate]) + var a *[25]uint64 + if cpu.IsBigEndian { + a = new([25]uint64) + for i := range a { + a[i] = binary.LittleEndian.Uint64(d.a[i*8:]) + } + } else { + a = (*[25]uint64)(unsafe.Pointer(&d.a)) + } + + keccakF1600(a) + d.n = 0 + + if cpu.IsBigEndian { + for i := range a { + binary.LittleEndian.PutUint64(d.a[i*8:], a[i]) + } } } @@ -92,53 +96,36 @@ func (d *state) permute() { // the multi-bitrate 10..1 padding rule, and permutes the state. func (d *state) padAndPermute() { // Pad with this instance's domain-separator bits. We know that there's - // at least one byte of space in d.buf because, if it were full, + // at least one byte of space in the sponge because, if it were full, // permute would have been called to empty it. dsbyte also contains the // first one bit for the padding. See the comment in the state struct. - d.storage[d.n] = d.dsbyte - d.n++ - for d.n < d.rate { - d.storage[d.n] = 0 - d.n++ - } + d.a[d.n] ^= d.dsbyte // This adds the final one bit for the padding. Because of the way that // bits are numbered from the LSB upwards, the final bit is the MSB of // the last byte. - d.storage[d.rate-1] ^= 0x80 + d.a[d.rate-1] ^= 0x80 // Apply the permutation d.permute() d.state = spongeSqueezing - d.n = d.rate - copyOut(d, d.storage[:d.rate]) } // Write absorbs more data into the hash's state. It panics if any // output has already been read. -func (d *state) Write(p []byte) (written int, err error) { +func (d *state) Write(p []byte) (n int, err error) { if d.state != spongeAbsorbing { panic("sha3: Write after Read") } - written = len(p) + + n = len(p) for len(p) > 0 { - if d.n == 0 && len(p) >= d.rate { - // The fast path; absorb a full "rate" bytes of input and apply the permutation. - xorIn(d, p[:d.rate]) - p = p[d.rate:] - keccakF1600(&d.a) - } else { - // The slow path; buffer the input until we can fill the sponge, and then xor it in. - todo := d.rate - d.n - if todo > len(p) { - todo = len(p) - } - d.n += copy(d.storage[d.n:], p[:todo]) - p = p[todo:] - - // If the sponge is full, apply the permutation. - if d.n == d.rate { - d.permute() - } + x := subtle.XORBytes(d.a[d.n:d.rate], d.a[d.n:d.rate], p) + d.n += x + p = p[x:] + + // If the sponge is full, apply the permutation. + if d.n == d.rate { + d.permute() } } @@ -156,12 +143,12 @@ func (d *state) Read(out []byte) (n int, err error) { // Now, do the squeezing. for len(out) > 0 { - n := copy(out, d.storage[d.i:d.n]) - d.i += n - out = out[n:] + x := copy(out, d.a[d.n:d.rate]) + d.n += x + out = out[x:] // Apply the permutation if we've squeezed the sponge dry. - if d.i == d.rate { + if d.n == d.rate { d.permute() } } diff --git a/sha3/xor.go b/sha3/xor.go deleted file mode 100644 index 6ada5c9574..0000000000 --- a/sha3/xor.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package sha3 - -import ( - "crypto/subtle" - "encoding/binary" - "unsafe" - - "golang.org/x/sys/cpu" -) - -// xorIn xors the bytes in buf into the state. -func xorIn(d *state, buf []byte) { - if cpu.IsBigEndian { - for i := 0; len(buf) >= 8; i++ { - a := binary.LittleEndian.Uint64(buf) - d.a[i] ^= a - buf = buf[8:] - } - } else { - ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) - subtle.XORBytes(ab[:], ab[:], buf) - } -} - -// copyOut copies uint64s to a byte buffer. -func copyOut(d *state, b []byte) { - if cpu.IsBigEndian { - for i := 0; len(b) >= 8; i++ { - binary.LittleEndian.PutUint64(b, d.a[i]) - b = b[8:] - } - } else { - ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) - copy(b, ab[:]) - } -} From 80ea76eb17c0c52f5d5d04e833d6aeb6b062d81d Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 30 Sep 2024 00:57:48 +0200 Subject: [PATCH 49/57] sha3: fix padding for long cSHAKE parameters We used to compute the incorrect value if len(initBlock) % rate == 0. Also, add a test vector for golang/go#66232, confirmed to fail on GOARCH=386 without CL 570876. Fixes golang/go#69169 Change-Id: I3f2400926fca111dd0ca1327d6b5975e51b28f96 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/616576 Reviewed-by: Andrew Ekstedt Reviewed-by: Daniel McCarney Reviewed-by: Michael Pratt LUCI-TryBot-Result: Go LUCI Auto-Submit: Filippo Valsorda Reviewed-by: Roland Shoemaker --- sha3/sha3_test.go | 111 ++++++++++++++++++++++++++++++++++++++++++++++ sha3/shake.go | 45 ++++++++++--------- 2 files changed, 134 insertions(+), 22 deletions(-) diff --git a/sha3/sha3_test.go b/sha3/sha3_test.go index 21e8cbad7b..8347b60d10 100644 --- a/sha3/sha3_test.go +++ b/sha3/sha3_test.go @@ -17,6 +17,7 @@ import ( "encoding/json" "fmt" "hash" + "io" "math/rand" "os" "strings" @@ -375,6 +376,116 @@ func TestClone(t *testing.T) { } } +func TestCSHAKEAccumulated(t *testing.T) { + // Generated with pycryptodome@3.20.0 + // + // from Crypto.Hash import cSHAKE128 + // rng = cSHAKE128.new() + // acc = cSHAKE128.new() + // for n in range(200): + // N = rng.read(n) + // for s in range(200): + // S = rng.read(s) + // c = cSHAKE128.cSHAKE_XOF(data=None, custom=S, capacity=256, function=N) + // c.update(rng.read(100)) + // acc.update(c.read(200)) + // c = cSHAKE128.cSHAKE_XOF(data=None, custom=S, capacity=256, function=N) + // c.update(rng.read(168)) + // acc.update(c.read(200)) + // c = cSHAKE128.cSHAKE_XOF(data=None, custom=S, capacity=256, function=N) + // c.update(rng.read(200)) + // acc.update(c.read(200)) + // print(acc.read(32).hex()) + // + // and with @noble/hashes@v1.5.0 + // + // import { bytesToHex } from "@noble/hashes/utils"; + // import { cshake128 } from "@noble/hashes/sha3-addons"; + // const rng = cshake128.create(); + // const acc = cshake128.create(); + // for (let n = 0; n < 200; n++) { + // const N = rng.xof(n); + // for (let s = 0; s < 200; s++) { + // const S = rng.xof(s); + // let c = cshake128.create({ NISTfn: N, personalization: S }); + // c.update(rng.xof(100)); + // acc.update(c.xof(200)); + // c = cshake128.create({ NISTfn: N, personalization: S }); + // c.update(rng.xof(168)); + // acc.update(c.xof(200)); + // c = cshake128.create({ NISTfn: N, personalization: S }); + // c.update(rng.xof(200)); + // acc.update(c.xof(200)); + // } + // } + // console.log(bytesToHex(acc.xof(32))); + // + t.Run("cSHAKE128", func(t *testing.T) { + testCSHAKEAccumulated(t, NewCShake128, rate128, + "bb14f8657c6ec5403d0b0e2ef3d3393497e9d3b1a9a9e8e6c81dbaa5fd809252") + }) + t.Run("cSHAKE256", func(t *testing.T) { + testCSHAKEAccumulated(t, NewCShake256, rate256, + "0baaf9250c6e25f0c14ea5c7f9bfde54c8a922c8276437db28f3895bdf6eeeef") + }) +} + +func testCSHAKEAccumulated(t *testing.T, newCShake func(N, S []byte) ShakeHash, rate int64, exp string) { + rnd := newCShake(nil, nil) + acc := newCShake(nil, nil) + for n := 0; n < 200; n++ { + N := make([]byte, n) + rnd.Read(N) + for s := 0; s < 200; s++ { + S := make([]byte, s) + rnd.Read(S) + + c := newCShake(N, S) + io.CopyN(c, rnd, 100 /* < rate */) + io.CopyN(acc, c, 200) + + c.Reset() + io.CopyN(c, rnd, rate) + io.CopyN(acc, c, 200) + + c.Reset() + io.CopyN(c, rnd, 200 /* > rate */) + io.CopyN(acc, c, 200) + } + } + if got := hex.EncodeToString(acc.Sum(nil)[:32]); got != exp { + t.Errorf("got %s, want %s", got, exp) + } +} + +func TestCSHAKELargeS(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + // See https://go.dev/issue/66232. + const s = (1<<32)/8 + 1000 // s * 8 > 2^32 + S := make([]byte, s) + rnd := NewShake128() + rnd.Read(S) + c := NewCShake128(nil, S) + io.CopyN(c, rnd, 1000) + + // Generated with pycryptodome@3.20.0 + // + // from Crypto.Hash import cSHAKE128 + // rng = cSHAKE128.new() + // S = rng.read(536871912) + // c = cSHAKE128.new(custom=S) + // c.update(rng.read(1000)) + // print(c.read(32).hex()) + // + exp := "2cb9f237767e98f2614b8779cf096a52da9b3a849280bbddec820771ae529cf0" + if got := hex.EncodeToString(c.Sum(nil)); got != exp { + t.Errorf("got %s, want %s", got, exp) + } +} + // BenchmarkPermutationFunction measures the speed of the permutation function // with no input data. func BenchmarkPermutationFunction(b *testing.B) { diff --git a/sha3/shake.go b/sha3/shake.go index a01ef43577..6d75811fd4 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -19,6 +19,7 @@ import ( "encoding/binary" "hash" "io" + "math/bits" ) // ShakeHash defines the interface to hash functions that support @@ -58,33 +59,33 @@ const ( rate256 = 136 ) -func bytepad(input []byte, w int) []byte { - // leftEncode always returns max 9 bytes - buf := make([]byte, 0, 9+len(input)+w) - buf = append(buf, leftEncode(uint64(w))...) - buf = append(buf, input...) - padlen := w - (len(buf) % w) - return append(buf, make([]byte, padlen)...) -} - -func leftEncode(value uint64) []byte { - var b [9]byte - binary.BigEndian.PutUint64(b[1:], value) - // Trim all but last leading zero bytes - i := byte(1) - for i < 8 && b[i] == 0 { - i++ +func bytepad(data []byte, rate int) []byte { + out := make([]byte, 0, 9+len(data)+rate-1) + out = append(out, leftEncode(uint64(rate))...) + out = append(out, data...) + if padlen := rate - len(out)%rate; padlen < rate { + out = append(out, make([]byte, padlen)...) } - // Prepend number of encoded bytes - b[i-1] = 9 - i - return b[i-1:] + return out +} + +func leftEncode(x uint64) []byte { + // Let n be the smallest positive integer for which 2^(8n) > x. + n := (bits.Len64(x) + 7) / 8 + if n == 0 { + n = 1 + } + // Return n || x with n as a byte and x an n bytes in big-endian order. + b := make([]byte, 9) + binary.BigEndian.PutUint64(b[1:], x) + b = b[9-n-1:] + b[0] = byte(n) + return b } func newCShake(N, S []byte, rate, outputLen int, dsbyte byte) ShakeHash { c := cshakeState{state: &state{rate: rate, outputLen: outputLen, dsbyte: dsbyte}} - - // leftEncode returns max 9 bytes - c.initBlock = make([]byte, 0, 9*2+len(N)+len(S)) + c.initBlock = make([]byte, 0, 9+len(N)+9+len(S)) // leftEncode returns max 9 bytes c.initBlock = append(c.initBlock, leftEncode(uint64(len(N))*8)...) c.initBlock = append(c.initBlock, N...) c.initBlock = append(c.initBlock, leftEncode(uint64(len(S))*8)...) From 36b172546bd03a74c79e109ec84c599b672ea9e4 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Wed, 2 Oct 2024 12:44:13 +0200 Subject: [PATCH 50/57] sha3: avoid trailing permutation If you read a multiple of the rate, and then stop, there is no point in running the final permutation. Change-Id: Ic95e70f78b6e139aca1d3e3c11e09d2bbcf54f6c Reviewed-on: https://go-review.googlesource.com/c/crypto/+/620555 Reviewed-by: Daniel McCarney Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Pratt Auto-Submit: Filippo Valsorda --- sha3/sha3.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sha3/sha3.go b/sha3/sha3.go index bda574e9d6..4f5caddb01 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -143,14 +143,14 @@ func (d *state) Read(out []byte) (n int, err error) { // Now, do the squeezing. for len(out) > 0 { - x := copy(out, d.a[d.n:d.rate]) - d.n += x - out = out[x:] - // Apply the permutation if we've squeezed the sponge dry. if d.n == d.rate { d.permute() } + + x := copy(out, d.a[d.n:d.rate]) + d.n += x + out = out[x:] } return From 750a45fe5e473d5afa193e9088f3d135e64eca26 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 30 Sep 2024 13:39:09 +0200 Subject: [PATCH 51/57] sha3: add MarshalBinary, AppendBinary, and UnmarshalBinary Fixes golang/go#24617 Change-Id: I1d9d529950aa8a5953435e8d3412cda44b075d55 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/616635 Reviewed-by: Roland Shoemaker Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Daniel McCarney Reviewed-by: Michael Pratt --- sha3/doc.go | 4 +++ sha3/hashes.go | 31 ++++++++++++++++---- sha3/sha3.go | 72 +++++++++++++++++++++++++++++++++++++++++++++++ sha3/sha3_test.go | 42 +++++++++++++++++++++++++-- sha3/shake.go | 42 +++++++++++++++++++-------- 5 files changed, 171 insertions(+), 20 deletions(-) diff --git a/sha3/doc.go b/sha3/doc.go index 7e02309070..bbf391fe6e 100644 --- a/sha3/doc.go +++ b/sha3/doc.go @@ -5,6 +5,10 @@ // Package sha3 implements the SHA-3 fixed-output-length hash functions and // the SHAKE variable-output-length hash functions defined by FIPS-202. // +// All types in this package also implement [encoding.BinaryMarshaler], +// [encoding.BinaryAppender] and [encoding.BinaryUnmarshaler] to marshal and +// unmarshal the internal state of the hash. +// // Both types of hash function use the "sponge" construction and the Keccak // permutation. For a detailed specification see http://keccak.noekeon.org/ // diff --git a/sha3/hashes.go b/sha3/hashes.go index c544b29e5f..31fffbe044 100644 --- a/sha3/hashes.go +++ b/sha3/hashes.go @@ -48,33 +48,52 @@ func init() { crypto.RegisterHash(crypto.SHA3_512, New512) } +const ( + dsbyteSHA3 = 0b00000110 + dsbyteKeccak = 0b00000001 + dsbyteShake = 0b00011111 + dsbyteCShake = 0b00000100 + + // rateK[c] is the rate in bytes for Keccak[c] where c is the capacity in + // bits. Given the sponge size is 1600 bits, the rate is 1600 - c bits. + rateK256 = (1600 - 256) / 8 + rateK448 = (1600 - 448) / 8 + rateK512 = (1600 - 512) / 8 + rateK768 = (1600 - 768) / 8 + rateK1024 = (1600 - 1024) / 8 +) + func new224Generic() *state { - return &state{rate: 144, outputLen: 28, dsbyte: 0x06} + return &state{rate: rateK448, outputLen: 28, dsbyte: dsbyteSHA3} } func new256Generic() *state { - return &state{rate: 136, outputLen: 32, dsbyte: 0x06} + return &state{rate: rateK512, outputLen: 32, dsbyte: dsbyteSHA3} } func new384Generic() *state { - return &state{rate: 104, outputLen: 48, dsbyte: 0x06} + return &state{rate: rateK768, outputLen: 48, dsbyte: dsbyteSHA3} } func new512Generic() *state { - return &state{rate: 72, outputLen: 64, dsbyte: 0x06} + return &state{rate: rateK1024, outputLen: 64, dsbyte: dsbyteSHA3} } // NewLegacyKeccak256 creates a new Keccak-256 hash. // // Only use this function if you require compatibility with an existing cryptosystem // that uses non-standard padding. All other users should use New256 instead. -func NewLegacyKeccak256() hash.Hash { return &state{rate: 136, outputLen: 32, dsbyte: 0x01} } +func NewLegacyKeccak256() hash.Hash { + return &state{rate: rateK512, outputLen: 32, dsbyte: dsbyteKeccak} +} // NewLegacyKeccak512 creates a new Keccak-512 hash. // // Only use this function if you require compatibility with an existing cryptosystem // that uses non-standard padding. All other users should use New512 instead. -func NewLegacyKeccak512() hash.Hash { return &state{rate: 72, outputLen: 64, dsbyte: 0x01} } +func NewLegacyKeccak512() hash.Hash { + return &state{rate: rateK1024, outputLen: 64, dsbyte: dsbyteKeccak} +} // Sum224 returns the SHA3-224 digest of the data. func Sum224(data []byte) (digest [28]byte) { diff --git a/sha3/sha3.go b/sha3/sha3.go index 4f5caddb01..6658c44479 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -7,6 +7,7 @@ package sha3 import ( "crypto/subtle" "encoding/binary" + "errors" "unsafe" "golang.org/x/sys/cpu" @@ -170,3 +171,74 @@ func (d *state) Sum(in []byte) []byte { dup.Read(hash) return append(in, hash...) } + +const ( + magicSHA3 = "sha\x08" + magicShake = "sha\x09" + magicCShake = "sha\x0a" + magicKeccak = "sha\x0b" + // magic || rate || main state || n || sponge direction + marshaledSize = len(magicSHA3) + 1 + 200 + 1 + 1 +) + +func (d *state) MarshalBinary() ([]byte, error) { + return d.AppendBinary(make([]byte, 0, marshaledSize)) +} + +func (d *state) AppendBinary(b []byte) ([]byte, error) { + switch d.dsbyte { + case dsbyteSHA3: + b = append(b, magicSHA3...) + case dsbyteShake: + b = append(b, magicShake...) + case dsbyteCShake: + b = append(b, magicCShake...) + case dsbyteKeccak: + b = append(b, magicKeccak...) + default: + panic("unknown dsbyte") + } + // rate is at most 168, and n is at most rate. + b = append(b, byte(d.rate)) + b = append(b, d.a[:]...) + b = append(b, byte(d.n), byte(d.state)) + return b, nil +} + +func (d *state) UnmarshalBinary(b []byte) error { + if len(b) != marshaledSize { + return errors.New("sha3: invalid hash state") + } + + magic := string(b[:len(magicSHA3)]) + b = b[len(magicSHA3):] + switch { + case magic == magicSHA3 && d.dsbyte == dsbyteSHA3: + case magic == magicShake && d.dsbyte == dsbyteShake: + case magic == magicCShake && d.dsbyte == dsbyteCShake: + case magic == magicKeccak && d.dsbyte == dsbyteKeccak: + default: + return errors.New("sha3: invalid hash state identifier") + } + + rate := int(b[0]) + b = b[1:] + if rate != d.rate { + return errors.New("sha3: invalid hash state function") + } + + copy(d.a[:], b) + b = b[len(d.a):] + + n, state := int(b[0]), spongeDirection(b[1]) + if n > d.rate { + return errors.New("sha3: invalid hash state") + } + d.n = n + if state != spongeAbsorbing && state != spongeSqueezing { + return errors.New("sha3: invalid hash state") + } + d.state = state + + return nil +} diff --git a/sha3/sha3_test.go b/sha3/sha3_test.go index 8347b60d10..d97a970b1a 100644 --- a/sha3/sha3_test.go +++ b/sha3/sha3_test.go @@ -13,6 +13,7 @@ package sha3 import ( "bytes" "compress/flate" + "encoding" "encoding/hex" "encoding/json" "fmt" @@ -421,11 +422,11 @@ func TestCSHAKEAccumulated(t *testing.T) { // console.log(bytesToHex(acc.xof(32))); // t.Run("cSHAKE128", func(t *testing.T) { - testCSHAKEAccumulated(t, NewCShake128, rate128, + testCSHAKEAccumulated(t, NewCShake128, rateK256, "bb14f8657c6ec5403d0b0e2ef3d3393497e9d3b1a9a9e8e6c81dbaa5fd809252") }) t.Run("cSHAKE256", func(t *testing.T) { - testCSHAKEAccumulated(t, NewCShake256, rate256, + testCSHAKEAccumulated(t, NewCShake256, rateK512, "0baaf9250c6e25f0c14ea5c7f9bfde54c8a922c8276437db28f3895bdf6eeeef") }) } @@ -486,6 +487,43 @@ func TestCSHAKELargeS(t *testing.T) { } } +func TestMarshalUnmarshal(t *testing.T) { + t.Run("SHA3-224", func(t *testing.T) { testMarshalUnmarshal(t, New224()) }) + t.Run("SHA3-256", func(t *testing.T) { testMarshalUnmarshal(t, New256()) }) + t.Run("SHA3-384", func(t *testing.T) { testMarshalUnmarshal(t, New384()) }) + t.Run("SHA3-512", func(t *testing.T) { testMarshalUnmarshal(t, New512()) }) + t.Run("SHAKE128", func(t *testing.T) { testMarshalUnmarshal(t, NewShake128()) }) + t.Run("SHAKE256", func(t *testing.T) { testMarshalUnmarshal(t, NewShake256()) }) + t.Run("cSHAKE128", func(t *testing.T) { testMarshalUnmarshal(t, NewCShake128([]byte("N"), []byte("S"))) }) + t.Run("cSHAKE256", func(t *testing.T) { testMarshalUnmarshal(t, NewCShake256([]byte("N"), []byte("S"))) }) + t.Run("Keccak-256", func(t *testing.T) { testMarshalUnmarshal(t, NewLegacyKeccak256()) }) + t.Run("Keccak-512", func(t *testing.T) { testMarshalUnmarshal(t, NewLegacyKeccak512()) }) +} + +// TODO(filippo): move this to crypto/internal/cryptotest. +func testMarshalUnmarshal(t *testing.T, h hash.Hash) { + buf := make([]byte, 200) + rand.Read(buf) + n := rand.Intn(200) + h.Write(buf) + want := h.Sum(nil) + h.Reset() + h.Write(buf[:n]) + b, err := h.(encoding.BinaryMarshaler).MarshalBinary() + if err != nil { + t.Errorf("MarshalBinary: %v", err) + } + h.Write(bytes.Repeat([]byte{0}, 200)) + if err := h.(encoding.BinaryUnmarshaler).UnmarshalBinary(b); err != nil { + t.Errorf("UnmarshalBinary: %v", err) + } + h.Write(buf[n:]) + got := h.Sum(nil) + if !bytes.Equal(got, want) { + t.Errorf("got %x, want %x", got, want) + } +} + // BenchmarkPermutationFunction measures the speed of the permutation function // with no input data. func BenchmarkPermutationFunction(b *testing.B) { diff --git a/sha3/shake.go b/sha3/shake.go index 6d75811fd4..a6b3a4281f 100644 --- a/sha3/shake.go +++ b/sha3/shake.go @@ -16,7 +16,9 @@ package sha3 // [2] https://doi.org/10.6028/NIST.SP.800-185 import ( + "bytes" "encoding/binary" + "errors" "hash" "io" "math/bits" @@ -51,14 +53,6 @@ type cshakeState struct { initBlock []byte } -// Consts for configuring initial SHA-3 state -const ( - dsbyteShake = 0x1f - dsbyteCShake = 0x04 - rate128 = 168 - rate256 = 136 -) - func bytepad(data []byte, rate int) []byte { out := make([]byte, 0, 9+len(data)+rate-1) out = append(out, leftEncode(uint64(rate))...) @@ -112,6 +106,30 @@ func (c *state) Clone() ShakeHash { return c.clone() } +func (c *cshakeState) MarshalBinary() ([]byte, error) { + return c.AppendBinary(make([]byte, 0, marshaledSize+len(c.initBlock))) +} + +func (c *cshakeState) AppendBinary(b []byte) ([]byte, error) { + b, err := c.state.AppendBinary(b) + if err != nil { + return nil, err + } + b = append(b, c.initBlock...) + return b, nil +} + +func (c *cshakeState) UnmarshalBinary(b []byte) error { + if len(b) <= marshaledSize { + return errors.New("sha3: invalid hash state") + } + if err := c.state.UnmarshalBinary(b[:marshaledSize]); err != nil { + return err + } + c.initBlock = bytes.Clone(b[marshaledSize:]) + return nil +} + // NewShake128 creates a new SHAKE128 variable-output-length ShakeHash. // Its generic security strength is 128 bits against all attacks if at // least 32 bytes of its output are used. @@ -127,11 +145,11 @@ func NewShake256() ShakeHash { } func newShake128Generic() *state { - return &state{rate: rate128, outputLen: 32, dsbyte: dsbyteShake} + return &state{rate: rateK256, outputLen: 32, dsbyte: dsbyteShake} } func newShake256Generic() *state { - return &state{rate: rate256, outputLen: 64, dsbyte: dsbyteShake} + return &state{rate: rateK512, outputLen: 64, dsbyte: dsbyteShake} } // NewCShake128 creates a new instance of cSHAKE128 variable-output-length ShakeHash, @@ -144,7 +162,7 @@ func NewCShake128(N, S []byte) ShakeHash { if len(N) == 0 && len(S) == 0 { return NewShake128() } - return newCShake(N, S, rate128, 32, dsbyteCShake) + return newCShake(N, S, rateK256, 32, dsbyteCShake) } // NewCShake256 creates a new instance of cSHAKE256 variable-output-length ShakeHash, @@ -157,7 +175,7 @@ func NewCShake256(N, S []byte) ShakeHash { if len(N) == 0 && len(S) == 0 { return NewShake256() } - return newCShake(N, S, rate256, 64, dsbyteCShake) + return newCShake(N, S, rateK512, 64, dsbyteCShake) } // ShakeSum128 writes an arbitrary-length digest of data into hash. From 71ed71b4faf97caafd1863fed003e9ac311f10ee Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 31 Oct 2024 15:31:36 -0700 Subject: [PATCH 52/57] README: don't recommend go get These days people will just import the packages and the go tool will do the right thing. We don't need to explain it. Add a pointer to the git repo, though. For golang/go#62645 Change-Id: I8b1e4a877bd83fe6891688a44d27a6c7902c8979 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/624155 LUCI-TryBot-Result: Go LUCI Commit-Queue: Ian Lance Taylor Reviewed-by: Ian Lance Taylor Auto-Submit: Ian Lance Taylor Reviewed-by: Roland Shoemaker --- README.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 92f73cdfbf..f69c623ff8 100644 --- a/README.md +++ b/README.md @@ -2,20 +2,17 @@ [![Go Reference](https://pkg.go.dev/badge/golang.org/x/crypto.svg)](https://pkg.go.dev/golang.org/x/crypto) -This repository holds supplementary Go cryptography libraries. - -## Download/Install - -The easiest way to install is to run `go get -u golang.org/x/crypto/...`. You -can also manually git clone the repository to `$GOPATH/src/golang.org/x/crypto`. +This repository holds supplementary Go cryptography packages. ## Report Issues / Send Patches This repository uses Gerrit for code changes. To learn how to submit changes to -this repository, see https://golang.org/doc/contribute.html. +this repository, see https://go.dev/doc/contribute. + +The git repository is https://go.googlesource.com/crypto. The main issue tracker for the crypto repository is located at -https://github.com/golang/go/issues. Prefix your issue with "x/crypto:" in the +https://go.dev/issues. Prefix your issue with "x/crypto:" in the subject line, so it is easy to find. Note that contributions to the cryptography package receive additional scrutiny From 6018723c74059e3b91c84268b212c2f6cdab1f64 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Thu, 7 Nov 2024 22:09:40 +0000 Subject: [PATCH 53/57] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: Ib4976eb0b062bcd71c208afc9ff53e8c3068fbf9 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/626377 Reviewed-by: David Chase Auto-Submit: Gopher Robot Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index ecf61b3e18..d4bb2e13f3 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.26.0 - golang.org/x/term v0.25.0 + golang.org/x/sys v0.27.0 + golang.org/x/term v0.26.0 ) -require golang.org/x/text v0.19.0 // indirect +require golang.org/x/text v0.20.0 // indirect diff --git a/go.sum b/go.sum index 5b12ab8eee..b626a3d76d 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= -golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= -golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= -golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= -golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= +golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= +golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= +golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= From 8c4e668694ccbaa1be4785da7e7a40f2ef93152b Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Mon, 25 Nov 2024 16:01:21 +0000 Subject: [PATCH 54/57] x509roots/fallback: update bundle This is an automated CL which updates the NSS root bundle. Change-Id: Ic5267bf9d66b676e1cfc5fc2ae153afb8f33b29c Reviewed-on: https://go-review.googlesource.com/c/crypto/+/631635 Auto-Submit: Gopher Robot Reviewed-by: Dmitri Shuralyov Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI --- x509roots/fallback/bundle.go | 290 ----------------------------------- 1 file changed, 290 deletions(-) diff --git a/x509roots/fallback/bundle.go b/x509roots/fallback/bundle.go index 80ce10fe3c..e025bd5f54 100644 --- a/x509roots/fallback/bundle.go +++ b/x509roots/fallback/bundle.go @@ -161,98 +161,6 @@ ZCzJJ7VLkn5l/9Mt4blOvH+kQSGQQXemOR/qnuOf0GZvBeyqdn6/axag67XH/JJU LysRJyU3eExRarDzzFhdFPFqSBX/wge2sY0PjlxQRrM9vwGYT7JZVEc+NHt4bVaT LnPqZih4zR0Uv6CPLy64Lo7yFIrM6bV8+2ydDKXhlg== -----END CERTIFICATE----- -# CN=AffirmTrust Commercial,O=AffirmTrust,C=US -# 0376ab1d54c5f9803ce4b2e201a0ee7eef7b57b636e8a93c9b8d4860c96f5fa7 ------BEGIN CERTIFICATE----- -MIIDTDCCAjSgAwIBAgIId3cGJyapsXwwDQYJKoZIhvcNAQELBQAwRDELMAkGA1UE -BhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MR8wHQYDVQQDDBZBZmZpcm1UcnVz -dCBDb21tZXJjaWFsMB4XDTEwMDEyOTE0MDYwNloXDTMwMTIzMTE0MDYwNlowRDEL -MAkGA1UEBhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MR8wHQYDVQQDDBZBZmZp -cm1UcnVzdCBDb21tZXJjaWFsMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC -AQEA9htPZwcroRX1BiLLHwGy43NFBkRJLLtJJRTWzsO3qyxPxkEylFf6EqdbDuKP -Hx6GGaeqtS25Xw2Kwq+FNXkyLbscYjfysVtKPcrNcV/pQr6U6Mje+SJIZMblq8Yr -ba0F8PrVC8+a5fBQpIs7R6UjW3p6+DM/uO+Zl+MgwdYoic+U+7lF7eNAFxHUdPAL -MeIrJmqbTFeurCA+ukV6BfO9m2kVrn1OIGPENXY6BwLJN/3HR+7o8XYdcxXyl6S1 -yHp52UKqK39c/s4mT6NmgTWvRLpUHhwwMmWd5jyTXlBOeuM61G7MGvv50jeuJCqr -VwMiKA1JdX+3KNp1v47j3A55MQIDAQABo0IwQDAdBgNVHQ4EFgQUnZPGU4teyq8/ -nx4P5ZmVvCT2lI8wDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwDQYJ -KoZIhvcNAQELBQADggEBAFis9AQOzcAN/wr91LoWXym9e2iZWEnStB03TX8nfUYG -XUPGhi4+c7ImfU+TqbbEKpqrIZcUsd6M06uJFdhrJNTxFq7YpFzUf1GO7RgBsZNj -vbz4YYCanrHOQnDiqX0GJX0nof5v7LMeJNrjS1UaADs1tDvZ110w/YETifLCBivt -Z8SOyUOyXGsViQK8YvxO8rUzqrJv0wqiUOP2O+guRMLbZjipM1ZI8W0bM40NjD9g -N53Tym1+NH4Nn3J2ixufcv1SNUFFApYvHLKac0khsUlHRUe072o0EclNmsxZt9YC -nlpOZbWUrhvfKbAW8b8Angc6F2S1BLUjIZkKlTuXfO8= ------END CERTIFICATE----- -# CN=AffirmTrust Networking,O=AffirmTrust,C=US -# 0a81ec5a929777f145904af38d5d509f66b5e2c58fcdb531058b0e17f3f0b41b ------BEGIN CERTIFICATE----- -MIIDTDCCAjSgAwIBAgIIfE8EORzUmS0wDQYJKoZIhvcNAQEFBQAwRDELMAkGA1UE -BhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MR8wHQYDVQQDDBZBZmZpcm1UcnVz -dCBOZXR3b3JraW5nMB4XDTEwMDEyOTE0MDgyNFoXDTMwMTIzMTE0MDgyNFowRDEL -MAkGA1UEBhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MR8wHQYDVQQDDBZBZmZp -cm1UcnVzdCBOZXR3b3JraW5nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC -AQEAtITMMxcua5Rsa2FSoOujz3mUTOWUgJnLVWREZY9nZOIG41w3SfYvm4SEHi3y -YJ0wTsyEheIszx6e/jarM3c1RNg1lho9Nuh6DtjVR6FqaYvZ/Ls6rnla1fTWcbua -kCNrmreIdIcMHl+5ni36q1Mr3Lt2PpNMCAiMHqIjHNRqrSK6mQEubWXLviRmVSRL -QESxG9fhwoXA3hA/Pe24/PHxI1Pcv2WXb9n5QHGNfb2V1M6+oF4nI979ptAmDgAp -6zxG8D1gvz9Q0twmQVGeFDdCBKNwV6gbh+0t+nvujArjqWaJGctB+d1ENmHP4ndG -yH329JKBNv3bNPFyfvMMFr20FQIDAQABo0IwQDAdBgNVHQ4EFgQUBx/S55zawm6i -QLSwelAQUHTEyL0wDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwDQYJ -KoZIhvcNAQEFBQADggEBAIlXshZ6qML91tmbmzTCnLQyFE2npN/svqe++EPbkTfO -tDIuUFUaNU52Q3Eg75N3ThVwLofDwR1t3Mu1J9QsVtFSUzpE0nPIxBsFZVpikpzu -QY0x2+c06lkh1QF612S4ZDnNye2v7UsDSKegmQGA3GWjNq5lWUhPgkvIZfFXHeVZ -Lgo/bNjR9eUJtGxUAArgFU2HdW23WJZa3W3SAKD0m0i+wzekujbgfIeFlxoVot4u -olu9rxj5kFDNcFn4J2dHy8egBzp90SxdbBk6ZrV9/ZFvgrG+CJPbFEfxojfHRZ48 -x3evZKiT3/Zpg4Jg8klCNO1aAFSFHBY2kgxc+qatv9s= ------END CERTIFICATE----- -# CN=AffirmTrust Premium ECC,O=AffirmTrust,C=US -# bd71fdf6da97e4cf62d1647add2581b07d79adf8397eb4ecba9c5e8488821423 ------BEGIN CERTIFICATE----- -MIIB/jCCAYWgAwIBAgIIdJclisc/elQwCgYIKoZIzj0EAwMwRTELMAkGA1UEBhMC -VVMxFDASBgNVBAoMC0FmZmlybVRydXN0MSAwHgYDVQQDDBdBZmZpcm1UcnVzdCBQ -cmVtaXVtIEVDQzAeFw0xMDAxMjkxNDIwMjRaFw00MDEyMzExNDIwMjRaMEUxCzAJ -BgNVBAYTAlVTMRQwEgYDVQQKDAtBZmZpcm1UcnVzdDEgMB4GA1UEAwwXQWZmaXJt -VHJ1c3QgUHJlbWl1bSBFQ0MwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAAQNMF4bFZ0D -0KF5Nbc6PJJ6yhUczWLznCZcBz3lVPqj1swS6vQUX+iOGasvLkjmrBhDeKzQN8O9 -ss0s5kfiGuZjuD0uL3jET9v0D6RoTFVya5UdThhClXjMNzyR4ptlKymjQjBAMB0G -A1UdDgQWBBSaryl6wBE1NSZRMADDav5A1a7WPDAPBgNVHRMBAf8EBTADAQH/MA4G -A1UdDwEB/wQEAwIBBjAKBggqhkjOPQQDAwNnADBkAjAXCfOHiFBar8jAQr9HX/Vs -aobgxCd05DhT1wV/GzTjxi+zygk8N53X57hG8f2h4nECMEJZh0PUUd+60wkyWs6I -flc9nF9Ca/UHLbXwgpP5WW+uZPpY5Yse42O+tYHNbwKMeQ== ------END CERTIFICATE----- -# CN=AffirmTrust Premium,O=AffirmTrust,C=US -# 70a73f7f376b60074248904534b11482d5bf0e698ecc498df52577ebf2e93b9a ------BEGIN CERTIFICATE----- -MIIFRjCCAy6gAwIBAgIIbYwURrGmCu4wDQYJKoZIhvcNAQEMBQAwQTELMAkGA1UE -BhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MRwwGgYDVQQDDBNBZmZpcm1UcnVz -dCBQcmVtaXVtMB4XDTEwMDEyOTE0MTAzNloXDTQwMTIzMTE0MTAzNlowQTELMAkG -A1UEBhMCVVMxFDASBgNVBAoMC0FmZmlybVRydXN0MRwwGgYDVQQDDBNBZmZpcm1U -cnVzdCBQcmVtaXVtMIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIICCgKCAgEAxBLf -qV/+Qd3d9Z+K4/as4Tx4mrzY8H96oDMq3I0gW64tb+eT2TZwamjPjlGjhVtnBKAQ -JG9dKILBl1fYSCkTtuG+kU3fhQxTGJoeJKJPj/CihQvL9Cl/0qRY7iZNyaqoe5rZ -+jjeRFcV5fiMyNlI4g0WJx0eyIOFJbe6qlVBzAMiSy2RjYvmia9mx+n/K+k8rNrS -s8PhaJyJ+HoAVt70VZVs+7pk3WKL3wt3MutizCaam7uqYoNMtAZ6MMgpv+0GTZe5 -HMQxK9VfvFMSF5yZVylmd2EhMQcuJUmdGPLu8ytxjLW6OQdJd/zvLpKQBY0tL3d7 -70O/Nbua2Plzpyzy0FfuKE4mX4+QaAkvuPjcBukumj5Rp9EixAqnOEhss/n/fauG -V+O61oV4d7pD6kh/9ti+I20ev9E2bFhc8e6kGVQa9QPSdubhjL08s9NIS+LI+H+S -qHZGnEJlPqQewQcDWkYtuJfzt9WyVSHvutxMAJf7FJUnM7/oQ0dG0giZFmA7mn7S -5u046uwBHjxIVkkJx0w3AJ6IDsBz4W9m6XJHMD4Q5QsDyZpCAGzFlH5hxIrff4Ia -C1nEWTJ3s7xgaVY5/bQGeyzWZDbZvUjthB9+pSKPKrhC9IK31FOQeE4tGv2Bb0TX -OwF0lkLgAOIua+rF7nKsu7/+6qqo+Nz2snmKtmcCAwEAAaNCMEAwHQYDVR0OBBYE -FJ3AZ6YMItkm9UWrpmVSESfYRaxjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/ -BAQDAgEGMA0GCSqGSIb3DQEBDAUAA4ICAQCzV00QYk465KzquByvMiPIs0laUZx2 -KI15qldGF9X1Uva3ROgIRL8YhNILgM3FEv0AVQVhh0HctSSePMTYyPtwni94loMg -Nt58D2kTiKV1NpgIpsbfrM7jWNa3Pt668+s0QNiigfV4Py/VpfzZotReBA4Xrf5B -8OWycvpEgjNC6C1Y91aMYj+6QrCcDFx+LmUmXFNPALJ4fqENmS2NuB2OosSw/WDQ -MKSOyARiqcTtNd56l+0OOF6SL5Nwpamcb6d9Ex1+xghIsV5n61EIJenmJWtSKZGc -0jlzCFfemQa0W50QBuHCAKi4HEoCChTQwUHK+4w1IX2COPKpVJEZNZOUbWo6xbLQ -u4mGk+ibyQ86p3q4ofB4Rvr8Ny/lioTz3/4E2aFooC8k4gmVBtWVyuEklut89pMF -u+1z6S3RdTnX5yTb2E5fQ4+e0BQ5v1VwSJlXMbSc7kqYA5YwH2AG7hsj/oFgIxpH -YoWlzBk0gG+zrBrjn/B7SK3VAdlntqlyk+otZrWyuOQ9PLLvTIzq6we/qzWaVYa8 -GKa1qF60g2xraUDTn9zxw2lrueFtCfTxqlB2Cnp9ehehVZZCmTEJ3WARjQUwfuaO -RtGdFNrHF+QFlozEJLUbzxQHskD4o55BhrwE0GuWyCqANP2/7waj3VjFhT0+j/6e -KeC2uAloGRwYQw== ------END CERTIFICATE----- # CN=Amazon Root CA 1,O=Amazon,C=US # 8ecde6884f3d87b1125ba31ac3fcb13d7016de7f57cc904fe1cb97c6ae98196e -----BEGIN CERTIFICATE----- @@ -1385,147 +1293,6 @@ r/OSmbaz5mEP0oUA51Aa5BuVnRmhuZyxm7EAHu/QD09CbMkKvO5D+jpxpchNJqU1 /YldvIViHTLSoCtU7ZpXwdv6EM8Zt4tKG48BtieVU+i2iW1bvGjUI+iLUaJW+fCm gKDWHrO8Dw9TdSmq6hN35N6MgSGtBxBHEa2HPQfRdbzP82Z+ -----END CERTIFICATE----- -# CN=Entrust Root Certification Authority - EC1,OU=See www.entrust.net/legal-terms+OU=(c) 2012 Entrust\, Inc. - for authorized use only,O=Entrust\, Inc.,C=US -# 02ed0eb28c14da45165c566791700d6451d7fb56f0b2ab1d3b8eb070e56edff5 ------BEGIN CERTIFICATE----- -MIIC+TCCAoCgAwIBAgINAKaLeSkAAAAAUNCR+TAKBggqhkjOPQQDAzCBvzELMAkG -A1UEBhMCVVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3 -d3cuZW50cnVzdC5uZXQvbGVnYWwtdGVybXMxOTA3BgNVBAsTMChjKSAyMDEyIEVu -dHJ1c3QsIEluYy4gLSBmb3IgYXV0aG9yaXplZCB1c2Ugb25seTEzMDEGA1UEAxMq -RW50cnVzdCBSb290IENlcnRpZmljYXRpb24gQXV0aG9yaXR5IC0gRUMxMB4XDTEy -MTIxODE1MjUzNloXDTM3MTIxODE1NTUzNlowgb8xCzAJBgNVBAYTAlVTMRYwFAYD -VQQKEw1FbnRydXN0LCBJbmMuMSgwJgYDVQQLEx9TZWUgd3d3LmVudHJ1c3QubmV0 -L2xlZ2FsLXRlcm1zMTkwNwYDVQQLEzAoYykgMjAxMiBFbnRydXN0LCBJbmMuIC0g -Zm9yIGF1dGhvcml6ZWQgdXNlIG9ubHkxMzAxBgNVBAMTKkVudHJ1c3QgUm9vdCBD -ZXJ0aWZpY2F0aW9uIEF1dGhvcml0eSAtIEVDMTB2MBAGByqGSM49AgEGBSuBBAAi -A2IABIQTydC6bUF74mzQ61VfZgIaJPRbiWlH47jCffHyAsWfoPZb1YsGGYZPUxBt -ByQnoaD41UcZYUx9ypMn6nQM72+WCf5j7HBdNq1nd67JnXxVRDqiY1Ef9eNi1KlH -Bz7MIKNCMEAwDgYDVR0PAQH/BAQDAgEGMA8GA1UdEwEB/wQFMAMBAf8wHQYDVR0O -BBYEFLdj5xrdjekIplWDpOBqUEFlEUJJMAoGCCqGSM49BAMDA2cAMGQCMGF52OVC -R98crlOZF7ZvHH3hvxGU0QOIdeSNiaSKd0bebWHvAvX7td/M/k7//qnmpwIwW5nX -hTcGtXsI/esni0qU+eH6p44mCOh8kmhtc9hvJqwhAriZtyZBWyVgrtBIGu4G ------END CERTIFICATE----- -# CN=Entrust Root Certification Authority - G2,OU=See www.entrust.net/legal-terms+OU=(c) 2009 Entrust\, Inc. - for authorized use only,O=Entrust\, Inc.,C=US -# 43df5774b03e7fef5fe40d931a7bedf1bb2e6b42738c4e6d3841103d3aa7f339 ------BEGIN CERTIFICATE----- -MIIEPjCCAyagAwIBAgIESlOMKDANBgkqhkiG9w0BAQsFADCBvjELMAkGA1UEBhMC -VVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50 -cnVzdC5uZXQvbGVnYWwtdGVybXMxOTA3BgNVBAsTMChjKSAyMDA5IEVudHJ1c3Qs -IEluYy4gLSBmb3IgYXV0aG9yaXplZCB1c2Ugb25seTEyMDAGA1UEAxMpRW50cnVz -dCBSb290IENlcnRpZmljYXRpb24gQXV0aG9yaXR5IC0gRzIwHhcNMDkwNzA3MTcy -NTU0WhcNMzAxMjA3MTc1NTU0WjCBvjELMAkGA1UEBhMCVVMxFjAUBgNVBAoTDUVu -dHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50cnVzdC5uZXQvbGVnYWwt -dGVybXMxOTA3BgNVBAsTMChjKSAyMDA5IEVudHJ1c3QsIEluYy4gLSBmb3IgYXV0 -aG9yaXplZCB1c2Ugb25seTEyMDAGA1UEAxMpRW50cnVzdCBSb290IENlcnRpZmlj -YXRpb24gQXV0aG9yaXR5IC0gRzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK -AoIBAQC6hLZy254Ma+KZ6TABp3bqMriVQRrJ2mFOWHLP/vaCeb9zYQYKpSfYs1/T -RU4cctZOMvJyig/3gxnQaoCAAEUesMfnmr8SVycco2gvCoe9amsOXmXzHHfV1IWN -cCG0szLni6LVhjkCsbjSR87kyUnEO6fe+1R9V77w6G7CebI6C1XiUJgWMhNcL3hW -wcKUs/Ja5CeanyTXxuzQmyWC48zCxEXFjJd6BmsqEZ+pCm5IO2/b1BEZQvePB7/1 -U1+cPvQXLOZprE4yTGJ36rfo5bs0vBmLrpxR57d+tVOxMyLlbc9wPBr64ptntoP0 -jaWvYkxN4FisZDQSA/i2jZRjJKRxAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAP -BgNVHRMBAf8EBTADAQH/MB0GA1UdDgQWBBRqciZ60B7vfec7aVHUbI2fkBJmqzAN -BgkqhkiG9w0BAQsFAAOCAQEAeZ8dlsa2eT8ijYfThwMEYGprmi5ZiXMRrEPR9RP/ -jTkrwPK9T3CMqS/qF8QLVJ7UG5aYMzyorWKiAHarWWluBh1+xLlEjZivEtRh2woZ -Rkfz6/djwUAFQKXSt/S1mja/qYh2iARVBCuch38aNzx+LaUa2NSJXsq9rD1s2G2v -1fN2D807iDginWyTmsQ9v4IbZT+mD12q/OWyFcq1rca8PdCE6OoGcrBNOTJ4vz4R -nAuknZoh8/CbCzB428Hch0P+vGOaysXCHMnHjf87ElgI5rY97HosTvuDls4MPGmH -VHOkc8KT/1EQrBVUAdj8BbGJoX90g5pJ19xOe4pIb4tF9g== ------END CERTIFICATE----- -# CN=Entrust Root Certification Authority - G4,OU=See www.entrust.net/legal-terms+OU=(c) 2015 Entrust\, Inc. - for authorized use only,O=Entrust\, Inc.,C=US -# db3517d1f6732a2d5ab97c533ec70779ee3270a62fb4ac4238372460e6f01e88 ------BEGIN CERTIFICATE----- -MIIGSzCCBDOgAwIBAgIRANm1Q3+vqTkPAAAAAFVlrVgwDQYJKoZIhvcNAQELBQAw -gb4xCzAJBgNVBAYTAlVTMRYwFAYDVQQKEw1FbnRydXN0LCBJbmMuMSgwJgYDVQQL -Ex9TZWUgd3d3LmVudHJ1c3QubmV0L2xlZ2FsLXRlcm1zMTkwNwYDVQQLEzAoYykg -MjAxNSBFbnRydXN0LCBJbmMuIC0gZm9yIGF1dGhvcml6ZWQgdXNlIG9ubHkxMjAw -BgNVBAMTKUVudHJ1c3QgUm9vdCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eSAtIEc0 -MB4XDTE1MDUyNzExMTExNloXDTM3MTIyNzExNDExNlowgb4xCzAJBgNVBAYTAlVT -MRYwFAYDVQQKEw1FbnRydXN0LCBJbmMuMSgwJgYDVQQLEx9TZWUgd3d3LmVudHJ1 -c3QubmV0L2xlZ2FsLXRlcm1zMTkwNwYDVQQLEzAoYykgMjAxNSBFbnRydXN0LCBJ -bmMuIC0gZm9yIGF1dGhvcml6ZWQgdXNlIG9ubHkxMjAwBgNVBAMTKUVudHJ1c3Qg -Um9vdCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eSAtIEc0MIICIjANBgkqhkiG9w0B -AQEFAAOCAg8AMIICCgKCAgEAsewsQu7i0TD/pZJH4i3DumSXbcr3DbVZwbPLqGgZ -2K+EbTBwXX7zLtJTmeH+H17ZSK9dE43b/2MzTdMAArzE+NEGCJR5WIoV3imz/f3E -T+iq4qA7ec2/a0My3dl0ELn39GjUu9CH1apLiipvKgS1sqbHoHrmSKvS0VnM1n4j -5pds8ELl3FFLFUHtSUrJ3hCX1nbB76W1NhSXNdh4IjVS70O92yfbYVaCNNzLiGAM -C1rlLAHGVK/XqsEQe9IFWrhAnoanw5CGAlZSCXqc0ieCU0plUmr1POeo8pyvi73T -DtTUXm6Hnmo9RR3RXRv06QqsYJn7ibT/mCzPfB3pAqoEmh643IhuJbNsZvc8kPNX -wbMv9W3y+8qh+CmdRouzavbmZwe+LGcKKh9asj5XxNMhIWNlUpEbsZmOeX7m640A -2Vqq6nPopIICR5b+W45UYaPrL0swsIsjdXJ8ITzI9vF01Bx7owVV7rtNOzK+mndm -nqxpkCIHH2E6lr7lmk/MBTwoWdPBDFSoWWG9yHJM6Nyfh3+9nEg2XpWjDrk4JFX8 -dWbrAuMINClKxuMrLzOg2qOGpRKX/YAr2hRC45K9PvJdXmd0LhyIRyk0X+IyqJwl -N4y6mACXi0mWHv0liqzc2thddG5msP9E36EYxr5ILzeUePiVSj9/E15dWf10hkNj -c0kCAwEAAaNCMEAwDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwHQYD -VR0OBBYEFJ84xFYjwznooHFs6FRM5Og6sb9nMA0GCSqGSIb3DQEBCwUAA4ICAQAS -5UKme4sPDORGpbZgQIeMJX6tuGguW8ZAdjwD+MlZ9POrYs4QjbRaZIxowLByQzTS -Gwv2LFPSypBLhmb8qoMi9IsabyZIrHZ3CL/FmFz0Jomee8O5ZDIBf9PD3Vht7LGr -hFV0d4QEJ1JrhkzO3bll/9bGXp+aEJlLdWr+aumXIOTkdnrG0CSqkM0gkLpHZPt/ -B7NTeLUKYvJzQ85BK4FqLoUWlFPUa19yIqtRLULVAJyZv967lDtX/Zr1hstWO1uI -AeV8KEsD+UmDfLJ/fOPtjqF/YFOOVZ1QNBIPt5d7bIdKROf1beyAN/BYGW5KaHbw -H5Lk6rWS02FREAutp9lfx1/cH6NcjKF+m7ee01ZvZl4HliDtC3T7Zk6LERXpgUl+ -b7DUUH8i119lAg2m9IUe2K4GS0qn0jFmwvjO5QimpAKWRGhXxNUzzxkvFMSUHHuk -2fCfDrGA4tGeEWSpiBE6doLlYsKA2KSD7ZPvfC+QsDJMlhVoSFLUmQjAJOgc47Ol -IQ6SwJAfzyBfyjs4x7dtOvPmRLgOMWuIjnDrnBdSqEGULoe256YSxXXfW8AKbnuk -5F6G+TaU33fD6Q3AOfF5u0aOq0NZJ7cguyPpVkAh7DE9ZapD8j3fcEThuk0mEDuY -n/PIjhs4ViFqUZPTkcpG2om3PVODLAgfi49T3f+sHw== ------END CERTIFICATE----- -# CN=Entrust Root Certification Authority,OU=www.entrust.net/CPS is incorporated by reference+OU=(c) 2006 Entrust\, Inc.,O=Entrust\, Inc.,C=US -# 73c176434f1bc6d5adf45b0e76e727287c8de57616c1e6e6141a2b2cbc7d8e4c ------BEGIN CERTIFICATE----- -MIIEkTCCA3mgAwIBAgIERWtQVDANBgkqhkiG9w0BAQUFADCBsDELMAkGA1UEBhMC -VVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xOTA3BgNVBAsTMHd3dy5lbnRydXN0 -Lm5ldC9DUFMgaXMgaW5jb3Jwb3JhdGVkIGJ5IHJlZmVyZW5jZTEfMB0GA1UECxMW -KGMpIDIwMDYgRW50cnVzdCwgSW5jLjEtMCsGA1UEAxMkRW50cnVzdCBSb290IENl -cnRpZmljYXRpb24gQXV0aG9yaXR5MB4XDTA2MTEyNzIwMjM0MloXDTI2MTEyNzIw -NTM0MlowgbAxCzAJBgNVBAYTAlVTMRYwFAYDVQQKEw1FbnRydXN0LCBJbmMuMTkw -NwYDVQQLEzB3d3cuZW50cnVzdC5uZXQvQ1BTIGlzIGluY29ycG9yYXRlZCBieSBy -ZWZlcmVuY2UxHzAdBgNVBAsTFihjKSAyMDA2IEVudHJ1c3QsIEluYy4xLTArBgNV -BAMTJEVudHJ1c3QgUm9vdCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTCCASIwDQYJ -KoZIhvcNAQEBBQADggEPADCCAQoCggEBALaVtkNC+sZtKm9I35RMOVcF7sN5EUFo -Nu3s/poBj6E4KPz3EEZmLk0eGrEaTsbRwJWIsMn/MYszA9u3g3s+IIRe7bJWKKf4 -4LlAcTfFy0cOlypowCKVYhXbR9n10Cv/gkvJrT7eTNuQgFA/CYqEAOwwCj0Yzfv9 -KlmaI5UXLEWeH25DeW0MXJj+SKfFI0dcXv1u5x609mhF0YaDW6KKjbHjKYD+JXGI -rb68j6xSlkuqUY3kEzEZ6E5Nn9uss2rVvDlUccp6en+Q3X0dgNmBu1kmwhH+5pPi -94DkZfs0Nw4pgHBNrziGLp5/V6+eF67rHMsoIV+2HNjnogQi+dPa2MsCAwEAAaOB -sDCBrTAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUwAwEB/zArBgNVHRAEJDAi -gA8yMDA2MTEyNzIwMjM0MlqBDzIwMjYxMTI3MjA1MzQyWjAfBgNVHSMEGDAWgBRo -kORnpKZTgMeGZqTx90tD+4S9bTAdBgNVHQ4EFgQUaJDkZ6SmU4DHhmak8fdLQ/uE -vW0wHQYJKoZIhvZ9B0EABBAwDhsIVjcuMTo0LjADAgSQMA0GCSqGSIb3DQEBBQUA -A4IBAQCT1DCw1wMgKtD5Y+iRDAUgqV8ZyntyTtSx29CW+1RaGSwMCPeyvIWonX9t -O1KzKtvn1ISMY/YPyyYBkVBs9F8U4pN0wBOeMDpQ47RgxRzwIkSNcUesyBrJ6Zua -AGAT/3B+XxFNSRuzFVJ7yVTav52Vr2ua2J7p8eRDjeIRRDq/r72DQnNSi6q7pynP -9WQcCk3RvKqsnyrQ/39/2n3qse0wJcGE2jTSW3iDVuycNsMm4hH2Z0kdkquM++v/ -eu6FSqdQgPCnXEqULl8FmTxSQeDNtGPPAUO6nIPcj2A781q0tHuu2guQOHXvgR1m -0vdXcDazv/wor3ElhVsT/h5/WrQ8 ------END CERTIFICATE----- -# CN=Entrust.net Certification Authority (2048),OU=www.entrust.net/CPS_2048 incorp. by ref. (limits liab.)+OU=(c) 1999 Entrust.net Limited,O=Entrust.net -# 6dc47172e01cbcb0bf62580d895fe2b8ac9ad4f873801e0c10b9c837d21eb177 ------BEGIN CERTIFICATE----- -MIIEKjCCAxKgAwIBAgIEOGPe+DANBgkqhkiG9w0BAQUFADCBtDEUMBIGA1UEChML -RW50cnVzdC5uZXQxQDA+BgNVBAsUN3d3dy5lbnRydXN0Lm5ldC9DUFNfMjA0OCBp -bmNvcnAuIGJ5IHJlZi4gKGxpbWl0cyBsaWFiLikxJTAjBgNVBAsTHChjKSAxOTk5 -IEVudHJ1c3QubmV0IExpbWl0ZWQxMzAxBgNVBAMTKkVudHJ1c3QubmV0IENlcnRp -ZmljYXRpb24gQXV0aG9yaXR5ICgyMDQ4KTAeFw05OTEyMjQxNzUwNTFaFw0yOTA3 -MjQxNDE1MTJaMIG0MRQwEgYDVQQKEwtFbnRydXN0Lm5ldDFAMD4GA1UECxQ3d3d3 -LmVudHJ1c3QubmV0L0NQU18yMDQ4IGluY29ycC4gYnkgcmVmLiAobGltaXRzIGxp -YWIuKTElMCMGA1UECxMcKGMpIDE5OTkgRW50cnVzdC5uZXQgTGltaXRlZDEzMDEG -A1UEAxMqRW50cnVzdC5uZXQgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkgKDIwNDgp -MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArU1LqRKGsuqjIAcVFmQq -K0vRvwtKTY7tgHalZ7d4QMBzQshowNtTK91euHaYNZOLGp18EzoOH1u3Hs/lJBQe -sYGpjX24zGtLA/ECDNyrpUAkAH90lKGdCCmziAv1h3edVc3kw37XamSrhRSGlVuX -MlBvPci6Zgzj/L24ScF2iUkZ/cCovYmjZy/Gn7xxGWC4LeksyZB2ZnuU4q941mVT -XTzWnLLPKQP5L6RQstRIzgUyVYr9smRMDuSYB3Xbf9+5CFVghTAp+XtIpGmG4zU/ -HoZdenoVve8AjhUiVBcAkCaTvA5JaJG/+EfTnZVCwQ5N328mz8MYIWJmQ3DW1cAH -4QIDAQABo0IwQDAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAdBgNV -HQ4EFgQUVeSB0RGAvtiJuQijMfmhJAkWuXAwDQYJKoZIhvcNAQEFBQADggEBADub -j1abMOdTmXx6eadNl9cZlZD7Bh/KM3xGY4+WZiT6QBshJ8rmcnPyT/4xmf3IDExo -U8aAghOY+rat2l098c5u9hURlIIM7j+VrxGrD9cv3h8Dj1csHsm7mhpElesYT6Yf -zX1XEC+bBAlahLVu2B064dae0Wx5XnkcFMXj0EyTO2U87d89vqbllRrDtRnDvV5b -u/8j72gZyxKTJ1wDLW8w0B62GqzeWvfRqqgnpv55gcR5mTNXuhKwqeBCbJPKVt7+ -bYQLCIt+jerXmCHG8+c8eS9enNFMFY3h7CI3zJpDC5fcgJCNs2ebb0gIFVbPv/Er -fF6adulZkMV8gzURZVE= ------END CERTIFICATE----- # CN=FIRMAPROFESIONAL CA ROOT-A WEB,O=Firmaprofesional SA,C=ES,2.5.4.97=#130f56415445532d413632363334303638 # bef256daf26e9c69bdec1602359798f3caf71821a03e018257c53c65617f3d4a -----BEGIN CERTIFICATE----- @@ -2867,29 +2634,6 @@ Af8wDgYDVR0PAQH/BAQDAgEGMB0GA1UdDgQWBBTrQciu/NWeUUj1vYv0hyCTQSvT 4P9mLQlO4E/0BdGF9jVg3PVys0Z9AjBEmEYagoUeYWmJSwdLZrWeqrqgHkHZAXQ6 bkU6iYAZezKYVWOr62Nuk22rGwlgMU4= -----END CERTIFICATE----- -# CN=SecureSign RootCA11,O=Japan Certification Services\, Inc.,C=JP -# bf0feefb9e3a581ad5f9e9db7589985743d261085c4d314f6f5d7259aa421612 ------BEGIN CERTIFICATE----- -MIIDbTCCAlWgAwIBAgIBATANBgkqhkiG9w0BAQUFADBYMQswCQYDVQQGEwJKUDEr -MCkGA1UEChMiSmFwYW4gQ2VydGlmaWNhdGlvbiBTZXJ2aWNlcywgSW5jLjEcMBoG -A1UEAxMTU2VjdXJlU2lnbiBSb290Q0ExMTAeFw0wOTA0MDgwNDU2NDdaFw0yOTA0 -MDgwNDU2NDdaMFgxCzAJBgNVBAYTAkpQMSswKQYDVQQKEyJKYXBhbiBDZXJ0aWZp -Y2F0aW9uIFNlcnZpY2VzLCBJbmMuMRwwGgYDVQQDExNTZWN1cmVTaWduIFJvb3RD -QTExMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA/XeqpRyQBTvLTJsz -i1oURaTnkBbR31fSIRCkF/3frNYfp+TbfPfs37gD2pRY/V1yfIw/XwFndBWW4wI8 -h9uuywGOwvNmxoVF9ALGOrVisq/6nL+k5tSAMJjzDbaTj6nU2DbysPyKyiyhFTOV -MdrAG/LuYpmGYz+/3ZMqg6h2uRMft85OQoWPIucuGvKVCbIFtUROd6EgvanyTgp9 -UK31BQ1FT0Zx/Sg+U/sE2C3XZR1KG/rPO7AxmjVuyIsG0wCR8pQIZUyxNAYAeoni -8McDWc/V1uinMrPmmECGxc0nEovMe863ETxiYAcjPitAbpSACW22s293bzUIUPsC -h8U+iQIDAQABo0IwQDAdBgNVHQ4EFgQUW/hNT7KlhtQ60vFjmqC+CfZXt94wDgYD -VR0PAQH/BAQDAgEGMA8GA1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEFBQADggEB -AKChOBZmLqdWHyGcBvod7bkixTgm2E5P7KN/ed5GIaGHd48HCJqypMWvDzKYC3xm -KbabfSVSSUOrTC4rbnpwrxYO4wJs+0LmGJ1F2FXI6Dvd5+H0LgscNFxsWEr7jIhQ -X5Ucv+2rIrVls4W6ng+4reV6G4pQOh29Dbx7VFALuUKvVaAYga1lme++5Jy/xIWr -QbJUb9wlze144o4MjQlJ3WN7WmmWAiGovVJZ6X01y8hSyn+B/tlr0/cR7SXf+Of5 -pPpyl4RTDaXQMhhRdlkUbA/r7F+AjHVDg8OFmP9Mni0N5HeDk061lgeLKBObjBmN -QSdJQO7e5iNEOdyhIta6A/I= ------END CERTIFICATE----- # CN=SecureTrust CA,O=SecureTrust Corporation,C=US # f1c1b50ae5a20dd8030ec9f6bc24823dd367b5255759b4e71b61fce9f7375d73 -----BEGIN CERTIFICATE----- @@ -2930,40 +2674,6 @@ BAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAKBggqhkjOPQQDAwNoADBlAjAVXUI9/Lbu 9zuxNuie9sRGKEkz0FhDKmMpzE2xtHqiuQ04pV1IKv3LsnNdo4gIxwwCMQDAqy0O be0YottT6SXbVQjgUMzfRGEWgqtJsLKB7HOHeLRMsmIbEvoWTSVLY70eN9k= -----END CERTIFICATE----- -# CN=Security Communication RootCA3,O=SECOM Trust Systems CO.\,LTD.,C=JP -# 24a55c2ab051442d0617766541239a4ad032d7c55175aa34ffde2fbc4f5c5294 ------BEGIN CERTIFICATE----- -MIIFfzCCA2egAwIBAgIJAOF8N0D9G/5nMA0GCSqGSIb3DQEBDAUAMF0xCzAJBgNV -BAYTAkpQMSUwIwYDVQQKExxTRUNPTSBUcnVzdCBTeXN0ZW1zIENPLixMVEQuMScw -JQYDVQQDEx5TZWN1cml0eSBDb21tdW5pY2F0aW9uIFJvb3RDQTMwHhcNMTYwNjE2 -MDYxNzE2WhcNMzgwMTE4MDYxNzE2WjBdMQswCQYDVQQGEwJKUDElMCMGA1UEChMc -U0VDT00gVHJ1c3QgU3lzdGVtcyBDTy4sTFRELjEnMCUGA1UEAxMeU2VjdXJpdHkg -Q29tbXVuaWNhdGlvbiBSb290Q0EzMIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIIC -CgKCAgEA48lySfcw3gl8qUCBWNO0Ot26YQ+TUG5pPDXC7ltzkBtnTCHsXzW7OT4r -CmDvu20rhvtxosis5FaU+cmvsXLUIKx00rgVrVH+hXShuRD+BYD5UpOzQD11EKzA -lrenfna84xtSGc4RHwsENPXY9Wk8d/Nk9A2qhd7gCVAEF5aEt8iKvE1y/By7z/MG -TfmfZPd+pmaGNXHIEYBMwXFAWB6+oHP2/D5Q4eAvJj1+XCO1eXDe+uDRpdYMQXF7 -9+qMHIjH7Iv10S9VlkZ8WjtYO/u62C21Jdp6Ts9EriGmnpjKIG58u4iFW/vAEGK7 -8vknR+/RiTlDxN/e4UG/VHMgly1s2vPUB6PmudhvrvyMGS7TZ2crldtYXLVqAvO4 -g160a75BflcJdURQVc1aEWEhCmHCqYj9E7wtiS/NYeCVvsq1e+F7NGcLH7YMx3we -GVPKp7FKFSBWFHA9K4IsD50VHUeAR/94mQ4xr28+j+2GaR57GIgUssL8gjMunEst -+3A7caoreyYn8xrC3PsXuKHqy6C0rtOUfnrQq8PsOC0RLoi/1D+tEjtCrI8Cbn3M -0V9hvqG8OmpI6iZVIhZdXw3/JzOfGAN0iltSIEdrRU0id4xVJ/CvHozJgyJUt5rQ -T9nO/NkuHJYosQLTA70lUhw0Zk8jq/R3gpYd0VcwCBEF/VfR2ccCAwEAAaNCMEAw -HQYDVR0OBBYEFGQUfPxYchamCik0FW8qy7z8r6irMA4GA1UdDwEB/wQEAwIBBjAP -BgNVHRMBAf8EBTADAQH/MA0GCSqGSIb3DQEBDAUAA4ICAQDcAiMI4u8hOscNtybS -YpOnpSNyByCCYN8Y11StaSWSntkUz5m5UoHPrmyKO1o5yGwBQ8IibQLwYs1OY0PA -FNr0Y/Dq9HHuTofjcan0yVflLl8cebsjqodEV+m9NU1Bu0soo5iyG9kLFwfl9+qd -9XbXv8S2gVj/yP9kaWJ5rW4OH3/uHWnlt3Jxs/6lATWUVCvAUm2PVcTJ0rjLyjQI -UYWg9by0F1jqClx6vWPGOi//lkkZhOpn2ASxYfQAW0q3nHE3GYV5v4GwxxMOdnE+ -OoAGrgYWp421wsTL/0ClXI2lyTrtcoHKXJg80jQDdwj98ClZXSEIx2C/pHF7uNke -gr4Jr2VvKKu/S7XuPghHJ6APbw+LP6yVGPO5DtxnVW5inkYO0QR4ynKudtml+LLf -iAlhi+8kTtFZP1rUPcmTPCtk9YENFpb3ksP+MW/oKjJ0DvRMmEoYDjBU1cXrvMUV -nuiZIesnKwkK2/HmcBhWuwzkvvnoEKQTkrgc4NtnHVMDpCKn3F2SEDzq//wbEBrD -2NCcnWXL0CsnMQMeNuE9dnUM/0Umud1RvCPHX9jYhxBAEg09ODfnRDwYwFMJZI// -1ZqmfHAuc1Uh6N//g7kdPjIe1qZ9LPFm6Vwdp6POXiUyK+OVrCoHzrQoeIY8Laad -TdJ0MN1kURXbg4NR16/9M51NZg== ------END CERTIFICATE----- # CN=Starfield Root Certificate Authority - G2,O=Starfield Technologies\, Inc.,L=Scottsdale,ST=Arizona,C=US # 2ce1cb0bf9d2f9e102993fbe215152c3b2dd0cabde1c68e5319b839154dbb7f5 -----BEGIN CERTIFICATE----- From 3e90321ac7bcee3d924ed63ed3ad97be2079cb56 Mon Sep 17 00:00:00 2001 From: Gopher Robot Date: Wed, 4 Dec 2024 16:27:45 +0000 Subject: [PATCH 55/57] go.mod: update golang.org/x dependencies Update golang.org/x dependencies to their latest tagged versions. Change-Id: I580d412fc4a135696d4054f8007593cfa4f64224 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/633480 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Auto-Submit: Gopher Robot --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index d4bb2e13f3..4a13082a28 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.20 require ( golang.org/x/net v0.21.0 // tagx:ignore - golang.org/x/sys v0.27.0 - golang.org/x/term v0.26.0 + golang.org/x/sys v0.28.0 + golang.org/x/term v0.27.0 ) -require golang.org/x/text v0.20.0 // indirect +require golang.org/x/text v0.21.0 // indirect diff --git a/go.sum b/go.sum index b626a3d76d..41808cb739 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= -golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= -golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= -golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= -golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= From 7042ebcbe097f305ba3a93f9a22b4befa4b83d29 Mon Sep 17 00:00:00 2001 From: Roland Shoemaker Date: Wed, 4 Dec 2024 10:46:07 -0800 Subject: [PATCH 56/57] openpgp/clearsign: just use rand.Reader in tests Instead of a convoluted fake rand, it is _basically_ just as fast, and fixes errors that pop up due to bad entropy. Fixes golang/go#70682 Change-Id: Ib0f605398d1092b516b03135f602c644be2a060f Reviewed-on: https://go-review.googlesource.com/c/crypto/+/633655 Reviewed-by: Tatiana Bradley LUCI-TryBot-Result: Go LUCI Auto-Submit: Roland Shoemaker Reviewed-by: Filippo Valsorda --- openpgp/clearsign/clearsign_test.go | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/openpgp/clearsign/clearsign_test.go b/openpgp/clearsign/clearsign_test.go index 051b8f16ff..821c35ff47 100644 --- a/openpgp/clearsign/clearsign_test.go +++ b/openpgp/clearsign/clearsign_test.go @@ -123,24 +123,12 @@ func TestSigning(t *testing.T) { } } -// We use this to make test keys, so that they aren't all the same. -type quickRand byte - -func (qr *quickRand) Read(p []byte) (int, error) { - for i := range p { - p[i] = byte(*qr) - } - *qr++ - return len(p), nil -} - func TestMultiSign(t *testing.T) { if testing.Short() { t.Skip("skipping long test in -short mode") } - zero := quickRand(0) - config := packet.Config{Rand: &zero} + var config packet.Config for nKeys := 0; nKeys < 4; nKeys++ { nextTest: From b4f1988a35dee11ec3e05d6bf3e90b695fbd8909 Mon Sep 17 00:00:00 2001 From: Roland Shoemaker Date: Tue, 3 Dec 2024 09:03:03 -0800 Subject: [PATCH 57/57] ssh: make the public key cache a 1-entry FIFO cache Users of the the ssh package seem to extremely commonly misuse the PublicKeyCallback API, assuming that the key passed in the last call before a connection is established is the key used for authentication. Some users then make authorization decisions based on this key. This property is not documented, and may not be correct, due to the caching behavior of the package, resulting in users making incorrect authorization decisions about the connection. This change makes the cache a one entry FIFO cache, making the assumed property, that the last call to PublicKeyCallback represents the key actually used for authentication, actually hold. Thanks to Damien Tournoud, Patrick Dawkins, Vince Parker, and Jules Duvivier from the Platform.sh / Upsun engineering team for reporting this issue. Fixes golang/go#70779 Fixes CVE-2024-45337 Change-Id: Ife7c7b4045d8b6bcd7e3a417bdfae370c709797f Reviewed-on: https://go-review.googlesource.com/c/crypto/+/635315 Reviewed-by: Roland Shoemaker Auto-Submit: Gopher Robot Reviewed-by: Damien Neil Reviewed-by: Nicola Murino LUCI-TryBot-Result: Go LUCI --- ssh/server.go | 15 ++++++++++---- ssh/server_test.go | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/ssh/server.go b/ssh/server.go index c0d1c29e6f..5b5ccd96f4 100644 --- a/ssh/server.go +++ b/ssh/server.go @@ -149,7 +149,7 @@ func (s *ServerConfig) AddHostKey(key Signer) { } // cachedPubKey contains the results of querying whether a public key is -// acceptable for a user. +// acceptable for a user. This is a FIFO cache. type cachedPubKey struct { user string pubKeyData []byte @@ -157,7 +157,13 @@ type cachedPubKey struct { perms *Permissions } -const maxCachedPubKeys = 16 +// maxCachedPubKeys is the number of cache entries we store. +// +// Due to consistent misuse of the PublicKeyCallback API, we have reduced this +// to 1, such that the only key in the cache is the most recently seen one. This +// forces the behavior that the last call to PublicKeyCallback will always be +// with the key that is used for authentication. +const maxCachedPubKeys = 1 // pubKeyCache caches tests for public keys. Since SSH clients // will query whether a public key is acceptable before attempting to @@ -179,9 +185,10 @@ func (c *pubKeyCache) get(user string, pubKeyData []byte) (cachedPubKey, bool) { // add adds the given tuple to the cache. func (c *pubKeyCache) add(candidate cachedPubKey) { - if len(c.keys) < maxCachedPubKeys { - c.keys = append(c.keys, candidate) + if len(c.keys) >= maxCachedPubKeys { + c.keys = c.keys[1:] } + c.keys = append(c.keys, candidate) } // ServerConn is an authenticated SSH connection, as seen from the diff --git a/ssh/server_test.go b/ssh/server_test.go index b6d8ab3333..ba1bd10e82 100644 --- a/ssh/server_test.go +++ b/ssh/server_test.go @@ -5,6 +5,7 @@ package ssh import ( + "bytes" "errors" "fmt" "io" @@ -299,6 +300,54 @@ func TestBannerError(t *testing.T) { } } +func TestPublicKeyCallbackLastSeen(t *testing.T) { + var lastSeenKey PublicKey + + c1, c2, err := netPipe() + if err != nil { + t.Fatalf("netPipe: %v", err) + } + defer c1.Close() + defer c2.Close() + serverConf := &ServerConfig{ + PublicKeyCallback: func(conn ConnMetadata, key PublicKey) (*Permissions, error) { + lastSeenKey = key + fmt.Printf("seen %#v\n", key) + if _, ok := key.(*dsaPublicKey); !ok { + return nil, errors.New("nope") + } + return nil, nil + }, + } + serverConf.AddHostKey(testSigners["ecdsap256"]) + + done := make(chan struct{}) + go func() { + defer close(done) + NewServerConn(c1, serverConf) + }() + + clientConf := ClientConfig{ + User: "user", + Auth: []AuthMethod{ + PublicKeys(testSigners["rsa"], testSigners["dsa"], testSigners["ed25519"]), + }, + HostKeyCallback: InsecureIgnoreHostKey(), + } + + _, _, _, err = NewClientConn(c2, "", &clientConf) + if err != nil { + t.Fatal(err) + } + <-done + + expectedPublicKey := testSigners["dsa"].PublicKey().Marshal() + lastSeenMarshalled := lastSeenKey.Marshal() + if !bytes.Equal(lastSeenMarshalled, expectedPublicKey) { + t.Errorf("unexpected key: got %#v, want %#v", lastSeenKey, testSigners["dsa"].PublicKey()) + } +} + type markerConn struct { closed uint32 used uint32