From e7edc7e27e13c789b56768b556dfcde767920f10 Mon Sep 17 00:00:00 2001 From: Adam Langley Date: Mon, 17 Oct 2016 13:35:27 -0700 Subject: [PATCH] vendor: update golang.org/x/crypto/chacha20poly1305 This change updates the vendored chacha20poly1305 package to match revision 14f9af67c679edd414f72f13d67c917447113df2 of x/crypto. Change-Id: I05a4ba86578b0f0cdb1ed7dd50fee3b38bb48cf5 Reviewed-on: https://go-review.googlesource.com/31312 Run-TryBot: Adam Langley Reviewed-by: Brad Fitzpatrick TryBot-Result: Gobot Gobot --- .../chacha20poly1305/chacha20poly1305_amd64.s | 420 +++++++++--------- 1 file changed, 210 insertions(+), 210 deletions(-) diff --git a/src/vendor/golang_org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/src/vendor/golang_org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s index a1812b70d8..ac9584481d 100644 --- a/src/vendor/golang_org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s +++ b/src/vendor/golang_org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s @@ -82,82 +82,82 @@ #define TT2 BB3 #define TT3 CC3 // ChaCha20 constants -DATA chacha20Constants<>+0x00(SB)/4, $0x61707865 -DATA chacha20Constants<>+0x04(SB)/4, $0x3320646e -DATA chacha20Constants<>+0x08(SB)/4, $0x79622d32 -DATA chacha20Constants<>+0x0c(SB)/4, $0x6b206574 -DATA chacha20Constants<>+0x10(SB)/4, $0x61707865 -DATA chacha20Constants<>+0x14(SB)/4, $0x3320646e -DATA chacha20Constants<>+0x18(SB)/4, $0x79622d32 -DATA chacha20Constants<>+0x1c(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 // <<< 16 with PSHUFB -DATA rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -DATA rol16<>+0x10(SB)/8, $0x0504070601000302 -DATA rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A +DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A // <<< 8 with PSHUFB -DATA rol8<>+0x00(SB)/8, $0x0605040702010003 -DATA rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -DATA rol8<>+0x10(SB)/8, $0x0605040702010003 -DATA rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B - -DATA avx2InitMask<>+0x00(SB)/8, $0x0 -DATA avx2InitMask<>+0x08(SB)/8, $0x0 -DATA avx2InitMask<>+0x10(SB)/8, $0x1 -DATA avx2InitMask<>+0x18(SB)/8, $0x0 - -DATA avx2IncMask<>+0x00(SB)/8, $0x2 -DATA avx2IncMask<>+0x08(SB)/8, $0x0 -DATA avx2IncMask<>+0x10(SB)/8, $0x2 -DATA avx2IncMask<>+0x18(SB)/8, $0x0 +DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B + +DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 +DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 + +DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 +DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 // Poly1305 key clamp -DATA polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -DATA polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF +DATA ·polyClampMask<>+0x00(SB)/8, 
$0x0FFFFFFC0FFFFFFF +DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC +DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF +DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA sseIncMask<>+0x00(SB)/8, $0x1 -DATA sseIncMask<>+0x08(SB)/8, $0x0 +DATA ·sseIncMask<>+0x00(SB)/8, $0x1 +DATA ·sseIncMask<>+0x08(SB)/8, $0x0 // To load/store the last < 16 bytes in a buffer -DATA andMask<>+0x00(SB)/8, $0x00000000000000ff -DATA andMask<>+0x08(SB)/8, $0x0000000000000000 -DATA andMask<>+0x10(SB)/8, $0x000000000000ffff -DATA andMask<>+0x18(SB)/8, $0x0000000000000000 -DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff -DATA andMask<>+0x28(SB)/8, $0x0000000000000000 -DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff -DATA andMask<>+0x38(SB)/8, $0x0000000000000000 -DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff -DATA andMask<>+0x48(SB)/8, $0x0000000000000000 -DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff -DATA andMask<>+0x58(SB)/8, $0x0000000000000000 -DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff -DATA andMask<>+0x68(SB)/8, $0x0000000000000000 -DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff -DATA andMask<>+0x78(SB)/8, $0x0000000000000000 -DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff -DATA andMask<>+0x88(SB)/8, $0x00000000000000ff -DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff -DATA andMask<>+0x98(SB)/8, $0x000000000000ffff -DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff -DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff -DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff -DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff -DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff - -GLOBL chacha20Constants<>(SB), (NOPTR+RODATA), $32 -GLOBL rol16<>(SB), (NOPTR+RODATA), $32 -GLOBL rol8<>(SB), (NOPTR+RODATA), $32 -GLOBL sseIncMask<>(SB), (NOPTR+RODATA), $16 -GLOBL avx2IncMask<>(SB), (NOPTR+RODATA), $32 -GLOBL avx2InitMask<>(SB), (NOPTR+RODATA), $32 -GLOBL polyClampMask<>(SB), (NOPTR+RODATA), $32 -GLOBL andMask<>(SB), (NOPTR+RODATA), $240 +DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff 
+DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff + +GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 +GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 +GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 // No PALIGNR in Go ASM yet (but VPALIGNR is present). #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 @@ -185,15 +185,15 @@ GLOBL andMask<>(SB), (NOPTR+RODATA), $240 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 // Some macros #define chachaQR(A, B, C, D, T) \ - PADDD B, A; PXOR A, D; PSHUFB rol16<>(SB), D \ + PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ - PADDD B, A; PXOR A, D; PSHUFB rol8<>(SB), D \ + PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B #define chachaQR_AVX2(A, B, C, D, T) \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol16<>(SB), D, D \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol8<>(SB), D, D \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 @@ -286,7 +286,7 @@ TEXT ·chacha20Poly1305Open(SB), 0, $288-97 JBE openSSE128 // About 16% faster // For long buffers, prepare the poly key first - MOVOU chacha20Constants<>(SB), A0 + MOVOU ·chacha20Constants<>(SB), A0 MOVOU (1*16)(keyp), B0 MOVOU (2*16)(keyp), C0 MOVOU (3*16)(keyp), D0 @@ -307,10 +307,10 @@ openSSEPreparePolyKey: JNE openSSEPreparePolyKey // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0 + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 // Clamp and store the key - PAND polyClampMask<>(SB), A0 + PAND ·polyClampMask<>(SB), A0 MOVO A0, rStore; MOVO B0, sStore // Hash AAD @@ -322,10 +322,10 @@ openSSEMainLoop: JB openSSEMainLoopDone // Load state, increment counter blocks - MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 // Store counters MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store @@ -370,7 +370,7 @@ openSSEInternalLoop: JG openSSEInternalLoop // Add in the state - PADDD 
chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 @@ -446,9 +446,9 @@ openSSEFinalize: // Special optimization for buffers smaller than 129 bytes openSSE128: // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 + MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 MOVQ $10, itr2 @@ -465,13 +465,13 @@ openSSE128InnerCipherLoop: JNE openSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2 + PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 // Clamp and store the key - PAND polyClampMask<>(SB), A0 + PAND ·polyClampMask<>(SB), A0 MOVOU A0, rStore; MOVOU B0, sStore // Hash @@ -509,7 +509,7 @@ openSSETail16: // We can safely load the CT from the end, because it is padded with the MAC MOVQ inl, itr2 SHLQ $4, itr2 - LEAQ andMask<>(SB), t0 + LEAQ ·andMask<>(SB), t0 MOVOU (inp), T0 ADDQ inl, inp PAND -16(t0)(itr2*1), T0 @@ -534,7 +534,7 @@ openSSETail16Store: // Special optimization for the last 64 bytes of ciphertext openSSETail64: // Need to decrypt up to 64 bytes - prepare single block - MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store XORQ itr2, itr2 MOVQ inl, itr1 CMPQ itr1, $16 @@ -559,7 +559,7 @@ openSSETail64LoopB: CMPQ itr2, $160 JNE openSSETail64LoopB - PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 openSSETail64DecLoop: CMPQ inl, $16 @@ -583,8 +583,8 @@ openSSETail64DecLoopDone: // Special optimization for the last 128 bytes of ciphertext openSSETail128: // Need to decrypt up to 128 bytes - prepare two blocks - MOVO chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr0Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr1Store + MOVO 
·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store XORQ itr2, itr2 MOVQ inl, itr1 ANDQ $-16, itr1 @@ -609,7 +609,7 @@ openSSETail128LoopB: CMPQ itr2, $160 JNE openSSETail128LoopB - PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1 + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 PADDL state1Store, B0; PADDL state1Store, B1 PADDL state2Store, C0; PADDL state2Store, C1 PADDL ctr1Store, D0; PADDL ctr0Store, D1 @@ -627,9 +627,9 @@ openSSETail128LoopB: // Special optimization for the last 192 bytes of ciphertext openSSETail192: // Need to decrypt up to 192 bytes - prepare three blocks - MOVO chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr0Store - MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr2Store + MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store + MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store MOVQ inl, itr1 MOVQ $160, itr2 @@ -674,7 +674,7 @@ openSSLTail192LoopB: polyMul openSSLTail192Store: - PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 @@ -696,10 +696,10 @@ openSSLTail192Store: // Special optimization for the last 256 bytes of ciphertext openSSETail256: // Need to decrypt up to 256 bytes - prepare four blocks - MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 // Store counters MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store @@ -744,7 +744,7 @@ openSSETail256HashLoop: JB openSSETail256HashLoop // Add in the state - PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state2Store, C0; PADDD 
state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 @@ -779,11 +779,11 @@ openSSETail256HashLoop: // ------------------------- AVX2 Code ---------------------------------------- chacha20Poly1305Open_AVX2: VZEROUPPER - VMOVDQU chacha20Constants<>(SB), AA0 + VMOVDQU ·chacha20Constants<>(SB), AA0 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD avx2InitMask<>(SB), DD0, DD0 + VPADDD ·avx2InitMask<>(SB), DD0, DD0 // Special optimization, for very short buffers CMPQ inl, $192 @@ -805,7 +805,7 @@ openAVX2PreparePolyKey: DECQ itr2 JNE openAVX2PreparePolyKey - VPADDD chacha20Constants<>(SB), AA0, AA0 + VPADDD ·chacha20Constants<>(SB), AA0, AA0 VPADDD state1StoreAVX2, BB0, BB0 VPADDD state2StoreAVX2, CC0, CC0 VPADDD ctr3StoreAVX2, DD0, DD0 @@ -813,7 +813,7 @@ openAVX2PreparePolyKey: VPERM2I128 $0x02, AA0, BB0, TT0 // Clamp and store poly key - VPAND polyClampMask<>(SB), TT0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 VMOVDQA TT0, rsStoreAVX2 // Stream for the first 64 bytes @@ -846,10 +846,10 @@ openAVX2MainLoop: JB openAVX2MainLoopDone // Load state, increment counter blocks, store the incremented counters - VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 XORQ itr1, itr1 @@ -860,7 +860,7 @@ openAVX2InternalLoop: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 polyMulStage1_AVX2 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 polyMulStage2_AVX2 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 @@ -874,7 +874,7 @@ openAVX2InternalLoop: polyMulReduceStage VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 polyAdd(2*8(inp)(itr1*1)) VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD 
DD3, CC3, CC3 polyMulStage1_AVX2 @@ -892,7 +892,7 @@ openAVX2InternalLoop: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 polyMulStage3_AVX2 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 polyMulReduceStage VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 @@ -908,7 +908,7 @@ openAVX2InternalLoop: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 polyMulStage2_AVX2 - VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 polyMulStage3_AVX2 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 @@ -925,7 +925,7 @@ openAVX2InternalLoop: CMPQ itr1, $480 JNE openAVX2InternalLoop - VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 @@ -974,7 +974,7 @@ openAVX2192: VMOVDQA AA0, AA1 VMOVDQA BB0, BB1 VMOVDQA CC0, CC1 - VPADDD avx2IncMask<>(SB), DD0, DD1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 VMOVDQA AA0, AA2 VMOVDQA BB0, BB2 VMOVDQA CC0, CC2 @@ -1000,7 +1000,7 @@ openAVX2192InnerCipherLoop: VPERM2I128 $0x02, AA0, BB0, TT0 // Clamp and store poly key - VPAND polyClampMask<>(SB), TT0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 VMOVDQA TT0, rsStoreAVX2 // Stream for up to 192 bytes @@ -1072,8 +1072,8 @@ openAVX2ShortDone: // Special optimization for buffers smaller than 321 bytes openAVX2320: // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2 + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 MOVQ $10, itr2 @@ -1089,18 +1089,18 @@ openAVX2320InnerCipherLoop: DECQ itr2 JNE openAVX2320InnerCipherLoop - VMOVDQA chacha20Constants<>(SB), TT0 + VMOVDQA ·chacha20Constants<>(SB), TT0 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 VPADDD TT1, BB0, BB0; VPADDD TT1, 
BB1, BB1; VPADDD TT1, BB2, BB2 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA avx2IncMask<>(SB), TT0 + VMOVDQA ·avx2IncMask<>(SB), TT0 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 VPADDD TT3, DD2, DD2 // Clamp and store poly key VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND polyClampMask<>(SB), TT0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 VMOVDQA TT0, rsStoreAVX2 // Stream for up to 320 bytes @@ -1120,11 +1120,11 @@ openAVX2320InnerCipherLoop: // Special optimization for the last 128 bytes of ciphertext openAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA chacha20Constants<>(SB), AA1 + VMOVDQA ·chacha20Constants<>(SB), AA1 VMOVDQA state1StoreAVX2, BB1 VMOVDQA state2StoreAVX2, CC1 VMOVDQA ctr3StoreAVX2, DD1 - VPADDD avx2IncMask<>(SB), DD1, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD1 VMOVDQA DD1, DD0 XORQ itr2, itr2 @@ -1153,7 +1153,7 @@ openAVX2Tail128LoopB: CMPQ itr2, $160 JNE openAVX2Tail128LoopB - VPADDD chacha20Constants<>(SB), AA1, AA1 + VPADDD ·chacha20Constants<>(SB), AA1, AA1 VPADDD state1StoreAVX2, BB1, BB1 VPADDD state2StoreAVX2, CC1, CC1 VPADDD DD0, DD1, DD1 @@ -1196,12 +1196,12 @@ openAVX2TailDone: // Special optimization for the last 256 bytes of ciphertext openAVX2Tail256: // Need to decrypt up to 256 bytes - prepare four blocks - VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 VMOVDQA ctr3StoreAVX2, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD1 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 VMOVDQA DD0, TT1 VMOVDQA DD1, TT2 @@ -1255,7 +1255,7 @@ openAVX2Tail256Hash: // Store 128 bytes safely, then go to store loop openAVX2Tail256HashEnd: - VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1 + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 @@ -1274,13 +1274,13 @@ openAVX2Tail256HashEnd: // Special optimization for the last 384 bytes of ciphertext openAVX2Tail384: // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 VMOVDQA ctr3StoreAVX2, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD1 - VPADDD avx2IncMask<>(SB), DD1, DD2 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD2 VMOVDQA DD0, ctr0StoreAVX2 VMOVDQA DD1, ctr1StoreAVX2 VMOVDQA DD2, ctr2StoreAVX2 @@ -1339,7 +1339,7 @@ openAVX2Tail384Hash: // Store 256 bytes safely, then go to store loop openAVX2Tail384HashEnd: - VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2 + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 VPADDD state2StoreAVX2, CC0, CC0; VPADDD 
state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 @@ -1358,10 +1358,10 @@ openAVX2Tail384HashEnd: // ---------------------------------------------------------------------------- // Special optimization for the last 512 bytes of ciphertext openAVX2Tail512: - VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 XORQ itr1, itr1 MOVQ inp, itr2 @@ -1374,7 +1374,7 @@ openAVX2Tail512LoopB: openAVX2Tail512LoopA: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 @@ -1387,7 +1387,7 @@ openAVX2Tail512LoopA: polyMulAVX2 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 @@ -1401,7 +1401,7 @@ openAVX2Tail512LoopA: VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 polyAdd(2*8(itr2)) @@ -1415,7 +1415,7 @@ openAVX2Tail512LoopA: VMOVDQA tmpStoreAVX2, CC3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; 
VPXOR AA3, DD3, DD3 - VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 @@ -1448,7 +1448,7 @@ openAVX2Tail512HashLoop: JMP openAVX2Tail512HashLoop openAVX2Tail512HashEnd: - VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 @@ -1493,7 +1493,7 @@ TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 JBE sealSSE128 // About 15% faster // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU chacha20Constants<>(SB), A0 + MOVOU ·chacha20Constants<>(SB), A0 MOVOU (1*16)(keyp), B0 MOVOU (2*16)(keyp), C0 MOVOU (3*16)(keyp), D0 @@ -1503,9 +1503,9 @@ TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 MOVO C0, state2Store // Load state, increment counter blocks - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 // Store counters MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store @@ -1535,13 +1535,13 @@ sealSSEIntroLoop: JNE sealSSEIntroLoop // Add in the state - PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 // Clamp and store the key - PAND polyClampMask<>(SB), A0 + PAND ·polyClampMask<>(SB), A0 MOVO A0, rStore MOVO B0, sStore @@ -1585,10 +1585,10 @@ sealSSEIntroLoop: sealSSEMainLoop: // Load state, increment counter blocks - MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>(SB), 
A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 // Store counters MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store @@ -1627,7 +1627,7 @@ sealSSEInnerLoop: JG sealSSEInnerLoop // Add in the state - PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 @@ -1683,11 +1683,11 @@ sealSSEInnerLoop: // Special optimization for the last 64 bytes of plaintext sealSSETail64: // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes - MOVO chacha20Constants<>(SB), A1 + MOVO ·chacha20Constants<>(SB), A1 MOVO state1Store, B1 MOVO state2Store, C1 MOVO ctr3Store, D1 - PADDL sseIncMask<>(SB), D1 + PADDL ·sseIncMask<>(SB), D1 MOVO D1, ctr0Store sealSSETail64LoopA: @@ -1710,7 +1710,7 @@ sealSSETail64LoopB: DECQ itr2 JGE sealSSETail64LoopB - PADDL chacha20Constants<>(SB), A1 + PADDL ·chacha20Constants<>(SB), A1 PADDL state1Store, B1 PADDL state2Store, C1 PADDL ctr0Store, D1 @@ -1721,8 +1721,8 @@ sealSSETail64LoopB: // Special optimization for the last 128 bytes of plaintext sealSSETail128: // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes - MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store sealSSETail128LoopA: // Perform ChaCha rounds, while hashing the prevsiosly encrpyted ciphertext @@ -1747,7 +1747,7 @@ sealSSETail128LoopB: DECQ itr2 JGE sealSSETail128LoopB - PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1 + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 PADDL state1Store, B0; PADDL state1Store, B1 PADDL state2Store, C0; PADDL state2Store, C1 PADDL ctr0Store, D0; PADDL ctr1Store, D1 @@ -1766,9 +1766,9 @@ sealSSETail128LoopB: // Special optimization for the last 192 bytes of plaintext sealSSETail192: // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes - MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr2Store + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + 
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store sealSSETail192LoopA: // Perform ChaCha rounds, while hashing the prevsiosly encrpyted ciphertext @@ -1797,7 +1797,7 @@ sealSSETail192LoopB: DECQ itr2 JGE sealSSETail192LoopB - PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 @@ -1823,9 +1823,9 @@ sealSSETail192LoopB: // Special seal optimization for buffers smaller than 129 bytes sealSSE128: // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 + MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 MOVQ $10, itr2 @@ -1842,11 +1842,11 @@ sealSSE128InnerCipherLoop: JNE sealSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2 - PAND polyClampMask<>(SB), A0 + PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PAND ·polyClampMask<>(SB), A0 MOVOU A0, rStore MOVOU B0, sStore @@ -1903,7 +1903,7 @@ sealSSETail: // We can only load the PT one byte at a time to avoid read after end of buffer MOVQ inl, itr2 SHLQ $4, itr2 - LEAQ andMask<>(SB), t0 + LEAQ ·andMask<>(SB), t0 MOVQ inl, itr1 LEAQ -1(inp)(inl*1), inp XORQ t2, t2 @@ -1963,11 +1963,11 @@ sealSSEFinalize: // ------------------------- AVX2 Code ---------------------------------------- chacha20Poly1305Seal_AVX2: VZEROUPPER - VMOVDQU chacha20Constants<>(SB), AA0 + VMOVDQU ·chacha20Constants<>(SB), AA0 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD avx2InitMask<>(SB), DD0, DD0 + VPADDD ·avx2InitMask<>(SB), DD0, DD0 // Special optimizations, for very short buffers CMPQ inl, $192 @@ -1979,9 +1979,9 @@ chacha20Poly1305Seal_AVX2: VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 - VPADDD avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 - VPADDD avx2IncMask<>(SB), DD1, DD2; 
VMOVDQA DD1, ctr1StoreAVX2 - VPADDD avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 VMOVDQA DD3, ctr3StoreAVX2 MOVQ $10, itr2 @@ -2012,7 +2012,7 @@ sealAVX2IntroLoop: DECQ itr2 JNE sealAVX2IntroLoop - VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 @@ -2022,7 +2022,7 @@ sealAVX2IntroLoop: VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 // Clamp and store poly key - VPAND polyClampMask<>(SB), DD0, DD0 + VPAND ·polyClampMask<>(SB), DD0, DD0 VMOVDQA DD0, rsStoreAVX2 // Hash AD @@ -2068,11 +2068,11 @@ sealAVX2IntroLoop: JBE sealAVX2Tail512 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop - VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA ctr3StoreAVX2, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 VMOVDQA CC3, tmpStoreAVX2 @@ -2100,7 +2100,7 @@ sealAVX2IntroLoop: VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 @@ -2116,10 +2116,10 @@ sealAVX2IntroLoop: sealAVX2MainLoop: // Load state, increment counter blocks, store the incremented counters - VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state2StoreAVX2, CC0; 
VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 MOVQ $10, itr1 @@ -2128,7 +2128,7 @@ sealAVX2InternalLoop: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 polyMulStage1_AVX2 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 polyMulStage2_AVX2 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 @@ -2144,7 +2144,7 @@ sealAVX2InternalLoop: sealAVX2InternalLoopStart: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 polyAdd(2*8(oup)) VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 polyMulStage1_AVX2 @@ -2162,7 +2162,7 @@ sealAVX2InternalLoopStart: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 polyMulStage3_AVX2 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 polyMulReduceStage VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 @@ -2178,7 +2178,7 @@ sealAVX2InternalLoopStart: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 polyMulStage2_AVX2 - VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 polyMulStage3_AVX2 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 @@ -2195,7 +2195,7 @@ sealAVX2InternalLoopStart: DECQ itr1 JNE sealAVX2InternalLoop - VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; 
VPADDD ·chacha20Constants<>(SB), AA3, AA3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 @@ -2250,7 +2250,7 @@ seal192AVX2: VMOVDQA AA0, AA1 VMOVDQA BB0, BB1 VMOVDQA CC0, CC1 - VPADDD avx2IncMask<>(SB), DD0, DD1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 VMOVDQA AA0, AA2 VMOVDQA BB0, BB2 VMOVDQA CC0, CC2 @@ -2276,7 +2276,7 @@ sealAVX2192InnerCipherLoop: VPERM2I128 $0x02, AA0, BB0, TT0 // Clamp and store poly key - VPAND polyClampMask<>(SB), TT0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 VMOVDQA TT0, rsStoreAVX2 // Stream for up to 192 bytes @@ -2359,8 +2359,8 @@ sealAVX2ShortDone: // Special optimization for buffers smaller than 321 bytes seal320AVX2: // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2 + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 MOVQ $10, itr2 @@ -2376,18 +2376,18 @@ sealAVX2320InnerCipherLoop: DECQ itr2 JNE sealAVX2320InnerCipherLoop - VMOVDQA chacha20Constants<>(SB), TT0 + VMOVDQA ·chacha20Constants<>(SB), TT0 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA avx2IncMask<>(SB), TT0 + VMOVDQA ·avx2IncMask<>(SB), TT0 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 VPADDD TT3, DD2, DD2 // Clamp and store poly key VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND polyClampMask<>(SB), TT0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 VMOVDQA TT0, rsStoreAVX2 // Stream for up to 320 bytes @@ -2409,11 +2409,11 @@ sealAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA chacha20Constants<>(SB), AA0 + VMOVDQA ·chacha20Constants<>(SB), AA0 VMOVDQA state1StoreAVX2, BB0 VMOVDQA state2StoreAVX2, CC0 VMOVDQA ctr3StoreAVX2, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 VMOVDQA DD0, DD1 sealAVX2Tail128LoopA: @@ -2440,7 +2440,7 @@ sealAVX2Tail128LoopB: DECQ itr2 JGE sealAVX2Tail128LoopB - VPADDD chacha20Constants<>(SB), AA0, AA1 + VPADDD ·chacha20Constants<>(SB), AA0, AA1 VPADDD state1StoreAVX2, BB0, BB1 VPADDD state2StoreAVX2, CC0, CC1 VPADDD DD1, DD0, DD1 @@ -2457,12 +2457,12 @@ sealAVX2Tail256: // Need to decrypt up to 256 bytes - prepare two blocks // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA chacha20Constants<>(SB), AA1 + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 VMOVDQA state1StoreAVX2, BB0; VMOVDQA 
state1StoreAVX2, BB1 VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 VMOVDQA ctr3StoreAVX2, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD1 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 VMOVDQA DD0, TT1 VMOVDQA DD1, TT2 @@ -2490,7 +2490,7 @@ sealAVX2Tail256LoopB: DECQ itr2 JGE sealAVX2Tail256LoopB - VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1 + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 @@ -2516,11 +2516,11 @@ sealAVX2Tail384: // Need to decrypt up to 384 bytes - prepare two blocks // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 VMOVDQA ctr3StoreAVX2, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 sealAVX2Tail384LoopA: @@ -2547,7 +2547,7 @@ sealAVX2Tail384LoopB: DECQ itr2 JGE sealAVX2Tail384LoopB - VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2 + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 @@ -2579,11 +2579,11 @@ sealAVX2Tail512: // Need to decrypt up to 512 bytes - prepare two blocks // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA ctr3StoreAVX2, DD0 - VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 sealAVX2Tail512LoopA: @@ -2594,7 +2594,7 @@ sealAVX2Tail512LoopA: sealAVX2Tail512LoopB: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol16<>(SB), 
DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 @@ -2607,7 +2607,7 @@ sealAVX2Tail512LoopB: polyMulAVX2 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 @@ -2621,7 +2621,7 @@ sealAVX2Tail512LoopB: VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 polyAdd(2*8(oup)) @@ -2635,7 +2635,7 @@ sealAVX2Tail512LoopB: VMOVDQA tmpStoreAVX2, CC3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 @@ -2653,7 +2653,7 @@ sealAVX2Tail512LoopB: DECQ itr2 JGE sealAVX2Tail512LoopB - VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 -- 2.48.1
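
The change above is purely mechanical: every `DATA`/`GLOBL` symbol and each reference to it gains a leading `·` (the Go assembler's middle dot, which the toolchain expands into the current package's prefix), while the existing `<>` suffix keeps each symbol file-local. No instructions change, hence the symmetric 210 insertions against 210 deletions.

For readers following the renamed tables, here is a plain-Go sketch of the ChaCha20 quarter round that the `chachaQR` macro encodes: the two `PSHUFB` steps are the 16- and 8-bit rotations, and the shift/xor triples are the 12- and 7-bit ones. Function and variable names here are mine, not part of the patch.

```go
package main

import (
	"fmt"
	"math/bits"
)

// quarterRound is a plain-Go rendering of the chachaQR macro: the two
// PSHUFB steps are the <<<16 and <<<8 rotations, and the PSLLL/PSRLL/PXOR
// triples implement the <<<12 and <<<7 rotations.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d = bits.RotateLeft32(d^a, 16) // PSHUFB rol16<>
	c += d
	b = bits.RotateLeft32(b^c, 12) // PSLLL $12; PSRLL $20; PXOR
	a += b
	d = bits.RotateLeft32(d^a, 8) // PSHUFB rol8<>
	c += d
	b = bits.RotateLeft32(b^c, 7) // PSLLL $7; PSRLL $25; PXOR
	return a, b, c, d
}

func main() {
	// The "expand 32-byte k" words from chacha20Constants<>.
	a, b, c, d := quarterRound(0x61707865, 0x3320646e, 0x79622d32, 0x6b206574)
	fmt.Printf("%08x %08x %08x %08x\n", a, b, c, d)
}
```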
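The `rol16<>`/`rol8<>` tables are `PSHUFB` control masks: each mask byte selects a source byte within its 16-byte lane, and the patterns above pick bytes 2,3,0,1 (respectively 3,0,1,2) of every 32-bit word, which is exactly a left rotation by 16 (respectively 8) bits. Below is a small model of `PSHUFB` checking the `rol16<>` mask against `bits.RotateLeft32`; the `pshufb` helper and test pattern are mine.

```go
package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// pshufb models the PSHUFB instruction on one 16-byte lane:
// dst[i] = src[mask[i]&0x0f]. (The instruction's high-bit zeroing
// behaviour is omitted, as the rotation masks never use it.)
func pshufb(src, mask [16]byte) (dst [16]byte) {
	for i := range dst {
		dst[i] = src[mask[i]&0x0f]
	}
	return
}

func main() {
	// rol16<> from the patch: $0x0504070601000302, $0x0D0C0F0E09080B0A.
	var rol16 [16]byte
	binary.LittleEndian.PutUint64(rol16[0:], 0x0504070601000302)
	binary.LittleEndian.PutUint64(rol16[8:], 0x0D0C0F0E09080B0A)

	var src [16]byte
	for i := range src {
		src[i] = byte(i) * 17 // arbitrary test pattern
	}
	got := pshufb(src, rol16)
	for i := 0; i < 16; i += 4 {
		want := bits.RotateLeft32(binary.LittleEndian.Uint32(src[i:]), 16)
		fmt.Printf("lane %d: shuffle=%08x rotl16=%08x\n",
			i/4, binary.LittleEndian.Uint32(got[i:]), want)
	}
}
```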
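`polyClampMask<>` is the standard Poly1305 clamp: the `PAND` against it clears the 22 bits of `r` that the algorithm requires to be zero and, thanks to the all-ones upper half, leaves `s` untouched. It is applied to the 32-byte key taken from the first ChaCha20 block (`A0|B0` in the SSE paths, `TT0`/`DD0` in the AVX2 ones). In Go terms, with a helper name of my choosing:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// clampPolyKey mirrors `PAND polyClampMask<>(SB), A0`: the first 16 key
// bytes (r) are ANDed with 0x0ffffffc0ffffffc0ffffffc0fffffff, and the
// last 16 (s) pass through unchanged.
func clampPolyKey(key *[32]byte) (r0, r1, s0, s1 uint64) {
	r0 = binary.LittleEndian.Uint64(key[0:8]) & 0x0FFFFFFC0FFFFFFF
	r1 = binary.LittleEndian.Uint64(key[8:16]) & 0x0FFFFFFC0FFFFFFC
	s0 = binary.LittleEndian.Uint64(key[16:24]) // masked with all-ones
	s1 = binary.LittleEndian.Uint64(key[24:32]) // masked with all-ones
	return
}

func main() {
	var key [32]byte
	for i := range key {
		key[i] = 0xff
	}
	r0, r1, s0, s1 := clampPolyKey(&key)
	fmt.Printf("r = %016x%016x\ns = %016x%016x\n", r1, r0, s1, s0)
}
```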
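Finally, `andMask<>` backs the partial-block paths (`openSSETail16`, `sealSSETail`): row n, for 1 ≤ n ≤ 15, sits at offset 16*(n-1) and has exactly its low n bytes set, so after `SHLQ $4` the addressing mode `-16(t0)(itr2*1)` selects the row that keeps only the n valid bytes of a deliberately over-wide 16-byte load. A sketch of the same masking, rebuilding the rows instead of hard-coding the 240-byte table; all names here are mine.

```go
package main

import "fmt"

// andMaskEntry reproduces row n of the andMask<> table: a 16-byte mask
// whose low n bytes are 0xff, stored at offset 16*(n-1) in the assembly
// and indexed there as -16(t0)(itr2*1) with itr2 = 16*n.
func andMaskEntry(n int) (m [16]byte) {
	for i := 0; i < n; i++ {
		m[i] = 0xff
	}
	return
}

// maskTail mirrors the PAND in openSSETail16: block holds a 16-byte load
// that ran past the end of the message; only the first n bytes are real,
// so the rest are zeroed before use.
func maskTail(block [16]byte, n int) [16]byte {
	m := andMaskEntry(n)
	for i := range block {
		block[i] &= m[i]
	}
	return block
}

func main() {
	var block [16]byte
	copy(block[:], []byte("ciphertext..junk"))
	fmt.Printf("% x\n", maskTail(block, 10))
}
```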