-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
+// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
//go:build gc && !purego
#include "textflag.h"
-// General register allocation
-#define oup DI
-#define inp SI
-#define inl BX
-#define adp CX // free to reuse, after we hash the additional data
-#define keyp R8 // free to reuse, when we copy the key to stack
-#define itr2 R9 // general iterator
-#define itr1 CX // general iterator
-#define acc0 R10
-#define acc1 R11
-#define acc2 R12
-#define t0 R13
-#define t1 R14
-#define t2 R15
-#define t3 R8
-// Register and stack allocation for the SSE code
-#define rStore (0*16)(BP)
-#define sStore (1*16)(BP)
-#define state1Store (2*16)(BP)
-#define state2Store (3*16)(BP)
-#define tmpStore (4*16)(BP)
-#define ctr0Store (5*16)(BP)
-#define ctr1Store (6*16)(BP)
-#define ctr2Store (7*16)(BP)
-#define ctr3Store (8*16)(BP)
-#define A0 X0
-#define A1 X1
-#define A2 X2
-#define B0 X3
-#define B1 X4
-#define B2 X5
-#define C0 X6
-#define C1 X7
-#define C2 X8
-#define D0 X9
-#define D1 X10
-#define D2 X11
-#define T0 X12
-#define T1 X13
-#define T2 X14
-#define T3 X15
-#define A3 T0
-#define B3 T1
-#define C3 T2
-#define D3 T3
-// Register and stack allocation for the AVX2 code
-#define rsStoreAVX2 (0*32)(BP)
-#define state1StoreAVX2 (1*32)(BP)
-#define state2StoreAVX2 (2*32)(BP)
-#define ctr0StoreAVX2 (3*32)(BP)
-#define ctr1StoreAVX2 (4*32)(BP)
-#define ctr2StoreAVX2 (5*32)(BP)
-#define ctr3StoreAVX2 (6*32)(BP)
-#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
-#define AA0 Y0
-#define AA1 Y5
-#define AA2 Y6
-#define AA3 Y7
-#define BB0 Y14
-#define BB1 Y9
-#define BB2 Y10
-#define BB3 Y11
-#define CC0 Y12
-#define CC1 Y13
-#define CC2 Y8
-#define CC3 Y15
-#define DD0 Y4
-#define DD1 Y1
-#define DD2 Y2
-#define DD3 Y3
-#define TT0 DD3
-#define TT1 AA3
-#define TT2 BB3
-#define TT3 CC3
-// ChaCha20 constants
-DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
-DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
-// <<< 16 with PSHUFB
-DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
-// <<< 8 with PSHUFB
-DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
-
-DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
-DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
-
-DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
-DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
-// Poly1305 key clamp
-DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
-
-DATA ·sseIncMask<>+0x00(SB)/8, $0x1
-DATA ·sseIncMask<>+0x08(SB)/8, $0x0
-// To load/store the last < 16 bytes in a buffer
-DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
-DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
-
-GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
-GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
-GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
-// No PALIGNR in Go ASM yet (but VPALIGNR is present).
-#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
-#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
-#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
-#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
-#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
-#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
-#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
-#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
-#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
-#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
-#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
-#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
-#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
-#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
-#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
-#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
-#define shiftC0Right shiftC0Left
-#define shiftC1Right shiftC1Left
-#define shiftC2Right shiftC2Left
-#define shiftC3Right shiftC3Left
-#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
-#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
-#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
-#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
-
-// Some macros
-
-// ROL rotates the uint32s in register R left by N bits, using temporary T.
-#define ROL(N, R, T) \
- MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
-
-// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
-#else
-#define ROL16(R, T) ROL(16, R, T)
-#endif
-
-// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
-#else
-#define ROL8(R, T) ROL(8, R, T)
-#endif
-
-#define chachaQR(A, B, C, D, T) \
- PADDD B, A; PXOR A, D; ROL16(D, T) \
- PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
- PADDD B, A; PXOR A, D; ROL8(D, T) \
- PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
-
-#define chachaQR_AVX2(A, B, C, D, T) \
- VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
- VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
- VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
- VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
-
-#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
-#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
-#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
-#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
-
-#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
-#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
-
-#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
-#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
-// ----------------------------------------------------------------------------
+
+// func polyHashADInternal<>()
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
- // adp points to beginning of additional data
- // itr2 holds ad length
- XORQ acc0, acc0
- XORQ acc1, acc1
- XORQ acc2, acc2
- CMPQ itr2, $13
- JNE hashADLoop
-
-openFastTLSAD:
- // Special treatment for the TLS case of 13 bytes
- MOVQ (adp), acc0
- MOVQ 5(adp), acc1
- SHRQ $24, acc1
- MOVQ $1, acc2
- polyMul
+ // Hack: Must declare #define macros inside of a function due to Avo constraints
+ // ROL rotates the uint32s in register R left by N bits, using temporary T.
+ #define ROL(N, R, T) \
+ MOVO R, T; \
+ PSLLL $(N), T; \
+ PSRLL $(32-(N)), R; \
+ PXOR T, R
+
+ // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
+ #ifdef GOAMD64_v2
+ #define ROL8(R, T) PSHUFB ·rol8<>(SB), R
+ #else
+ #define ROL8(R, T) ROL(8, R, T)
+ #endif
+
+ // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
+ #ifdef GOAMD64_v2
+ #define ROL16(R, T) PSHUFB ·rol16<>(SB), R
+ #else
+ #define ROL16(R, T) ROL(16, R, T)
+ #endif
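+	// On entry CX points at the additional data and R9 holds its length (the
+	// old adp and itr2 registers); the caller has left r at 0(BP) and 8(BP).
+	// The Poly1305 accumulator lives in R10:R11:R12 (the old acc0:acc1:acc2)
+	// and is zeroed before absorbing the AD.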
+ XORQ R10, R10
+ XORQ R11, R11
+ XORQ R12, R12
+ CMPQ R9, $0x0d
+ JNE hashADLoop
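+	// Fast path for the TLS case of exactly 13 bytes of additional data.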
+ MOVQ (CX), R10
+ MOVQ 5(CX), R11
+ SHRQ $0x18, R11
+ MOVQ $0x00000001, R12
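+	// Expansion of the former polyMul macro: multiply the accumulator by r
+	// (whose 64-bit halves sit at 0(BP) and 8(BP)) and partially reduce the
+	// result modulo 2^130 - 5.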
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
RET
hashADLoop:
// Hash in 16 byte chunks
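	// Each full 16-byte chunk m is absorbed as acc = (acc + m + 2^128) * r
	// (mod 2^130 - 5); the ADCQ $0x01 into R12 supplies the 2^128 padding bit.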
- CMPQ itr2, $16
- JB hashADTail
- polyAdd(0(adp))
- LEAQ (1*16)(adp), adp
- SUBQ $16, itr2
- polyMul
- JMP hashADLoop
+ CMPQ R9, $0x10
+ JB hashADTail
+ ADDQ (CX), R10
+ ADCQ 8(CX), R11
+ ADCQ $0x01, R12
+ LEAQ 16(CX), CX
+ SUBQ $0x10, R9
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ JMP hashADLoop
hashADTail:
- CMPQ itr2, $0
+ CMPQ R9, $0x00
JE hashADDone
// Hash last < 16 byte tail
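	// The remaining 1..15 bytes are gathered back to front into R13:R14 and
	// then added to the accumulator with the usual 2^128 padding bit.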
- XORQ t0, t0
- XORQ t1, t1
- XORQ t2, t2
- ADDQ itr2, adp
+ XORQ R13, R13
+ XORQ R14, R14
+ XORQ R15, R15
+ ADDQ R9, CX
hashADTailLoop:
- SHLQ $8, t0, t1
- SHLQ $8, t0
- MOVB -1(adp), t2
- XORQ t2, t0
- DECQ adp
- DECQ itr2
- JNE hashADTailLoop
-
-hashADTailFinish:
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
-
- // Finished AD
+ SHLQ $0x08, R13, R14
+ SHLQ $0x08, R13
+ MOVB -1(CX), R15
+ XORQ R15, R13
+ DECQ CX
+ DECQ R9
+ JNE hashADTailLoop
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+
hashADDone:
RET
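
// For reference, a Go model of what polyHashADInternal computes (illustrative
// only: hashAD, h and r are made-up names, and math/big stands in for the
// three 64-bit accumulator limbs in R10:R11:R12; imports encoding/binary and
// math/big):
//
//	var p = new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 130), big.NewInt(5))
//
//	func hashAD(h, r *big.Int, ad []byte) {
//		for len(ad) > 0 {
//			var block [16]byte
//			n := copy(block[:], ad) // the final partial block is zero padded
//			ad = ad[n:]
//			m := new(big.Int).SetUint64(binary.LittleEndian.Uint64(block[8:]))
//			m.Lsh(m, 64)
//			m.Add(m, new(big.Int).SetUint64(binary.LittleEndian.Uint64(block[:8])))
//			m.SetBit(m, 128, 1) // the ADCQ $0x01 into R12
//			h.Add(h, m)         // polyAdd
//			h.Mul(h, r)         // polyMul ...
//			h.Mod(h, p)         // ... and reduction
//		}
//	}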
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
-TEXT ·chacha20Poly1305Open(SB), 0, $288-97
+// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Open(SB), $288-97
// For aligned stack access
MOVQ SP, BP
- ADDQ $32, BP
+ ADDQ $0x20, BP
ANDQ $-32, BP
- MOVQ dst+0(FP), oup
- MOVQ key+24(FP), keyp
- MOVQ src+48(FP), inp
- MOVQ src_len+56(FP), inl
- MOVQ ad+72(FP), adp
+ MOVQ dst_base+0(FP), DI
+ MOVQ key_base+24(FP), R8
+ MOVQ src_base+48(FP), SI
+ MOVQ src_len+56(FP), BX
+ MOVQ ad_base+72(FP), CX
// Check for AVX2 support
- CMPB ·useAVX2(SB), $1
+ CMPB ·useAVX2+0(SB), $0x01
JE chacha20Poly1305Open_AVX2
// Special optimization, for very short buffers
- CMPQ inl, $128
- JBE openSSE128 // About 16% faster
+ CMPQ BX, $0x80
+ JBE openSSE128
// For long buffers, prepare the poly key first
- MOVOU ·chacha20Constants<>(SB), A0
- MOVOU (1*16)(keyp), B0
- MOVOU (2*16)(keyp), C0
- MOVOU (3*16)(keyp), D0
- MOVO D0, T1
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
+ MOVO X9, X13
// Store state on stack for future use
- MOVO B0, state1Store
- MOVO C0, state2Store
- MOVO D0, ctr3Store
- MOVQ $10, itr2
+ MOVO X3, 32(BP)
+ MOVO X6, 48(BP)
+ MOVO X9, 128(BP)
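+	// Stack layout for the SSE path (BP is 32-byte aligned): r at 0(BP),
+	// s at 16(BP), state rows at 32(BP) and 48(BP), scratch at 64(BP) and
+	// the four counter blocks at 80(BP)..128(BP).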
+ MOVQ $0x0000000a, R9
openSSEPreparePolyKey:
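	// Ten double rounds over a single block; the first 32 bytes of the
	// resulting keystream become the Poly1305 key. Each expanded group below
	// is one ChaCha20 quarter round on (X0, X3, X6, X9):
	//	a += b; d ^= a; d <<<= 16
	//	c += d; b ^= c; b <<<= 12
	//	a += b; d ^= a; d <<<= 8
	//	c += d; b ^= c; b <<<= 7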
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- DECQ itr2
- JNE openSSEPreparePolyKey
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
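+	// The BYTE runs below hand-encode PALIGNR, which Go asm has no mnemonic
+	// for: PALIGNR $4, X3, X3; PALIGNR $8, X6, X6; PALIGNR $12, X9, X9 rotate
+	// the rows into diagonal position (the other BYTE runs in this file are
+	// the analogous left/right row rotations).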
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
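+	// PALIGNR $12, X3, X3; PALIGNR $8, X6, X6; PALIGNR $4, X9, X9 rotate the
+	// rows back after the diagonal round.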
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ DECQ R9
+ JNE openSSEPreparePolyKey
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL 32(BP), X3
// Clamp and store the key
- PAND ·polyClampMask<>(SB), A0
- MOVO A0, rStore; MOVO B0, sStore
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVO X0, (BP)
+ MOVO X3, 16(BP)
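+	// r is ANDed with 0x0ffffffc0ffffffc0ffffffc0fffffff (the Poly1305 clamp)
+	// and kept at 0(BP); s goes to 16(BP).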
// Hash AAD
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openSSEMainLoop:
- CMPQ inl, $256
+ CMPQ BX, $0x00000100
JB openSSEMainLoopDone
// Load state, increment counter blocks
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
- // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
- MOVQ $4, itr1
- MOVQ inp, itr2
+ // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
+ // 2 blocks, and for the remaining 4 only 1 block - for a total of 16
+ MOVQ $0x00000004, CX
+ MOVQ SI, R9
openSSEInternalLoop:
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyAdd(0(itr2))
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
- polyMulStage1
- polyMulStage2
- LEAQ (2*8)(itr2), itr2
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- polyMulStage3
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyMulReduceStage
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- DECQ itr1
- JGE openSSEInternalLoop
-
- polyAdd(0(itr2))
- polyMul
- LEAQ (2*8)(itr2), itr2
-
- CMPQ itr1, $-6
- JG openSSEInternalLoop
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ LEAQ 16(R9), R9
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ DECQ CX
+ JGE openSSEInternalLoop
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ CMPQ CX, $-6
+ JG openSSEInternalLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X6
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 80(BP), X9
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
// Load - xor - store
- MOVO D3, tmpStore
- MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
- MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
- MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
- MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
- MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
- MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
- MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
- MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
- MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
- MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
- MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
- MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
- MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
- MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
- MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
- MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
- LEAQ 256(inp), inp
- LEAQ 256(oup), oup
- SUBQ $256, inl
+ MOVO X15, 64(BP)
+ MOVOU (SI), X15
+ PXOR X15, X0
+ MOVOU X0, (DI)
+ MOVOU 16(SI), X15
+ PXOR X15, X3
+ MOVOU X3, 16(DI)
+ MOVOU 32(SI), X15
+ PXOR X15, X6
+ MOVOU X6, 32(DI)
+ MOVOU 48(SI), X15
+ PXOR X15, X9
+ MOVOU X9, 48(DI)
+ MOVOU 64(SI), X9
+ PXOR X9, X1
+ MOVOU X1, 64(DI)
+ MOVOU 80(SI), X9
+ PXOR X9, X4
+ MOVOU X4, 80(DI)
+ MOVOU 96(SI), X9
+ PXOR X9, X7
+ MOVOU X7, 96(DI)
+ MOVOU 112(SI), X9
+ PXOR X9, X10
+ MOVOU X10, 112(DI)
+ MOVOU 128(SI), X9
+ PXOR X9, X2
+ MOVOU X2, 128(DI)
+ MOVOU 144(SI), X9
+ PXOR X9, X5
+ MOVOU X5, 144(DI)
+ MOVOU 160(SI), X9
+ PXOR X9, X8
+ MOVOU X8, 160(DI)
+ MOVOU 176(SI), X9
+ PXOR X9, X11
+ MOVOU X11, 176(DI)
+ MOVOU 192(SI), X9
+ PXOR X9, X12
+ MOVOU X12, 192(DI)
+ MOVOU 208(SI), X9
+ PXOR X9, X13
+ MOVOU X13, 208(DI)
+ MOVOU 224(SI), X9
+ PXOR X9, X14
+ MOVOU X14, 224(DI)
+ MOVOU 240(SI), X9
+ PXOR 64(BP), X9
+ MOVOU X9, 240(DI)
+ LEAQ 256(SI), SI
+ LEAQ 256(DI), DI
+ SUBQ $0x00000100, BX
JMP openSSEMainLoop
openSSEMainLoopDone:
// Handle the various tail sizes efficiently
- TESTQ inl, inl
+ TESTQ BX, BX
JE openSSEFinalize
- CMPQ inl, $64
+ CMPQ BX, $0x40
JBE openSSETail64
- CMPQ inl, $128
+ CMPQ BX, $0x80
JBE openSSETail128
- CMPQ inl, $192
+ CMPQ BX, $0xc0
JBE openSSETail192
JMP openSSETail256
openSSEFinalize:
// Hash in the PT, AAD lengths
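	// The final Poly1305 block is len(additional data) and len(ciphertext) as
	// two little-endian uint64s, plus the usual 2^128 padding bit.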
- ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
- polyMul
+ ADDQ ad_len+80(FP), R10
+ ADCQ src_len+56(FP), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Final reduce
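	// acc - p is computed with p = 2^130 - 5; if the subtraction borrows
	// (acc < p), the CMOVQCS instructions restore the saved, unreduced value.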
- MOVQ acc0, t0
- MOVQ acc1, t1
- MOVQ acc2, t2
- SUBQ $-5, acc0
- SBBQ $-1, acc1
- SBBQ $3, acc2
- CMOVQCS t0, acc0
- CMOVQCS t1, acc1
- CMOVQCS t2, acc2
+ MOVQ R10, R13
+ MOVQ R11, R14
+ MOVQ R12, R15
+ SUBQ $-5, R10
+ SBBQ $-1, R11
+ SBBQ $0x03, R12
+ CMOVQCS R13, R10
+ CMOVQCS R14, R11
+ CMOVQCS R15, R12
// Add in the "s" part of the key
- ADDQ 0+sStore, acc0
- ADCQ 8+sStore, acc1
+ ADDQ 16(BP), R10
+ ADCQ 24(BP), R11
// Finally, constant time compare to the tag at the end of the message
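	// Both 8-byte halves of the computed tag are XORed with the expected tag
	// and ORed together; CMOVQEQ sets AX to 1 only if the result is zero,
	// i.e. the tags match.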
XORQ AX, AX
- MOVQ $1, DX
- XORQ (0*8)(inp), acc0
- XORQ (1*8)(inp), acc1
- ORQ acc1, acc0
+ MOVQ $0x00000001, DX
+ XORQ (SI), R10
+ XORQ 8(SI), R11
+ ORQ R11, R10
CMOVQEQ DX, AX
// Return true iff tags are equal
MOVB AX, ret+96(FP)
RET
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 129 bytes
openSSE128:
- // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
- MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
- MOVQ $10, itr2
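+	// Up to 128 bytes of ciphertext plus 64 bytes for the poly key fit in
+	// three ChaCha20 blocks; this dedicated path is about 16% faster.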
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X3, X13
+ MOVO X6, X14
+ MOVO X10, X15
+ MOVQ $0x0000000a, R9
openSSE128InnerCipherLoop:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftB1Left; shiftB2Left
- shiftC0Left; shiftC1Left; shiftC2Left
- shiftD0Left; shiftD1Left; shiftD2Left
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftB1Right; shiftB2Right
- shiftC0Right; shiftC1Right; shiftC2Right
- shiftD0Right; shiftD1Right; shiftD2Right
- DECQ itr2
- JNE openSSE128InnerCipherLoop
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ DECQ R9
+ JNE openSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
- PADDL T2, C1; PADDL T2, C2
- PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL X13, X3
+ PADDL X13, X4
+ PADDL X13, X5
+ PADDL X14, X7
+ PADDL X14, X8
+ PADDL X15, X10
+ PADDL ·sseIncMask<>+0(SB), X15
+ PADDL X15, X11
// Clamp and store the key
- PAND ·polyClampMask<>(SB), A0
- MOVOU A0, rStore; MOVOU B0, sStore
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVOU X0, (BP)
+ MOVOU X3, 16(BP)
// Hash
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openSSE128Open:
- CMPQ inl, $16
+ CMPQ BX, $0x10
JB openSSETail16
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for hashing
- polyAdd(0(inp))
+ ADDQ (SI), R10
+ ADCQ 8(SI), R11
+ ADCQ $0x01, R12
// Load for decryption
- MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
- polyMul
+ MOVOU (SI), X12
+ PXOR X12, X1
+ MOVOU X1, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Shift the stream "left"
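	// Rotate the remaining keystream down so that the next 16 bytes to use
	// are always in X1.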
- MOVO B1, A1
- MOVO C1, B1
- MOVO D1, C1
- MOVO A2, D1
- MOVO B2, A2
- MOVO C2, B2
- MOVO D2, C2
+ MOVO X4, X1
+ MOVO X7, X4
+ MOVO X10, X7
+ MOVO X2, X10
+ MOVO X5, X2
+ MOVO X8, X5
+ MOVO X11, X8
JMP openSSE128Open
openSSETail16:
- TESTQ inl, inl
+ TESTQ BX, BX
JE openSSEFinalize
// We can safely load the CT from the end, because it is padded with the MAC
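	// A full 16-byte load is masked down to the BX remaining bytes with the
	// ·andMask table (indexed by 16*length - 16).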
- MOVQ inl, itr2
- SHLQ $4, itr2
- LEAQ ·andMask<>(SB), t0
- MOVOU (inp), T0
- ADDQ inl, inp
- PAND -16(t0)(itr2*1), T0
- MOVO T0, 0+tmpStore
- MOVQ T0, t0
- MOVQ 8+tmpStore, t1
- PXOR A1, T0
+ MOVQ BX, R9
+ SHLQ $0x04, R9
+ LEAQ ·andMask<>+0(SB), R13
+ MOVOU (SI), X12
+ ADDQ BX, SI
+ PAND -16(R13)(R9*1), X12
+ MOVO X12, 64(BP)
+ MOVQ X12, R13
+ MOVQ 72(BP), R14
+ PXOR X1, X12
// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
openSSETail16Store:
- MOVQ T0, t3
- MOVB t3, (oup)
- PSRLDQ $1, T0
- INCQ oup
- DECQ inl
+ MOVQ X12, R8
+ MOVB R8, (DI)
+ PSRLDQ $0x01, X12
+ INCQ DI
+ DECQ BX
JNE openSSETail16Store
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
JMP openSSEFinalize
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of ciphertext
openSSETail64:
- // Need to decrypt up to 64 bytes - prepare single block
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- XORQ itr2, itr2
- MOVQ inl, itr1
- CMPQ itr1, $16
- JB openSSETail64LoopB
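+	// Decrypt the last 1..64 bytes: prepare a single block, hashing the
+	// remaining ciphertext alongside the ChaCha rounds.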
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 80(BP)
+ XORQ R9, R9
+ MOVQ BX, CX
+ CMPQ CX, $0x10
+ JB openSSETail64LoopB
openSSETail64LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMul
- SUBQ $16, itr1
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ SUBQ $0x10, CX
openSSETail64LoopB:
- ADDQ $16, itr2
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
-
- CMPQ itr1, $16
- JAE openSSETail64LoopA
-
- CMPQ itr2, $160
- JNE openSSETail64LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
+ ADDQ $0x10, R9
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ CMPQ CX, $0x10
+ JAE openSSETail64LoopA
+ CMPQ R9, $0xa0
+ JNE openSSETail64LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL 32(BP), X3
+ PADDL 48(BP), X6
+ PADDL 80(BP), X9
openSSETail64DecLoop:
- CMPQ inl, $16
+ CMPQ BX, $0x10
JB openSSETail64DecLoopDone
- SUBQ $16, inl
- MOVOU (inp), T0
- PXOR T0, A0
- MOVOU A0, (oup)
- LEAQ 16(inp), inp
- LEAQ 16(oup), oup
- MOVO B0, A0
- MOVO C0, B0
- MOVO D0, C0
+ SUBQ $0x10, BX
+ MOVOU (SI), X12
+ PXOR X12, X0
+ MOVOU X0, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ MOVO X3, X0
+ MOVO X6, X3
+ MOVO X9, X6
JMP openSSETail64DecLoop
openSSETail64DecLoopDone:
- MOVO A0, A1
+ MOVO X0, X1
JMP openSSETail16
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
openSSETail128:
- // Need to decrypt up to 128 bytes - prepare two blocks
- MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
- MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
- XORQ itr2, itr2
- MOVQ inl, itr1
- ANDQ $-16, itr1
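+	// Decrypt the last 65..128 bytes: prepare two blocks while hashing the
+	// remaining ciphertext.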
+ MOVO ·chacha20Constants<>+0(SB), X1
+ MOVO 32(BP), X4
+ MOVO 48(BP), X7
+ MOVO 128(BP), X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 80(BP)
+ MOVO X1, X0
+ MOVO X4, X3
+ MOVO X7, X6
+ MOVO X10, X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 96(BP)
+ XORQ R9, R9
+ MOVQ BX, CX
+ ANDQ $-16, CX
openSSETail128LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMul
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openSSETail128LoopB:
- ADDQ $16, itr2
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
-
- CMPQ itr2, itr1
- JB openSSETail128LoopA
-
- CMPQ itr2, $160
- JNE openSSETail128LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
- PADDL state1Store, B0; PADDL state1Store, B1
- PADDL state2Store, C0; PADDL state2Store, C1
- PADDL ctr1Store, D0; PADDL ctr0Store, D1
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
- MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
-
- SUBQ $64, inl
- LEAQ 64(inp), inp
- LEAQ 64(oup), oup
- JMP openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of ciphertext
+ ADDQ $0x10, R9
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ CMPQ R9, CX
+ JB openSSETail128LoopA
+ CMPQ R9, $0xa0
+ JNE openSSETail128LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 96(BP), X9
+ PADDL 80(BP), X10
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X1
+ PXOR X13, X4
+ PXOR X14, X7
+ PXOR X15, X10
+ MOVOU X1, (DI)
+ MOVOU X4, 16(DI)
+ MOVOU X7, 32(DI)
+ MOVOU X10, 48(DI)
+ SUBQ $0x40, BX
+ LEAQ 64(SI), SI
+ LEAQ 64(DI), DI
+ JMP openSSETail64DecLoop
+
openSSETail192:
- // Need to decrypt up to 192 bytes - prepare three blocks
- MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
- MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
- MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
-
- MOVQ inl, itr1
- MOVQ $160, itr2
- CMPQ itr1, $160
- CMOVQGT itr2, itr1
- ANDQ $-16, itr1
- XORQ itr2, itr2
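+	// Decrypt the last 129..192 bytes: prepare three blocks while hashing the
+	// remaining ciphertext.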
+ MOVO ·chacha20Constants<>+0(SB), X2
+ MOVO 32(BP), X5
+ MOVO 48(BP), X8
+ MOVO 128(BP), X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X11, 80(BP)
+ MOVO X2, X1
+ MOVO X5, X4
+ MOVO X8, X7
+ MOVO X11, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 96(BP)
+ MOVO X1, X0
+ MOVO X4, X3
+ MOVO X7, X6
+ MOVO X10, X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 112(BP)
+ MOVQ BX, CX
+ MOVQ $0x000000a0, R9
+ CMPQ CX, $0xa0
+ CMOVQGT R9, CX
+ ANDQ $-16, CX
+ XORQ R9, R9
openSSLTail192LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMul
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openSSLTail192LoopB:
- ADDQ $16, itr2
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- shiftB2Left; shiftC2Left; shiftD2Left
-
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
- shiftB2Right; shiftC2Right; shiftD2Right
-
- CMPQ itr2, itr1
- JB openSSLTail192LoopA
-
- CMPQ itr2, $160
- JNE openSSLTail192LoopB
-
- CMPQ inl, $176
- JB openSSLTail192Store
-
- polyAdd(160(inp))
- polyMul
-
- CMPQ inl, $192
- JB openSSLTail192Store
-
- polyAdd(176(inp))
- polyMul
+ ADDQ $0x10, R9
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ CMPQ R9, CX
+ JB openSSLTail192LoopA
+ CMPQ R9, $0xa0
+ JNE openSSLTail192LoopB
+ CMPQ BX, $0xb0
+ JB openSSLTail192Store
+ ADDQ 160(SI), R10
+ ADCQ 168(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ CMPQ BX, $0xc0
+ JB openSSLTail192Store
+ ADDQ 176(SI), R10
+ ADCQ 184(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openSSLTail192Store:
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
- PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
- PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
- MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
-
- MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
- PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
- SUBQ $128, inl
- LEAQ 128(inp), inp
- LEAQ 128(oup), oup
- JMP openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 32(BP), X5
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 48(BP), X8
+ PADDL 112(BP), X9
+ PADDL 96(BP), X10
+ PADDL 80(BP), X11
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X2
+ PXOR X13, X5
+ PXOR X14, X8
+ PXOR X15, X11
+ MOVOU X2, (DI)
+ MOVOU X5, 16(DI)
+ MOVOU X8, 32(DI)
+ MOVOU X11, 48(DI)
+ MOVOU 64(SI), X12
+ MOVOU 80(SI), X13
+ MOVOU 96(SI), X14
+ MOVOU 112(SI), X15
+ PXOR X12, X1
+ PXOR X13, X4
+ PXOR X14, X7
+ PXOR X15, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ SUBQ $0x80, BX
+ LEAQ 128(SI), SI
+ LEAQ 128(DI), DI
+ JMP openSSETail64DecLoop
+
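+ // Special optimization for the last 256 bytes of ciphertext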
openSSETail256:
- // Need to decrypt up to 256 bytes - prepare four blocks
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
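+ // Need to decrypt up to 256 bytes - prepare four blocks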
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
- XORQ itr2, itr2
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
+ XORQ R9, R9
openSSETail256Loop:
- // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication
- polyAdd(0(inp)(itr2*1))
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
- polyMulStage1
- polyMulStage2
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyMulStage3
- polyMulReduceStage
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- ADDQ $2*8, itr2
- CMPQ itr2, $160
- JB openSSETail256Loop
- MOVQ inl, itr1
- ANDQ $-16, itr1
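+ // This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication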
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ BYTE $0x66 // PALIGNR $0x04, X3, X3
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66 // PALIGNR $0x04, X4, X4
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66 // PALIGNR $0x04, X5, X5
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66 // PALIGNR $0x04, X13, X13
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66 // PALIGNR $0x08, X6, X6
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66 // PALIGNR $0x08, X7, X7
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66 // PALIGNR $0x08, X8, X8
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66 // PALIGNR $0x08, X14, X14
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66 // PALIGNR $0x0c, X9, X9
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66 // PALIGNR $0x0c, X10, X10
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66 // PALIGNR $0x0c, X11, X11
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66 // PALIGNR $0x0c, X15, X15
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ BYTE $0x66 // PALIGNR $0x0c, X3, X3
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66 // PALIGNR $0x0c, X4, X4
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66 // PALIGNR $0x0c, X5, X5
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66 // PALIGNR $0x0c, X13, X13
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66 // PALIGNR $0x08, X6, X6
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66 // PALIGNR $0x08, X7, X7
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66 // PALIGNR $0x08, X8, X8
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66 // PALIGNR $0x08, X14, X14
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66 // PALIGNR $0x04, X9, X9
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66 // PALIGNR $0x04, X10, X10
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66 // PALIGNR $0x04, X11, X11
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66 // PALIGNR $0x04, X15, X15
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ ADDQ $0x10, R9
+ CMPQ R9, $0xa0
+ JB openSSETail256Loop
+ MOVQ BX, CX
+ ANDQ $-16, CX
openSSETail256HashLoop:
- polyAdd(0(inp)(itr2*1))
- polyMul
- ADDQ $2*8, itr2
- CMPQ itr2, itr1
- JB openSSETail256HashLoop
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ $0x10, R9
+ CMPQ R9, CX
+ JB openSSETail256HashLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
- MOVO D3, tmpStore
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X6
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 80(BP), X9
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
+ MOVO X15, 64(BP)
// Load - xor - store
- MOVOU (0*16)(inp), D3; PXOR D3, A0
- MOVOU (1*16)(inp), D3; PXOR D3, B0
- MOVOU (2*16)(inp), D3; PXOR D3, C0
- MOVOU (3*16)(inp), D3; PXOR D3, D0
- MOVOU A0, (0*16)(oup)
- MOVOU B0, (1*16)(oup)
- MOVOU C0, (2*16)(oup)
- MOVOU D0, (3*16)(oup)
- MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
- PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
- MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
- PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
- MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
- LEAQ 192(inp), inp
- LEAQ 192(oup), oup
- SUBQ $192, inl
- MOVO A3, A0
- MOVO B3, B0
- MOVO C3, C0
- MOVO tmpStore, D0
-
- JMP openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
+ MOVOU (SI), X15
+ PXOR X15, X0
+ MOVOU 16(SI), X15
+ PXOR X15, X3
+ MOVOU 32(SI), X15
+ PXOR X15, X6
+ MOVOU 48(SI), X15
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVOU 64(SI), X0
+ MOVOU 80(SI), X3
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X9
+ PXOR X0, X1
+ PXOR X3, X4
+ PXOR X6, X7
+ PXOR X9, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ MOVOU 128(SI), X0
+ MOVOU 144(SI), X3
+ MOVOU 160(SI), X6
+ MOVOU 176(SI), X9
+ PXOR X0, X2
+ PXOR X3, X5
+ PXOR X6, X8
+ PXOR X9, X11
+ MOVOU X2, 128(DI)
+ MOVOU X5, 144(DI)
+ MOVOU X8, 160(DI)
+ MOVOU X11, 176(DI)
+ LEAQ 192(SI), SI
+ LEAQ 192(DI), DI
+ SUBQ $0xc0, BX
+ MOVO X12, X0
+ MOVO X13, X3
+ MOVO X14, X6
+ MOVO 64(BP), X9
+ JMP openSSETail64DecLoop
+
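+ // AVX2 code path of chacha20Poly1305Open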
chacha20Poly1305Open_AVX2:
VZEROUPPER
- VMOVDQU ·chacha20Constants<>(SB), AA0
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
- BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
- VPADDD ·avx2InitMask<>(SB), DD0, DD0
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ BYTE $0xc4 // VBROADCASTI128 16(R8), Y14
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x70
+ BYTE $0x10
+ BYTE $0xc4 // VBROADCASTI128 32(R8), Y12
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x20
+ BYTE $0xc4 // VBROADCASTI128 48(R8), Y4
+ BYTE $0xc2
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x30
+ VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
// Special optimization, for very short buffers
- CMPQ inl, $192
+ CMPQ BX, $0xc0
JBE openAVX2192
- CMPQ inl, $320
+ CMPQ BX, $0x00000140
JBE openAVX2320
// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
- VMOVDQA BB0, state1StoreAVX2
- VMOVDQA CC0, state2StoreAVX2
- VMOVDQA DD0, ctr3StoreAVX2
- MOVQ $10, itr2
+ VMOVDQA Y14, 32(BP)
+ VMOVDQA Y12, 64(BP)
+ VMOVDQA Y4, 192(BP)
+ MOVQ $0x0000000a, R9
openAVX2PreparePolyKey:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
- DECQ itr2
- JNE openAVX2PreparePolyKey
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0
- VPADDD state1StoreAVX2, BB0, BB0
- VPADDD state2StoreAVX2, CC0, CC0
- VPADDD ctr3StoreAVX2, DD0, DD0
-
- VPERM2I128 $0x02, AA0, BB0, TT0
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ DECQ R9
+ JNE openAVX2PreparePolyKey
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 192(BP), Y4, Y4
+ VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for the first 64 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
// Hash AD + first 64 bytes
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
- XORQ itr1, itr1
+ XORQ CX, CX
openAVX2InitialHash64:
- polyAdd(0(inp)(itr1*1))
- polyMulAVX2
- ADDQ $16, itr1
- CMPQ itr1, $64
- JNE openAVX2InitialHash64
+ ADDQ (SI)(CX*1), R10
+ ADCQ 8(SI)(CX*1), R11
+ ADCQ $0x01, R12
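+ // Multiply the Poly1305 accumulator by r and partially reduce mod 2^130 - 5 (MULX form)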
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ $0x10, CX
+ CMPQ CX, $0x40
+ JNE openAVX2InitialHash64
// Decrypt the first 64 bytes
- VPXOR (0*32)(inp), AA0, AA0
- VPXOR (1*32)(inp), BB0, BB0
- VMOVDQU AA0, (0*32)(oup)
- VMOVDQU BB0, (1*32)(oup)
- LEAQ (2*32)(inp), inp
- LEAQ (2*32)(oup), oup
- SUBQ $64, inl
+ VPXOR (SI), Y0, Y0
+ VPXOR 32(SI), Y14, Y14
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y14, 32(DI)
+ LEAQ 64(SI), SI
+ LEAQ 64(DI), DI
+ SUBQ $0x40, BX
openAVX2MainLoop:
- CMPQ inl, $512
+ CMPQ BX, $0x00000200
JB openAVX2MainLoopDone
// Load state, increment counter blocks, store the incremented counters
- VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
- XORQ itr1, itr1
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ XORQ CX, CX
openAVX2InternalLoop:
- // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
- // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
- polyAdd(0*8(inp)(itr1*1))
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage1_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulStage2_AVX2
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyMulStage3_AVX2
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- polyAdd(2*8(inp)(itr1*1))
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage1_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage2_AVX2
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage3_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulReduceStage
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(4*8(inp)(itr1*1))
- LEAQ (6*8)(itr1), itr1
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage1_AVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- polyMulStage2_AVX2
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage3_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
- CMPQ itr1, $480
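+ // This loop interleaves 2 quarter rounds with 3 poly multiplications;
+ // per 512 bytes of stream, 480 bytes of ciphertext are hashed.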
+ ADDQ (SI)(CX*1), R10
+ ADCQ 8(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ ADDQ 16(SI)(CX*1), R10
+ ADCQ 24(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 32(SI)(CX*1), R10
+ ADCQ 40(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ LEAQ 48(CX), CX
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ CMPQ CX, $0x000001e0
JNE openAVX2InternalLoop
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
- polyAdd(480(inp))
- polyMulAVX2
- VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
- VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
- VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+ ADDQ 480(SI), R10
+ ADCQ 488(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPERM2I128 $0x13, Y0, Y14, Y14
+ VPERM2I128 $0x02, Y12, Y4, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPXOR (SI), Y15, Y15
+ VPXOR 32(SI), Y0, Y0
+ VPXOR 64(SI), Y14, Y14
+ VPXOR 96(SI), Y12, Y12
+ VMOVDQU Y15, (DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y14, 64(DI)
+ VMOVDQU Y12, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
// and here
- polyAdd(496(inp))
- polyMulAVX2
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
- VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
- VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
- LEAQ (32*16)(inp), inp
- LEAQ (32*16)(oup), oup
- SUBQ $(32*16), inl
+ ADDQ 496(SI), R10
+ ADCQ 504(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ VPXOR 384(SI), Y0, Y0
+ VPXOR 416(SI), Y14, Y14
+ VPXOR 448(SI), Y12, Y12
+ VPXOR 480(SI), Y4, Y4
+ VMOVDQU Y0, 384(DI)
+ VMOVDQU Y14, 416(DI)
+ VMOVDQU Y12, 448(DI)
+ VMOVDQU Y4, 480(DI)
+ LEAQ 512(SI), SI
+ LEAQ 512(DI), DI
+ SUBQ $0x00000200, BX
JMP openAVX2MainLoop
openAVX2MainLoopDone:
// Handle the various tail sizes efficiently
- TESTQ inl, inl
+ TESTQ BX, BX
JE openSSEFinalize
- CMPQ inl, $128
+ CMPQ BX, $0x80
JBE openAVX2Tail128
- CMPQ inl, $256
+ CMPQ BX, $0x00000100
JBE openAVX2Tail256
- CMPQ inl, $384
+ CMPQ BX, $0x00000180
JBE openAVX2Tail384
JMP openAVX2Tail512
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
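+ // Special optimization for buffers smaller than 193 bytes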
openAVX2192:
- // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
- VMOVDQA AA0, AA1
- VMOVDQA BB0, BB1
- VMOVDQA CC0, CC1
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2
- VMOVDQA BB0, BB2
- VMOVDQA CC0, CC2
- VMOVDQA DD0, DD2
- VMOVDQA DD1, TT3
- MOVQ $10, itr2
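+ // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks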
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VMOVDQA Y4, Y2
+ VMOVDQA Y1, Y15
+ MOVQ $0x0000000a, R9
openAVX2192InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- DECQ itr2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ DECQ R9
JNE openAVX2192InnerCipherLoop
- VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
- VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
- VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
- VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, TT0
+ VPADDD Y6, Y0, Y0
+ VPADDD Y6, Y5, Y5
+ VPADDD Y10, Y14, Y14
+ VPADDD Y10, Y9, Y9
+ VPADDD Y8, Y12, Y12
+ VPADDD Y8, Y13, Y13
+ VPADDD Y2, Y4, Y4
+ VPADDD Y15, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 192 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
openAVX2ShortOpen:
// Hash
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openAVX2ShortOpenLoop:
- CMPQ inl, $32
+ CMPQ BX, $0x20
JB openAVX2ShortTail32
- SUBQ $32, inl
+ SUBQ $0x20, BX
// Load for hashing
- polyAdd(0*8(inp))
- polyMulAVX2
- polyAdd(2*8(inp))
- polyMulAVX2
+ ADDQ (SI), R10
+ ADCQ 8(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ 16(SI), R10
+ ADCQ 24(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Load for decryption
- VPXOR (inp), AA0, AA0
- VMOVDQU AA0, (oup)
- LEAQ (1*32)(inp), inp
- LEAQ (1*32)(oup), oup
+ VPXOR (SI), Y0, Y0
+ VMOVDQU Y0, (DI)
+ LEAQ 32(SI), SI
+ LEAQ 32(DI), DI
// Shift stream left
- VMOVDQA BB0, AA0
- VMOVDQA CC0, BB0
- VMOVDQA DD0, CC0
- VMOVDQA AA1, DD0
- VMOVDQA BB1, AA1
- VMOVDQA CC1, BB1
- VMOVDQA DD1, CC1
- VMOVDQA AA2, DD1
- VMOVDQA BB2, AA2
+ VMOVDQA Y14, Y0
+ VMOVDQA Y12, Y14
+ VMOVDQA Y4, Y12
+ VMOVDQA Y5, Y4
+ VMOVDQA Y9, Y5
+ VMOVDQA Y13, Y9
+ VMOVDQA Y1, Y13
+ VMOVDQA Y6, Y1
+ VMOVDQA Y10, Y6
JMP openAVX2ShortOpenLoop
openAVX2ShortTail32:
- CMPQ inl, $16
- VMOVDQA A0, A1
+ CMPQ BX, $0x10
+ VMOVDQA X0, X1
JB openAVX2ShortDone
-
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for hashing
- polyAdd(0*8(inp))
- polyMulAVX2
+ ADDQ (SI), R10
+ ADCQ 8(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Load for decryption
- VPXOR (inp), A0, T0
- VMOVDQU T0, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
- VPERM2I128 $0x11, AA0, AA0, AA0
- VMOVDQA A0, A1
+ VPXOR (SI), X0, X12
+ VMOVDQU X12, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ VPERM2I128 $0x11, Y0, Y0, Y0
+ VMOVDQA X0, X1
openAVX2ShortDone:
VZEROUPPER
JMP openSSETail16
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
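+ // Special optimization for buffers smaller than 321 bytes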
openAVX2320:
- // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
- VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
- MOVQ $10, itr2
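+ // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks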
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y14, Y7
+ VMOVDQA Y12, Y11
+ VMOVDQA Y4, Y15
+ MOVQ $0x0000000a, R9
openAVX2320InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
- DECQ itr2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ DECQ R9
JNE openAVX2320InnerCipherLoop
-
- VMOVDQA ·chacha20Constants<>(SB), TT0
- VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
- VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
- VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
- VMOVDQA ·avx2IncMask<>(SB), TT0
- VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD2, DD2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y3
+ VPADDD Y3, Y0, Y0
+ VPADDD Y3, Y5, Y5
+ VPADDD Y3, Y6, Y6
+ VPADDD Y7, Y14, Y14
+ VPADDD Y7, Y9, Y9
+ VPADDD Y7, Y10, Y10
+ VPADDD Y11, Y12, Y12
+ VPADDD Y11, Y13, Y13
+ VPADDD Y11, Y8, Y8
+ VMOVDQA ·avx2IncMask<>+0(SB), Y3
+ VPADDD Y15, Y4, Y4
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y1, Y1
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y2, Y2
// Clamp and store poly key
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
- VPERM2I128 $0x02, AA2, BB2, CC1
- VPERM2I128 $0x02, CC2, DD2, DD1
- VPERM2I128 $0x13, AA2, BB2, AA2
- VPERM2I128 $0x13, CC2, DD2, BB2
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
+ VPERM2I128 $0x02, Y6, Y10, Y13
+ VPERM2I128 $0x02, Y8, Y2, Y1
+ VPERM2I128 $0x13, Y6, Y10, Y6
+ VPERM2I128 $0x13, Y8, Y2, Y10
JMP openAVX2ShortOpen
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
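+ // Special optimization for the last 128 bytes of ciphertext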
openAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks
- VMOVDQA ·chacha20Constants<>(SB), AA1
- VMOVDQA state1StoreAVX2, BB1
- VMOVDQA state2StoreAVX2, CC1
- VMOVDQA ctr3StoreAVX2, DD1
- VPADDD ·avx2IncMask<>(SB), DD1, DD1
- VMOVDQA DD1, DD0
-
- XORQ itr2, itr2
- MOVQ inl, itr1
- ANDQ $-16, itr1
- TESTQ itr1, itr1
- JE openAVX2Tail128LoopB
+ VMOVDQA ·chacha20Constants<>+0(SB), Y5
+ VMOVDQA 32(BP), Y9
+ VMOVDQA 64(BP), Y13
+ VMOVDQA 192(BP), Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y1
+ VMOVDQA Y1, Y4
+ XORQ R9, R9
+ MOVQ BX, CX
+ ANDQ $-16, CX
+ TESTQ CX, CX
+ JE openAVX2Tail128LoopB
openAVX2Tail128LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMulAVX2
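+ // Perform ChaCha rounds, while hashing the remaining input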
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openAVX2Tail128LoopB:
- ADDQ $16, itr2
- chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD1, DD1, DD1
- CMPQ itr2, itr1
- JB openAVX2Tail128LoopA
- CMPQ itr2, $160
- JNE openAVX2Tail128LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA1, AA1
- VPADDD state1StoreAVX2, BB1, BB1
- VPADDD state2StoreAVX2, CC1, CC1
- VPADDD DD0, DD1, DD1
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+ ADDQ $0x10, R9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y1, Y1, Y1
+ CMPQ R9, CX
+ JB openAVX2Tail128LoopA
+ CMPQ R9, $0xa0
+ JNE openAVX2Tail128LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 64(BP), Y13, Y13
+ VPADDD Y4, Y1, Y1
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
openAVX2TailLoop:
- CMPQ inl, $32
+ CMPQ BX, $0x20
JB openAVX2Tail
- SUBQ $32, inl
+ SUBQ $0x20, BX
// Load for decryption
- VPXOR (inp), AA0, AA0
- VMOVDQU AA0, (oup)
- LEAQ (1*32)(inp), inp
- LEAQ (1*32)(oup), oup
- VMOVDQA BB0, AA0
- VMOVDQA CC0, BB0
- VMOVDQA DD0, CC0
+ VPXOR (SI), Y0, Y0
+ VMOVDQU Y0, (DI)
+ LEAQ 32(SI), SI
+ LEAQ 32(DI), DI
+ VMOVDQA Y14, Y0
+ VMOVDQA Y12, Y14
+ VMOVDQA Y4, Y12
JMP openAVX2TailLoop
openAVX2Tail:
- CMPQ inl, $16
- VMOVDQA A0, A1
+ CMPQ BX, $0x10
+ VMOVDQA X0, X1
JB openAVX2TailDone
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for decryption
- VPXOR (inp), A0, T0
- VMOVDQU T0, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
- VPERM2I128 $0x11, AA0, AA0, AA0
- VMOVDQA A0, A1
+ VPXOR (SI), X0, X12
+ VMOVDQU X12, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ VPERM2I128 $0x11, Y0, Y0, Y0
+ VMOVDQA X0, X1
openAVX2TailDone:
VZEROUPPER
JMP openSSETail16
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
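+ // Special optimization for the last 256 bytes of ciphertext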
openAVX2Tail256:
- // Need to decrypt up to 256 bytes - prepare four blocks
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA DD0, TT1
- VMOVDQA DD1, TT2
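+ // Need to decrypt up to 256 bytes - prepare four blocks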
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y4, Y7
+ VMOVDQA Y1, Y11
// Compute the number of iterations that will hash data
- MOVQ inl, tmpStoreAVX2
- MOVQ inl, itr1
- SUBQ $128, itr1
- SHRQ $4, itr1
- MOVQ $10, itr2
- CMPQ itr1, $10
- CMOVQGT itr2, itr1
- MOVQ inp, inl
- XORQ itr2, itr2
+ MOVQ BX, 224(BP)
+ MOVQ BX, CX
+ SUBQ $0x80, CX
+ SHRQ $0x04, CX
+ MOVQ $0x0000000a, R9
+ CMPQ CX, $0x0a
+ CMOVQGT R9, CX
+ MOVQ SI, BX
+ XORQ R9, R9
openAVX2Tail256LoopA:
- polyAdd(0(inl))
- polyMulAVX2
- LEAQ 16(inl), inl
+ ADDQ (BX), R10
+ ADCQ 8(BX), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(BX), BX
- // Perform ChaCha rounds, while hashing the remaining input
openAVX2Tail256LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- INCQ itr2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- CMPQ itr2, itr1
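+ // Perform ChaCha rounds, while hashing the remaining input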
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ INCQ R9
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ CMPQ R9, CX
JB openAVX2Tail256LoopA
+ CMPQ R9, $0x0a
+ JNE openAVX2Tail256LoopB
+ MOVQ BX, R9
+ SUBQ SI, BX
+ MOVQ BX, CX
+ MOVQ 224(BP), BX
- CMPQ itr2, $10
- JNE openAVX2Tail256LoopB
-
- MOVQ inl, itr2
- SUBQ inp, inl
- MOVQ inl, itr1
- MOVQ tmpStoreAVX2, inl
-
- // Hash the remainder of data (if any)
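+ // Hash the remainder of the data (if any)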
openAVX2Tail256Hash:
- ADDQ $16, itr1
- CMPQ itr1, inl
- JGT openAVX2Tail256HashEnd
- polyAdd (0(itr2))
- polyMulAVX2
- LEAQ 16(itr2), itr2
- JMP openAVX2Tail256Hash
-
-// Store 128 bytes safely, then go to store loop
+ ADDQ $0x10, CX
+ CMPQ CX, BX
+ JGT openAVX2Tail256HashEnd
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ JMP openAVX2Tail256Hash
+
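+ // Store 128 bytes safely, then go to store loop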
openAVX2Tail256HashEnd:
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
- VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-
- VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
- VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
- LEAQ (4*32)(inp), inp
- LEAQ (4*32)(oup), oup
- SUBQ $4*32, inl
-
- JMP openAVX2TailLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD Y7, Y4, Y4
+ VPADDD Y11, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y6
+ VPERM2I128 $0x02, Y12, Y4, Y10
+ VPERM2I128 $0x13, Y0, Y14, Y8
+ VPERM2I128 $0x13, Y12, Y4, Y2
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR (SI), Y6, Y6
+ VPXOR 32(SI), Y10, Y10
+ VPXOR 64(SI), Y8, Y8
+ VPXOR 96(SI), Y2, Y2
+ VMOVDQU Y6, (DI)
+ VMOVDQU Y10, 32(DI)
+ VMOVDQU Y8, 64(DI)
+ VMOVDQU Y2, 96(DI)
+ LEAQ 128(SI), SI
+ LEAQ 128(DI), DI
+ SUBQ $0x80, BX
+ JMP openAVX2TailLoop
+
openAVX2Tail384:
// Need to decrypt up to 384 bytes - prepare six blocks
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA DD0, ctr0StoreAVX2
- VMOVDQA DD1, ctr1StoreAVX2
- VMOVDQA DD2, ctr2StoreAVX2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
// Compute the number of iterations that will hash two blocks of data
- MOVQ inl, tmpStoreAVX2
- MOVQ inl, itr1
- SUBQ $256, itr1
- SHRQ $4, itr1
- ADDQ $6, itr1
- MOVQ $10, itr2
- CMPQ itr1, $10
- CMOVQGT itr2, itr1
- MOVQ inp, inl
- XORQ itr2, itr2
-
- // Perform ChaCha rounds, while hashing the remaining input
+ MOVQ BX, 224(BP)
+ MOVQ BX, CX
+ SUBQ $0x00000100, CX
+ SHRQ $0x04, CX
+ ADDQ $0x06, CX
+ MOVQ $0x0000000a, R9
+ CMPQ CX, $0x0a
+ CMOVQGT R9, CX
+ MOVQ SI, BX
+ XORQ R9, R9
+
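+	// Perform ChaCha rounds, while hashing the remaining input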
openAVX2Tail384LoopB:
- polyAdd(0(inl))
- polyMulAVX2
- LEAQ 16(inl), inl
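+	// Inlined polyAdd/polyMulAVX2: absorb 16 bytes at (BX) into the Poly1305 accumulator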
+ ADDQ (BX), R10
+ ADCQ 8(BX), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(BX), BX
openAVX2Tail384LoopA:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- polyAdd(0(inl))
- polyMulAVX2
- LEAQ 16(inl), inl
- INCQ itr2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-
- CMPQ itr2, itr1
- JB openAVX2Tail384LoopB
-
- CMPQ itr2, $10
- JNE openAVX2Tail384LoopA
-
- MOVQ inl, itr2
- SUBQ inp, inl
- MOVQ inl, itr1
- MOVQ tmpStoreAVX2, inl
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ ADDQ (BX), R10
+ ADCQ 8(BX), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(BX), BX
+ INCQ R9
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ CMPQ R9, CX
+ JB openAVX2Tail384LoopB
+ CMPQ R9, $0x0a
+ JNE openAVX2Tail384LoopA
+ MOVQ BX, R9
+ SUBQ SI, BX
+ MOVQ BX, CX
+ MOVQ 224(BP), BX
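+	// Hash the remainder of data (if any)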
openAVX2Tail384Hash:
- ADDQ $16, itr1
- CMPQ itr1, inl
- JGT openAVX2Tail384HashEnd
- polyAdd(0(itr2))
- polyMulAVX2
- LEAQ 16(itr2), itr2
- JMP openAVX2Tail384Hash
-
-// Store 256 bytes safely, then go to store loop
+ ADDQ $0x10, CX
+ CMPQ CX, BX
+ JGT openAVX2Tail384HashEnd
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ JMP openAVX2Tail384Hash
+
openAVX2Tail384HashEnd:
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
- VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
- VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
- VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
- VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
- VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- LEAQ (8*32)(inp), inp
- LEAQ (8*32)(oup), oup
- SUBQ $8*32, inl
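+	// Store 256 bytes safely, then go to store loop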
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPERM2I128 $0x02, Y12, Y4, Y7
+ VPERM2I128 $0x13, Y0, Y14, Y11
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR (SI), Y3, Y3
+ VPXOR 32(SI), Y7, Y7
+ VPXOR 64(SI), Y11, Y11
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y7, 32(DI)
+ VMOVDQU Y11, 64(DI)
+ VMOVDQU Y15, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y3
+ VPERM2I128 $0x02, Y13, Y1, Y7
+ VPERM2I128 $0x13, Y5, Y9, Y11
+ VPERM2I128 $0x13, Y13, Y1, Y15
+ VPXOR 128(SI), Y3, Y3
+ VPXOR 160(SI), Y7, Y7
+ VPXOR 192(SI), Y11, Y11
+ VPXOR 224(SI), Y15, Y15
+ VMOVDQU Y3, 128(DI)
+ VMOVDQU Y7, 160(DI)
+ VMOVDQU Y11, 192(DI)
+ VMOVDQU Y15, 224(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ LEAQ 256(SI), SI
+ LEAQ 256(DI), DI
+ SUBQ $0x00000100, BX
JMP openAVX2TailLoop
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
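+// Special optimization for the last 512 bytes of ciphertext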
openAVX2Tail512:
- VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
- XORQ itr1, itr1
- MOVQ inp, itr2
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ XORQ CX, CX
+ MOVQ SI, R9
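+	// Perform ChaCha rounds, while hashing the remaining input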
openAVX2Tail512LoopB:
- polyAdd(0(itr2))
- polyMulAVX2
- LEAQ (2*8)(itr2), itr2
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
openAVX2Tail512LoopA:
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyAdd(0*8(itr2))
- polyMulAVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(2*8(itr2))
- polyMulAVX2
- LEAQ (4*8)(itr2), itr2
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
- INCQ itr1
- CMPQ itr1, $4
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 16(R9), R10
+ ADCQ 24(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(R9), R9
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ INCQ CX
+ CMPQ CX, $0x04
JLT openAVX2Tail512LoopB
-
- CMPQ itr1, $10
- JNE openAVX2Tail512LoopA
-
- MOVQ inl, itr1
- SUBQ $384, itr1
- ANDQ $-16, itr1
+ CMPQ CX, $0x0a
+ JNE openAVX2Tail512LoopA
+ MOVQ BX, CX
+ SUBQ $0x00000180, CX
+ ANDQ $-16, CX
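+	// Hash the remainder of data (if any)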
openAVX2Tail512HashLoop:
- TESTQ itr1, itr1
+ TESTQ CX, CX
JE openAVX2Tail512HashEnd
- polyAdd(0(itr2))
- polyMulAVX2
- LEAQ 16(itr2), itr2
- SUBQ $16, itr1
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ SUBQ $0x10, CX
JMP openAVX2Tail512HashLoop
openAVX2Tail512HashEnd:
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
- VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
- VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
- VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-
- LEAQ (12*32)(inp), inp
- LEAQ (12*32)(oup), oup
- SUBQ $12*32, inl
-
- JMP openAVX2TailLoop
-
-// ----------------------------------------------------------------------------
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Seal(dst, key, src, ad []byte)
-TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
- // For aligned stack access
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPERM2I128 $0x13, Y0, Y14, Y14
+ VPERM2I128 $0x02, Y12, Y4, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPXOR (SI), Y15, Y15
+ VPXOR 32(SI), Y0, Y0
+ VPXOR 64(SI), Y14, Y14
+ VPXOR 96(SI), Y12, Y12
+ VMOVDQU Y15, (DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y14, 64(DI)
+ VMOVDQU Y12, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ LEAQ 384(SI), SI
+ LEAQ 384(DI), DI
+ SUBQ $0x00000180, BX
+ JMP openAVX2TailLoop
+
+DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
+DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
+GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32
+
+DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
+DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
+DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
+DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
+GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32
+
+DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001
+DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000
+GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16
+
+DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+8(SB)/8, $0x0000000000000000
+DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+24(SB)/8, $0x0000000000000000
+DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+40(SB)/8, $0x0000000000000000
+DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+56(SB)/8, $0x0000000000000000
+DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+72(SB)/8, $0x0000000000000000
+DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+88(SB)/8, $0x0000000000000000
+DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
+DATA ·andMask<>+104(SB)/8, $0x0000000000000000
+DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+120(SB)/8, $0x0000000000000000
+DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
+GLOBL ·andMask<>(SB), RODATA|NOPTR, $240
+
+DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
+DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
+DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32
+
+DATA ·rol16<>+0(SB)/8, $0x0504070601000302
+DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
+DATA ·rol16<>+16(SB)/8, $0x0504070601000302
+DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
+GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
+
+DATA ·rol8<>+0(SB)/8, $0x0605040702010003
+DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
+DATA ·rol8<>+16(SB)/8, $0x0605040702010003
+DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
+GLOBL ·rol8<>(SB), RODATA|NOPTR, $32
+
+DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
+DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
+DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32
+
+// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Seal(SB), $288-96
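+	// For aligned stack access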
MOVQ SP, BP
- ADDQ $32, BP
+ ADDQ $0x20, BP
ANDQ $-32, BP
- MOVQ dst+0(FP), oup
- MOVQ key+24(FP), keyp
- MOVQ src+48(FP), inp
- MOVQ src_len+56(FP), inl
- MOVQ ad+72(FP), adp
-
- CMPB ·useAVX2(SB), $1
+ MOVQ dst_base+0(FP), DI
+ MOVQ key_base+24(FP), R8
+ MOVQ src_base+48(FP), SI
+ MOVQ src_len+56(FP), BX
+ MOVQ ad_base+72(FP), CX
+ CMPB ·useAVX2+0(SB), $0x01
JE chacha20Poly1305Seal_AVX2
// Special optimization for very short buffers
- CMPQ inl, $128
- JBE sealSSE128 // About 15% faster
+ CMPQ BX, $0x80
+ JBE sealSSE128
// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
- MOVOU ·chacha20Constants<>(SB), A0
- MOVOU (1*16)(keyp), B0
- MOVOU (2*16)(keyp), C0
- MOVOU (3*16)(keyp), D0
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
// Store state on stack for future use
- MOVO B0, state1Store
- MOVO C0, state2Store
+ MOVO X3, 32(BP)
+ MOVO X6, 48(BP)
// Load state, increment counter blocks
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
- MOVQ $10, itr2
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
+ MOVQ $0x0000000a, R9
sealSSEIntroLoop:
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
-
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- DECQ itr2
- JNE sealSSEIntroLoop
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
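+	// Byte-encoded PALIGNR $0x04/$0x08/$0x0c lane rotations (the shiftB*Left, shiftC*Left, shiftD*Left macros from the hand-written version)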
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
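+	// Byte-encoded PALIGNR $0x0c/$0x08/$0x04 lane rotations (the shiftB*Right, shiftC*Right, shiftD*Right macros from the hand-written version)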
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ DECQ R9
+ JNE sealSSEIntroLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
// Clamp and store the key
- PAND ·polyClampMask<>(SB), A0
- MOVO A0, rStore
- MOVO B0, sStore
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVO X0, (BP)
+ MOVO X3, 16(BP)
// Hash AAD
- MOVQ ad_len+80(FP), itr2
- CALL polyHashADInternal<>(SB)
-
- MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
- PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
- MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
- MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
- PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
- MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
-
- MOVQ $128, itr1
- SUBQ $128, inl
- LEAQ 128(inp), inp
-
- MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
-
- CMPQ inl, $64
- JBE sealSSE128SealHash
-
- MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
- PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
- MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
-
- ADDQ $64, itr1
- SUBQ $64, inl
- LEAQ 64(inp), inp
-
- MOVQ $2, itr1
- MOVQ $8, itr2
-
- CMPQ inl, $64
- JBE sealSSETail64
- CMPQ inl, $128
- JBE sealSSETail128
- CMPQ inl, $192
- JBE sealSSETail192
+ MOVQ ad_len+80(FP), R9
+ CALL polyHashADInternal<>(SB)
+ MOVOU (SI), X0
+ MOVOU 16(SI), X3
+ MOVOU 32(SI), X6
+ MOVOU 48(SI), X9
+ PXOR X0, X1
+ PXOR X3, X4
+ PXOR X6, X7
+ PXOR X9, X10
+ MOVOU X1, (DI)
+ MOVOU X4, 16(DI)
+ MOVOU X7, 32(DI)
+ MOVOU X10, 48(DI)
+ MOVOU 64(SI), X0
+ MOVOU 80(SI), X3
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X9
+ PXOR X0, X2
+ PXOR X3, X5
+ PXOR X6, X8
+ PXOR X9, X11
+ MOVOU X2, 64(DI)
+ MOVOU X5, 80(DI)
+ MOVOU X8, 96(DI)
+ MOVOU X11, 112(DI)
+ MOVQ $0x00000080, CX
+ SUBQ $0x80, BX
+ LEAQ 128(SI), SI
+ MOVO X12, X1
+ MOVO X13, X4
+ MOVO X14, X7
+ MOVO X15, X10
+ CMPQ BX, $0x40
+ JBE sealSSE128SealHash
+ MOVOU (SI), X0
+ MOVOU 16(SI), X3
+ MOVOU 32(SI), X6
+ MOVOU 48(SI), X9
+ PXOR X0, X12
+ PXOR X3, X13
+ PXOR X6, X14
+ PXOR X9, X15
+ MOVOU X12, 128(DI)
+ MOVOU X13, 144(DI)
+ MOVOU X14, 160(DI)
+ MOVOU X15, 176(DI)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ LEAQ 64(SI), SI
+ MOVQ $0x00000002, CX
+ MOVQ $0x00000008, R9
+ CMPQ BX, $0x40
+ JBE sealSSETail64
+ CMPQ BX, $0x80
+ JBE sealSSETail128
+ CMPQ BX, $0xc0
+ JBE sealSSETail192
sealSSEMainLoop:
// Load state, increment counter blocks
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
sealSSEInnerLoop:
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyAdd(0(oup))
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
- polyMulStage1
- polyMulStage2
- LEAQ (2*8)(oup), oup
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- polyMulStage3
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyMulReduceStage
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- DECQ itr2
- JGE sealSSEInnerLoop
- polyAdd(0(oup))
- polyMul
- LEAQ (2*8)(oup), oup
- DECQ itr1
- JG sealSSEInnerLoop
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
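+	// Byte-encoded PALIGNR $0x04/$0x08/$0x0c lane rotations (shiftB*Left, shiftC*Left, shiftD*Left)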
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ LEAQ 16(DI), DI
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
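+	// Byte-encoded PALIGNR $0x0c/$0x08/$0x04 lane rotations (shiftB*Right, shiftC*Right, shiftD*Right)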
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ DECQ R9
+ JGE sealSSEInnerLoop
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ DECQ CX
+ JG sealSSEInnerLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
- MOVO D3, tmpStore
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X6
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 80(BP), X9
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
+ MOVO X15, 64(BP)
// Load - xor - store
- MOVOU (0*16)(inp), D3; PXOR D3, A0
- MOVOU (1*16)(inp), D3; PXOR D3, B0
- MOVOU (2*16)(inp), D3; PXOR D3, C0
- MOVOU (3*16)(inp), D3; PXOR D3, D0
- MOVOU A0, (0*16)(oup)
- MOVOU B0, (1*16)(oup)
- MOVOU C0, (2*16)(oup)
- MOVOU D0, (3*16)(oup)
- MOVO tmpStore, D3
-
- MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
- PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
- MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
- PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
- MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
- ADDQ $192, inp
- MOVQ $192, itr1
- SUBQ $192, inl
- MOVO A3, A1
- MOVO B3, B1
- MOVO C3, C1
- MOVO D3, D1
- CMPQ inl, $64
+ MOVOU (SI), X15
+ PXOR X15, X0
+ MOVOU 16(SI), X15
+ PXOR X15, X3
+ MOVOU 32(SI), X15
+ PXOR X15, X6
+ MOVOU 48(SI), X15
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVO 64(BP), X15
+ MOVOU 64(SI), X0
+ MOVOU 80(SI), X3
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X9
+ PXOR X0, X1
+ PXOR X3, X4
+ PXOR X6, X7
+ PXOR X9, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ MOVOU 128(SI), X0
+ MOVOU 144(SI), X3
+ MOVOU 160(SI), X6
+ MOVOU 176(SI), X9
+ PXOR X0, X2
+ PXOR X3, X5
+ PXOR X6, X8
+ PXOR X9, X11
+ MOVOU X2, 128(DI)
+ MOVOU X5, 144(DI)
+ MOVOU X8, 160(DI)
+ MOVOU X11, 176(DI)
+ ADDQ $0xc0, SI
+ MOVQ $0x000000c0, CX
+ SUBQ $0xc0, BX
+ MOVO X12, X1
+ MOVO X13, X4
+ MOVO X14, X7
+ MOVO X15, X10
+ CMPQ BX, $0x40
JBE sealSSE128SealHash
- MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
- PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
- MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
- LEAQ 64(inp), inp
- SUBQ $64, inl
- MOVQ $6, itr1
- MOVQ $4, itr2
- CMPQ inl, $192
+ MOVOU (SI), X0
+ MOVOU 16(SI), X3
+ MOVOU 32(SI), X6
+ MOVOU 48(SI), X9
+ PXOR X0, X12
+ PXOR X3, X13
+ PXOR X6, X14
+ PXOR X9, X15
+ MOVOU X12, 192(DI)
+ MOVOU X13, 208(DI)
+ MOVOU X14, 224(DI)
+ MOVOU X15, 240(DI)
+ LEAQ 64(SI), SI
+ SUBQ $0x40, BX
+ MOVQ $0x00000006, CX
+ MOVQ $0x00000004, R9
+ CMPQ BX, $0xc0
JG sealSSEMainLoop
-
- MOVQ inl, itr1
- TESTQ inl, inl
+ MOVQ BX, CX
+ TESTQ BX, BX
JE sealSSE128SealHash
- MOVQ $6, itr1
- CMPQ inl, $64
+ MOVQ $0x00000006, CX
+ CMPQ BX, $0x40
JBE sealSSETail64
- CMPQ inl, $128
+ CMPQ BX, $0x80
JBE sealSSETail128
JMP sealSSETail192
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of plaintext
sealSSETail64:
- // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
- MOVO ·chacha20Constants<>(SB), A1
- MOVO state1Store, B1
- MOVO state2Store, C1
- MOVO ctr3Store, D1
- PADDL ·sseIncMask<>(SB), D1
- MOVO D1, ctr0Store
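+	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes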
+ MOVO ·chacha20Constants<>+0(SB), X1
+ MOVO 32(BP), X4
+ MOVO 48(BP), X7
+ MOVO 128(BP), X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 80(BP)
sealSSETail64LoopA:
- // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
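+	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext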
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealSSETail64LoopB:
- chachaQR(A1, B1, C1, D1, T1)
- shiftB1Left; shiftC1Left; shiftD1Left
- chachaQR(A1, B1, C1, D1, T1)
- shiftB1Right; shiftC1Right; shiftD1Right
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
-
- DECQ itr1
- JG sealSSETail64LoopA
-
- DECQ itr2
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x0c, X13
+ PSRLL $0x14, X4
+ PXOR X13, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x07, X13
+ PSRLL $0x19, X4
+ PXOR X13, X4
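+	// Byte-encoded PALIGNR: shiftB1Left, shiftC1Left, shiftD1Left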
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x0c, X13
+ PSRLL $0x14, X4
+ PXOR X13, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x07, X13
+ PSRLL $0x19, X4
+ PXOR X13, X4
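+	// Byte-encoded PALIGNR: shiftB1Right, shiftC1Right, shiftD1Right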
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ DECQ CX
+ JG sealSSETail64LoopA
+ DECQ R9
JGE sealSSETail64LoopB
- PADDL ·chacha20Constants<>(SB), A1
- PADDL state1Store, B1
- PADDL state2Store, C1
- PADDL ctr0Store, D1
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL 32(BP), X4
+ PADDL 48(BP), X7
+ PADDL 80(BP), X10
+ JMP sealSSE128Seal
- JMP sealSSE128Seal
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of plaintext
sealSSETail128:
- // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
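+	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes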
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 80(BP)
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 96(BP)
sealSSETail128LoopA:
- // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
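+	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext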
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealSSETail128LoopB:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
-
- DECQ itr1
- JG sealSSETail128LoopA
-
- DECQ itr2
- JGE sealSSETail128LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
- PADDL state1Store, B0; PADDL state1Store, B1
- PADDL state2Store, C0; PADDL state2Store, C1
- PADDL ctr0Store, D0; PADDL ctr1Store, D1
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
- MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
-
- MOVQ $64, itr1
- LEAQ 64(inp), inp
- SUBQ $64, inl
-
- JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of plaintext
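+ // One ChaCha double round over both blocks (PALIGNR rotations between the column and
+ // diagonal halves), hashing another 16-byte ciphertext block in the middle.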
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
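+ // While CX > 0, loop back through LoopA, which hashes an extra 16-byte block before
+ // the rounds; R9 then controls the remaining passes through LoopB.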
+ DECQ CX
+ JG sealSSETail128LoopA
+ DECQ R9
+ JGE sealSSETail128LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 80(BP), X9
+ PADDL 96(BP), X10
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X0
+ PXOR X13, X3
+ PXOR X14, X6
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVQ $0x00000040, CX
+ LEAQ 64(SI), SI
+ SUBQ $0x40, BX
+ JMP sealSSE128SealHash
+
sealSSETail192:
- // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
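+ // Need to encrypt up to 192 bytes - prepare three blocks (counters staged at 80/96/112(BP)), hash 192 or 256 bytes.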
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 80(BP)
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 96(BP)
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X11, 112(BP)
sealSSETail192LoopA:
- // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealSSETail192LoopB:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- shiftB2Left; shiftC2Left; shiftD2Left
-
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
-
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
- shiftB2Right; shiftC2Right; shiftD2Right
-
- DECQ itr1
- JG sealSSETail192LoopA
-
- DECQ itr2
- JGE sealSSETail192LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
- PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
- PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
- MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
- MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
- PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
- MOVO A2, A1
- MOVO B2, B1
- MOVO C2, C1
- MOVO D2, D1
- MOVQ $128, itr1
- LEAQ 128(inp), inp
- SUBQ $128, inl
-
- JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special seal optimization for buffers smaller than 129 bytes
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ DECQ CX
+ JG sealSSETail192LoopA
+ DECQ R9
+ JGE sealSSETail192LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 32(BP), X5
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 48(BP), X8
+ PADDL 80(BP), X9
+ PADDL 96(BP), X10
+ PADDL 112(BP), X11
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X0
+ PXOR X13, X3
+ PXOR X14, X6
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVOU 64(SI), X12
+ MOVOU 80(SI), X13
+ MOVOU 96(SI), X14
+ MOVOU 112(SI), X15
+ PXOR X12, X1
+ PXOR X13, X4
+ PXOR X14, X7
+ PXOR X15, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ MOVO X2, X1
+ MOVO X5, X4
+ MOVO X8, X7
+ MOVO X11, X10
+ MOVQ $0x00000080, CX
+ LEAQ 128(SI), SI
+ SUBQ $0x80, BX
+ JMP sealSSE128SealHash
+
sealSSE128:
- // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
- MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
- MOVQ $10, itr2
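+ // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks.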
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X3, X13
+ MOVO X6, X14
+ MOVO X10, X15
+ MOVQ $0x0000000a, R9
sealSSE128InnerCipherLoop:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftB1Left; shiftB2Left
- shiftC0Left; shiftC1Left; shiftC2Left
- shiftD0Left; shiftD1Left; shiftD2Left
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftB1Right; shiftB2Right
- shiftC0Right; shiftC1Right; shiftC2Right
- shiftD0Right; shiftD1Right; shiftD2Right
- DECQ itr2
- JNE sealSSE128InnerCipherLoop
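+ // 10 double rounds of ChaCha20 over the three blocks; the BYTE sequences are
+ // PALIGNR row rotations (diagonalize and un-diagonalize).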
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ DECQ R9
+ JNE sealSSE128InnerCipherLoop
// X0|X3 (A0|B0 in the old register naming) hold the Poly1305 32-byte key; the rest can be discarded
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
- PADDL T2, C1; PADDL T2, C2
- PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
- PAND ·polyClampMask<>(SB), A0
- MOVOU A0, rStore
- MOVOU B0, sStore
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL X13, X3
+ PADDL X13, X4
+ PADDL X13, X5
+ PADDL X14, X7
+ PADDL X14, X8
+ PADDL X15, X10
+ PADDL ·sseIncMask<>+0(SB), X15
+ PADDL X15, X11
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVOU X0, (BP)
+ MOVOU X3, 16(BP)
// Hash
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
- XORQ itr1, itr1
+ XORQ CX, CX
sealSSE128SealHash:
- // itr1 holds the number of bytes encrypted but not yet hashed
- CMPQ itr1, $16
- JB sealSSE128Seal
- polyAdd(0(oup))
- polyMul
-
- SUBQ $16, itr1
- ADDQ $16, oup
-
- JMP sealSSE128SealHash
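+ // CX holds the number of bytes encrypted but not yet hashed; hash them 16 bytes at a time.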
+ CMPQ CX, $0x10
+ JB sealSSE128Seal
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ SUBQ $0x10, CX
+ ADDQ $0x10, DI
+ JMP sealSSE128SealHash
sealSSE128Seal:
- CMPQ inl, $16
+ CMPQ BX, $0x10
JB sealSSETail
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for encryption
- MOVOU (inp), T0
- PXOR T0, A1
- MOVOU A1, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
+ MOVOU (SI), X12
+ PXOR X12, X1
+ MOVOU X1, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
// Extract for hashing
- MOVQ A1, t0
- PSRLDQ $8, A1
- MOVQ A1, t1
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
+ MOVQ X1, R13
+ PSRLDQ $0x08, X1
+ MOVQ X1, R14
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Shift the stream "left"
- MOVO B1, A1
- MOVO C1, B1
- MOVO D1, C1
- MOVO A2, D1
- MOVO B2, A2
- MOVO C2, B2
- MOVO D2, C2
+ MOVO X4, X1
+ MOVO X7, X4
+ MOVO X10, X7
+ MOVO X2, X10
+ MOVO X5, X2
+ MOVO X8, X5
+ MOVO X11, X8
JMP sealSSE128Seal
sealSSETail:
- TESTQ inl, inl
+ TESTQ BX, BX
JE sealSSEFinalize
// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
- MOVQ inl, itr2
- SHLQ $4, itr2
- LEAQ ·andMask<>(SB), t0
- MOVQ inl, itr1
- LEAQ -1(inp)(inl*1), inp
- XORQ t2, t2
- XORQ t3, t3
+ MOVQ BX, R9
+ SHLQ $0x04, R9
+ LEAQ ·andMask<>+0(SB), R13
+ MOVQ BX, CX
+ LEAQ -1(SI)(BX*1), SI
+ XORQ R15, R15
+ XORQ R8, R8
XORQ AX, AX
sealSSETailLoadLoop:
- SHLQ $8, t2, t3
- SHLQ $8, t2
- MOVB (inp), AX
- XORQ AX, t2
- LEAQ -1(inp), inp
- DECQ itr1
+ SHLQ $0x08, R15, R8
+ SHLQ $0x08, R15
+ MOVB (SI), AX
+ XORQ AX, R15
+ LEAQ -1(SI), SI
+ DECQ CX
JNE sealSSETailLoadLoop
- MOVQ t2, 0+tmpStore
- MOVQ t3, 8+tmpStore
- PXOR 0+tmpStore, A1
- MOVOU A1, (oup)
- MOVOU -16(t0)(itr2*1), T0
- PAND T0, A1
- MOVQ A1, t0
- PSRLDQ $8, A1
- MOVQ A1, t1
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
-
- ADDQ inl, oup
+ MOVQ R15, 64(BP)
+ MOVQ R8, 72(BP)
+ PXOR 64(BP), X1
+ MOVOU X1, (DI)
+ MOVOU -16(R13)(R9*1), X12
+ PAND X12, X1
+ MOVQ X1, R13
+ PSRLDQ $0x08, X1
+ MOVQ X1, R14
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ BX, DI
sealSSEFinalize:
// Hash in the buffer lengths
- ADDQ ad_len+80(FP), acc0
- ADCQ src_len+56(FP), acc1
- ADCQ $1, acc2
- polyMul
+ ADDQ ad_len+80(FP), R10
+ ADCQ src_len+56(FP), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Final reduce
- MOVQ acc0, t0
- MOVQ acc1, t1
- MOVQ acc2, t2
- SUBQ $-5, acc0
- SBBQ $-1, acc1
- SBBQ $3, acc2
- CMOVQCS t0, acc0
- CMOVQCS t1, acc1
- CMOVQCS t2, acc2
+ MOVQ R10, R13
+ MOVQ R11, R14
+ MOVQ R12, R15
+ SUBQ $-5, R10
+ SBBQ $-1, R11
+ SBBQ $0x03, R12
+ CMOVQCS R13, R10
+ CMOVQCS R14, R11
+ CMOVQCS R15, R12
// Add in the "s" part of the key
- ADDQ 0+sStore, acc0
- ADCQ 8+sStore, acc1
+ ADDQ 16(BP), R10
+ ADCQ 24(BP), R11
// Finally store the tag at the end of the message
- MOVQ acc0, (0*8)(oup)
- MOVQ acc1, (1*8)(oup)
+ MOVQ R10, (DI)
+ MOVQ R11, 8(DI)
RET
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Seal_AVX2:
VZEROUPPER
- VMOVDQU ·chacha20Constants<>(SB), AA0
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
- BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
- VPADDD ·avx2InitMask<>(SB), DD0, DD0
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x70
+ BYTE $0x10
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x20
+ BYTE $0xc4
+ BYTE $0xc2
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x30
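+ // The BYTE sequences above encode VBROADCASTI128 16(R8), Y14; 32(R8), Y12; and 48(R8), Y4,
+ // broadcasting the key and counter/nonce rows into both 128-bit lanes.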
+ VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
// Special optimizations for very short buffers
- CMPQ inl, $192
- JBE seal192AVX2 // 33% faster
- CMPQ inl, $320
- JBE seal320AVX2 // 17% faster
+ CMPQ BX, $0x000000c0
+ JBE seal192AVX2
+ CMPQ BX, $0x00000140
+ JBE seal320AVX2
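+ // The 192-byte special case is about 33% faster, the 320-byte case about 17% faster.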
// For the general case, prepare the Poly1305 key first - as a byproduct we get 64 bytes of cipher stream
- VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
- VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
- VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
- VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
- VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
- VMOVDQA DD3, ctr3StoreAVX2
- MOVQ $10, itr2
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA Y14, 32(BP)
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA Y12, 64(BP)
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y4, 96(BP)
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y1, 128(BP)
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ MOVQ $0x0000000a, R9
sealAVX2IntroLoop:
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
- VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
- VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
- VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
- VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
- VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
- VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
- DECQ itr2
- JNE sealAVX2IntroLoop
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-
- VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
- VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
- VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
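+ // 10 ChaCha double rounds over four two-block AVX2 states; with all 16 YMM registers
+ // live, Y15 and then Y13 are spilled to 224(BP) (the old tmpStoreAVX2 slot).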
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y3, Y3, Y3
+ DECQ R9
+ JNE sealAVX2IntroLoop
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPERM2I128 $0x02, Y0, Y14, Y4
+ VPERM2I128 $0x13, Y0, Y14, Y0
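+ // Y12 now holds key-stream bytes 96-127, Y4 the Poly1305 key (the first 32 bytes of key stream), and Y0 key-stream bytes 64-95.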
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), DD0, DD0
- VMOVDQA DD0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y4, Y4
+ VMOVDQA Y4, (BP)
// Hash AD
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
// Can store at least 320 bytes
- VPXOR (0*32)(inp), AA0, AA0
- VPXOR (1*32)(inp), CC0, CC0
- VMOVDQU AA0, (0*32)(oup)
- VMOVDQU CC0, (1*32)(oup)
-
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
- VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
- VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
-
- MOVQ $320, itr1
- SUBQ $320, inl
- LEAQ 320(inp), inp
-
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
- CMPQ inl, $128
+ VPXOR (SI), Y0, Y0
+ VPXOR 32(SI), Y12, Y12
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y12, 32(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 64(SI), Y0, Y0
+ VPXOR 96(SI), Y14, Y14
+ VPXOR 128(SI), Y12, Y12
+ VPXOR 160(SI), Y4, Y4
+ VMOVDQU Y0, 64(DI)
+ VMOVDQU Y14, 96(DI)
+ VMOVDQU Y12, 128(DI)
+ VMOVDQU Y4, 160(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 192(SI), Y0, Y0
+ VPXOR 224(SI), Y14, Y14
+ VPXOR 256(SI), Y12, Y12
+ VPXOR 288(SI), Y4, Y4
+ VMOVDQU Y0, 192(DI)
+ VMOVDQU Y14, 224(DI)
+ VMOVDQU Y12, 256(DI)
+ VMOVDQU Y4, 288(DI)
+ MOVQ $0x00000140, CX
+ SUBQ $0x00000140, BX
+ LEAQ 320(SI), SI
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, Y15, Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, Y15, Y3, Y4
+ CMPQ BX, $0x80
JBE sealAVX2SealHash
-
- VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
- VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
- SUBQ $128, inl
- LEAQ 128(inp), inp
-
- MOVQ $8, itr1
- MOVQ $2, itr2
-
- CMPQ inl, $128
- JBE sealAVX2Tail128
- CMPQ inl, $256
- JBE sealAVX2Tail256
- CMPQ inl, $384
- JBE sealAVX2Tail384
- CMPQ inl, $512
- JBE sealAVX2Tail512
+ VPXOR (SI), Y0, Y0
+ VPXOR 32(SI), Y14, Y14
+ VPXOR 64(SI), Y12, Y12
+ VPXOR 96(SI), Y4, Y4
+ VMOVDQU Y0, 320(DI)
+ VMOVDQU Y14, 352(DI)
+ VMOVDQU Y12, 384(DI)
+ VMOVDQU Y4, 416(DI)
+ SUBQ $0x80, BX
+ LEAQ 128(SI), SI
+ MOVQ $0x00000008, CX
+ MOVQ $0x00000002, R9
+ CMPQ BX, $0x80
+ JBE sealAVX2Tail128
+ CMPQ BX, $0x00000100
+ JBE sealAVX2Tail256
+ CMPQ BX, $0x00000180
+ JBE sealAVX2Tail384
+ CMPQ BX, $0x00000200
+ JBE sealAVX2Tail512
// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
- VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
- VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
- VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
- VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
- VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
- VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
-
- SUBQ $16, oup // Adjust the pointer
- MOVQ $9, itr1
- JMP sealAVX2InternalLoopStart
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ SUBQ $0x10, DI
+ MOVQ $0x00000009, CX
+ JMP sealAVX2InternalLoopStart
sealAVX2MainLoop:
- // Load state, increment counter blocks, store the incremented counters
- VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
- MOVQ $10, itr1
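+ // Load state, increment counter blocks, store the incremented counters; the 10 double
+ // rounds below hash 480 bytes of the previous ciphertext as they go.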
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ MOVQ $0x0000000a, CX
sealAVX2InternalLoop:
- polyAdd(0*8(oup))
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage1_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulStage2_AVX2
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyMulStage3_AVX2
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
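+ // Each pass interleaves one ChaCha double round over all four states with three Poly1305
+ // block updates (48 bytes of ciphertext at 0/16/32(DI)), using the MULX-based multiply.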
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
sealAVX2InternalLoopStart:
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- polyAdd(2*8(oup))
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage1_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage2_AVX2
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage3_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulReduceStage
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(4*8(oup))
- LEAQ (6*8)(oup), oup
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage1_AVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- polyMulStage2_AVX2
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage3_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
- DECQ itr1
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 32(DI), R10
+ ADCQ 40(DI), R11
+ ADCQ $0x01, R12
+ LEAQ 48(DI), DI
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ DECQ CX
JNE sealAVX2InternalLoop
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
- polyAdd(0*8(oup))
- polyMulAVX2
- LEAQ (4*8)(oup), oup
- VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
- VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
- VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPERM2I128 $0x13, Y0, Y14, Y14
+ VPERM2I128 $0x02, Y12, Y4, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPXOR (SI), Y15, Y15
+ VPXOR 32(SI), Y0, Y0
+ VPXOR 64(SI), Y14, Y14
+ VPXOR 96(SI), Y12, Y12
+ VMOVDQU Y15, (DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y14, 64(DI)
+ VMOVDQU Y12, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
	// and the second 16 bytes here
- polyAdd(-2*8(oup))
- polyMulAVX2
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
- VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
- VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
- LEAQ (32*16)(inp), inp
- SUBQ $(32*16), inl
- CMPQ inl, $512
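+	// Second inlined polyAdd/polyMulAVX2 for the other 16 bytes, then XOR and store the remaining 256 bytes of this 512-byte chunk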
+ ADDQ -16(DI), R10
+ ADCQ -8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ VPXOR 384(SI), Y0, Y0
+ VPXOR 416(SI), Y14, Y14
+ VPXOR 448(SI), Y12, Y12
+ VPXOR 480(SI), Y4, Y4
+ VMOVDQU Y0, 384(DI)
+ VMOVDQU Y14, 416(DI)
+ VMOVDQU Y12, 448(DI)
+ VMOVDQU Y4, 480(DI)
+ LEAQ 512(SI), SI
+ SUBQ $0x00000200, BX
+ CMPQ BX, $0x00000200
JG sealAVX2MainLoop
	// Tail can only hash 480 bytes - hash the remaining 32 bytes of the last chunk here
- polyAdd(0*8(oup))
- polyMulAVX2
- polyAdd(2*8(oup))
- polyMulAVX2
- LEAQ 32(oup), oup
-
- MOVQ $10, itr1
- MOVQ $0, itr2
- CMPQ inl, $128
- JBE sealAVX2Tail128
- CMPQ inl, $256
- JBE sealAVX2Tail256
- CMPQ inl, $384
- JBE sealAVX2Tail384
- JMP sealAVX2Tail512
-
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
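+	// Two inlined polyAdd/polyMulAVX2 blocks, then set the round counters and pick the tail routine for the remaining length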
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ MOVQ $0x0000000a, CX
+ MOVQ $0x00000000, R9
+ CMPQ BX, $0x80
+ JBE sealAVX2Tail128
+ CMPQ BX, $0x00000100
+ JBE sealAVX2Tail256
+ CMPQ BX, $0x00000180
+ JBE sealAVX2Tail384
+ JMP sealAVX2Tail512
+
seal192AVX2:
- // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
- VMOVDQA AA0, AA1
- VMOVDQA BB0, BB1
- VMOVDQA CC0, CC1
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2
- VMOVDQA BB0, BB2
- VMOVDQA CC0, CC2
- VMOVDQA DD0, DD2
- VMOVDQA DD1, TT3
- MOVQ $10, itr2
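+	// Special path for buffers of at most 192 bytes: four ChaCha20 blocks give up to 192 bytes of stream plus 64 bytes for the poly key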
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VMOVDQA Y4, Y2
+ VMOVDQA Y1, Y15
+ MOVQ $0x0000000a, R9
sealAVX2192InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- DECQ itr2
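+	// Inlined chachaQR_AVX2 for both block pairs: column round, rotate lanes, diagonal round, rotate back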
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ DECQ R9
JNE sealAVX2192InnerCipherLoop
- VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
- VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
- VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
- VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, TT0
+ VPADDD Y6, Y0, Y0
+ VPADDD Y6, Y5, Y5
+ VPADDD Y10, Y14, Y14
+ VPADDD Y10, Y9, Y9
+ VPADDD Y8, Y12, Y12
+ VPADDD Y8, Y13, Y13
+ VPADDD Y2, Y4, Y4
+ VPADDD Y15, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 192 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
sealAVX2ShortSeal:
// Hash aad
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
- XORQ itr1, itr1
+ XORQ CX, CX
sealAVX2SealHash:
	// itr1 (CX) holds the number of bytes encrypted but not yet hashed
- CMPQ itr1, $16
- JB sealAVX2ShortSealLoop
- polyAdd(0(oup))
- polyMul
- SUBQ $16, itr1
- ADDQ $16, oup
- JMP sealAVX2SealHash
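+	// Inlined polyAdd/polyMul (MULQ form) over the next 16 unhashed bytes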
+ CMPQ CX, $0x10
+ JB sealAVX2ShortSealLoop
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ SUBQ $0x10, CX
+ ADDQ $0x10, DI
+ JMP sealAVX2SealHash
sealAVX2ShortSealLoop:
- CMPQ inl, $32
+ CMPQ BX, $0x20
JB sealAVX2ShortTail32
- SUBQ $32, inl
+ SUBQ $0x20, BX
// Load for encryption
- VPXOR (inp), AA0, AA0
- VMOVDQU AA0, (oup)
- LEAQ (1*32)(inp), inp
+ VPXOR (SI), Y0, Y0
+ VMOVDQU Y0, (DI)
+ LEAQ 32(SI), SI
// Now can hash
- polyAdd(0*8(oup))
- polyMulAVX2
- polyAdd(2*8(oup))
- polyMulAVX2
- LEAQ (1*32)(oup), oup
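+	// Two inlined polyAdd/polyMulAVX2 blocks hash the 32 bytes of ciphertext just written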
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
// Shift stream left
- VMOVDQA BB0, AA0
- VMOVDQA CC0, BB0
- VMOVDQA DD0, CC0
- VMOVDQA AA1, DD0
- VMOVDQA BB1, AA1
- VMOVDQA CC1, BB1
- VMOVDQA DD1, CC1
- VMOVDQA AA2, DD1
- VMOVDQA BB2, AA2
+ VMOVDQA Y14, Y0
+ VMOVDQA Y12, Y14
+ VMOVDQA Y4, Y12
+ VMOVDQA Y5, Y4
+ VMOVDQA Y9, Y5
+ VMOVDQA Y13, Y9
+ VMOVDQA Y1, Y13
+ VMOVDQA Y6, Y1
+ VMOVDQA Y10, Y6
JMP sealAVX2ShortSealLoop
sealAVX2ShortTail32:
- CMPQ inl, $16
- VMOVDQA A0, A1
+ CMPQ BX, $0x10
+ VMOVDQA X0, X1
JB sealAVX2ShortDone
-
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for encryption
- VPXOR (inp), A0, T0
- VMOVDQU T0, (oup)
- LEAQ (1*16)(inp), inp
+ VPXOR (SI), X0, X12
+ VMOVDQU X12, (DI)
+ LEAQ 16(SI), SI
// Hash
- polyAdd(0*8(oup))
- polyMulAVX2
- LEAQ (1*16)(oup), oup
- VPERM2I128 $0x11, AA0, AA0, AA0
- VMOVDQA A0, A1
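+	// Inlined polyAdd/polyMulAVX2 for the 16 bytes just written, then move the upper 128-bit lane of the stream down for a possible final partial block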
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ VPERM2I128 $0x11, Y0, Y0, Y0
+ VMOVDQA X0, X1
sealAVX2ShortDone:
VZEROUPPER
JMP sealSSETail
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
seal320AVX2:
- // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
- VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
- MOVQ $10, itr2
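+	// Special path for buffers of at most 320 bytes: six ChaCha20 blocks give up to 320 bytes of stream plus 64 bytes for the poly key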
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y14, Y7
+ VMOVDQA Y12, Y11
+ VMOVDQA Y4, Y15
+ MOVQ $0x0000000a, R9
sealAVX2320InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
- DECQ itr2
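+	// Inlined chachaQR_AVX2 for all three block pairs: column round, rotate lanes, diagonal round, rotate back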
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ DECQ R9
JNE sealAVX2320InnerCipherLoop
-
- VMOVDQA ·chacha20Constants<>(SB), TT0
- VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
- VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
- VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
- VMOVDQA ·avx2IncMask<>(SB), TT0
- VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD2, DD2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y3
+ VPADDD Y3, Y0, Y0
+ VPADDD Y3, Y5, Y5
+ VPADDD Y3, Y6, Y6
+ VPADDD Y7, Y14, Y14
+ VPADDD Y7, Y9, Y9
+ VPADDD Y7, Y10, Y10
+ VPADDD Y11, Y12, Y12
+ VPADDD Y11, Y13, Y13
+ VPADDD Y11, Y8, Y8
+ VMOVDQA ·avx2IncMask<>+0(SB), Y3
+ VPADDD Y15, Y4, Y4
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y1, Y1
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y2, Y2
// Clamp and store poly key
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
- VPERM2I128 $0x02, AA2, BB2, CC1
- VPERM2I128 $0x02, CC2, DD2, DD1
- VPERM2I128 $0x13, AA2, BB2, AA2
- VPERM2I128 $0x13, CC2, DD2, BB2
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
+ VPERM2I128 $0x02, Y6, Y10, Y13
+ VPERM2I128 $0x02, Y8, Y2, Y1
+ VPERM2I128 $0x13, Y6, Y10, Y6
+ VPERM2I128 $0x13, Y8, Y2, Y10
JMP sealAVX2ShortSeal
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
sealAVX2Tail128:
- // Need to decrypt up to 128 bytes - prepare two blocks
- // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
- // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0
- VMOVDQA state1StoreAVX2, BB0
- VMOVDQA state2StoreAVX2, CC0
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VMOVDQA DD0, DD1
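+	// Seal the last 128 bytes or fewer: prepare one AVX2 block pair; 512 (after the main loop) or 448 (before it) encrypted bytes still wait to be hashed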
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA 32(BP), Y14
+ VMOVDQA 64(BP), Y12
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VMOVDQA Y4, Y1
sealAVX2Tail128LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail128LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- polyAdd(0(oup))
- polyMul
- VPALIGNR $4, BB0, BB0, BB0
- VPALIGNR $8, CC0, CC0, CC0
- VPALIGNR $12, DD0, DD0, DD0
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- polyAdd(16(oup))
- polyMul
- LEAQ 32(oup), oup
- VPALIGNR $12, BB0, BB0, BB0
- VPALIGNR $8, CC0, CC0, CC0
- VPALIGNR $4, DD0, DD0, DD0
- DECQ itr1
- JG sealAVX2Tail128LoopA
- DECQ itr2
- JGE sealAVX2Tail128LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA1
- VPADDD state1StoreAVX2, BB0, BB1
- VPADDD state2StoreAVX2, CC0, CC1
- VPADDD DD1, DD0, DD1
-
- VPERM2I128 $0x02, AA1, BB1, AA0
- VPERM2I128 $0x02, CC1, DD1, BB0
- VPERM2I128 $0x13, AA1, BB1, CC0
- VPERM2I128 $0x13, CC1, DD1, DD0
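+	// Inlined chachaQR_AVX2 rounds interleaved with two inlined polyAdd/polyMul blocks of pending ciphertext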
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ DECQ CX
+ JG sealAVX2Tail128LoopA
+ DECQ R9
+ JGE sealAVX2Tail128LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y5
+ VPADDD 32(BP), Y14, Y9
+ VPADDD 64(BP), Y12, Y13
+ VPADDD Y1, Y4, Y1
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
JMP sealAVX2ShortSealLoop
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
sealAVX2Tail256:
- // Need to decrypt up to 256 bytes - prepare two blocks
- // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
- // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA DD0, TT1
- VMOVDQA DD1, TT2
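+	// Seal the last 256 bytes or fewer: prepare two AVX2 block pairs, hashing pending ciphertext from the previous chunk in the loop below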
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA ·chacha20Constants<>+0(SB), Y5
+ VMOVDQA 32(BP), Y14
+ VMOVDQA 32(BP), Y9
+ VMOVDQA 64(BP), Y12
+ VMOVDQA 64(BP), Y13
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y4, Y7
+ VMOVDQA Y1, Y11
sealAVX2Tail256LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail256LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- polyAdd(0(oup))
- polyMul
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- polyAdd(16(oup))
- polyMul
- LEAQ 32(oup), oup
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- DECQ itr1
- JG sealAVX2Tail256LoopA
- DECQ itr2
- JGE sealAVX2Tail256LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
- VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPERM2I128 $0x02, CC0, DD0, TT1
- VPERM2I128 $0x13, AA0, BB0, TT2
- VPERM2I128 $0x13, CC0, DD0, TT3
- VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
- VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
- MOVQ $128, itr1
- LEAQ 128(inp), inp
- SUBQ $128, inl
- VPERM2I128 $0x02, AA1, BB1, AA0
- VPERM2I128 $0x02, CC1, DD1, BB0
- VPERM2I128 $0x13, AA1, BB1, CC0
- VPERM2I128 $0x13, CC1, DD1, DD0
-
- JMP sealAVX2SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
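+	// Inlined chachaQR_AVX2 for both block pairs, interleaved with two inlined polyAdd/polyMul blocks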
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ DECQ CX
+ JG sealAVX2Tail256LoopA
+ DECQ R9
+ JGE sealAVX2Tail256LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD Y7, Y4, Y4
+ VPADDD Y11, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPERM2I128 $0x02, Y12, Y4, Y7
+ VPERM2I128 $0x13, Y0, Y14, Y11
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR (SI), Y3, Y3
+ VPXOR 32(SI), Y7, Y7
+ VPXOR 64(SI), Y11, Y11
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y7, 32(DI)
+ VMOVDQU Y11, 64(DI)
+ VMOVDQU Y15, 96(DI)
+ MOVQ $0x00000080, CX
+ LEAQ 128(SI), SI
+ SUBQ $0x80, BX
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ JMP sealAVX2SealHash
+
sealAVX2Tail384:
- // Need to decrypt up to 384 bytes - prepare two blocks
- // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
- // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
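+	// Seal the last 384 bytes or fewer: prepare three AVX2 block pairs, hashing pending ciphertext from the previous chunk in the loop below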
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y4, Y7
+ VMOVDQA Y1, Y11
+ VMOVDQA Y2, Y15
sealAVX2Tail384LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail384LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- polyAdd(0(oup))
- polyMul
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- polyAdd(16(oup))
- polyMul
- LEAQ 32(oup), oup
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
- DECQ itr1
- JG sealAVX2Tail384LoopA
- DECQ itr2
- JGE sealAVX2Tail384LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
- VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPERM2I128 $0x02, CC0, DD0, TT1
- VPERM2I128 $0x13, AA0, BB0, TT2
- VPERM2I128 $0x13, CC0, DD0, TT3
- VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
- VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, TT0
- VPERM2I128 $0x02, CC1, DD1, TT1
- VPERM2I128 $0x13, AA1, BB1, TT2
- VPERM2I128 $0x13, CC1, DD1, TT3
- VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
- VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
- MOVQ $256, itr1
- LEAQ 256(inp), inp
- SUBQ $256, inl
- VPERM2I128 $0x02, AA2, BB2, AA0
- VPERM2I128 $0x02, CC2, DD2, BB0
- VPERM2I128 $0x13, AA2, BB2, CC0
- VPERM2I128 $0x13, CC2, DD2, DD0
-
- JMP sealAVX2SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
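+	// Inlined chachaQR_AVX2 for all three block pairs, interleaved with two inlined polyAdd/polyMul blocks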
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ DECQ CX
+ JG sealAVX2Tail384LoopA
+ DECQ R9
+ JGE sealAVX2Tail384LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD Y7, Y4, Y4
+ VPADDD Y11, Y1, Y1
+ VPADDD Y15, Y2, Y2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPERM2I128 $0x02, Y12, Y4, Y7
+ VPERM2I128 $0x13, Y0, Y14, Y11
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR (SI), Y3, Y3
+ VPXOR 32(SI), Y7, Y7
+ VPXOR 64(SI), Y11, Y11
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y7, 32(DI)
+ VMOVDQU Y11, 64(DI)
+ VMOVDQU Y15, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y3
+ VPERM2I128 $0x02, Y13, Y1, Y7
+ VPERM2I128 $0x13, Y5, Y9, Y11
+ VPERM2I128 $0x13, Y13, Y1, Y15
+ VPXOR 128(SI), Y3, Y3
+ VPXOR 160(SI), Y7, Y7
+ VPXOR 192(SI), Y11, Y11
+ VPXOR 224(SI), Y15, Y15
+ VMOVDQU Y3, 128(DI)
+ VMOVDQU Y7, 160(DI)
+ VMOVDQU Y11, 192(DI)
+ VMOVDQU Y15, 224(DI)
+ MOVQ $0x00000100, CX
+ LEAQ 256(SI), SI
+ SUBQ $0x00000100, BX
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ JMP sealAVX2SealHash
+
sealAVX2Tail512:
- // Need to decrypt up to 512 bytes - prepare two blocks
- // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
- // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
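+	// Seal the last 512 bytes or fewer: prepare four AVX2 block pairs and save the four counters on the stack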
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
sealAVX2Tail512LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail512LoopB:
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyAdd(0*8(oup))
- polyMulAVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(2*8(oup))
- polyMulAVX2
- LEAQ (4*8)(oup), oup
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-
- DECQ itr1
- JG sealAVX2Tail512LoopA
- DECQ itr2
- JGE sealAVX2Tail512LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
- VPERM2I128 $0x02, AA0, BB0, CC3
- VPXOR (0*32)(inp), CC3, CC3
- VMOVDQU CC3, (0*32)(oup)
- VPERM2I128 $0x02, CC0, DD0, CC3
- VPXOR (1*32)(inp), CC3, CC3
- VMOVDQU CC3, (1*32)(oup)
- VPERM2I128 $0x13, AA0, BB0, CC3
- VPXOR (2*32)(inp), CC3, CC3
- VMOVDQU CC3, (2*32)(oup)
- VPERM2I128 $0x13, CC0, DD0, CC3
- VPXOR (3*32)(inp), CC3, CC3
- VMOVDQU CC3, (3*32)(oup)
-
- VPERM2I128 $0x02, AA1, BB1, AA0
- VPERM2I128 $0x02, CC1, DD1, BB0
- VPERM2I128 $0x13, AA1, BB1, CC0
- VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-
- VPERM2I128 $0x02, AA2, BB2, AA0
- VPERM2I128 $0x02, CC2, DD2, BB0
- VPERM2I128 $0x13, AA2, BB2, CC0
- VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-
- MOVQ $384, itr1
- LEAQ 384(inp), inp
- SUBQ $384, inl
- VPERM2I128 $0x02, AA3, BB3, AA0
- VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
- VPERM2I128 $0x13, AA3, BB3, CC0
- VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-
- JMP sealAVX2SealHash
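+	// ChaCha20 double quarter-round over all four block pairs, interleaved with two inlined polyAdd/polyMulAVX2 blocks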
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ DECQ CX
+ JG sealAVX2Tail512LoopA
+ DECQ R9
+ JGE sealAVX2Tail512LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPXOR (SI), Y15, Y15
+ VMOVDQU Y15, (DI)
+ VPERM2I128 $0x02, Y12, Y4, Y15
+ VPXOR 32(SI), Y15, Y15
+ VMOVDQU Y15, 32(DI)
+ VPERM2I128 $0x13, Y0, Y14, Y15
+ VPXOR 64(SI), Y15, Y15
+ VMOVDQU Y15, 64(DI)
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y15, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ MOVQ $0x00000180, CX
+ LEAQ 384(SI), SI
+ SUBQ $0x00000180, BX
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ JMP sealAVX2SealHash