--- /dev/null
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
+// The implementation uses some optimizations as described in:
+// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
+// Instruction and its Usage for Computing the GCM Mode rev. 2.02
+// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
+// Hardware
+
+package main
+
+import (
+ . "github.com/mmcloughlin/avo/build"
+ "github.com/mmcloughlin/avo/ir"
+ . "github.com/mmcloughlin/avo/operand"
+ . "github.com/mmcloughlin/avo/reg"
+)
+
+//go:generate go run . -out ../../gcm_amd64.s -pkg aes
+
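+// Register aliases below mirror the #define names used by the original
+// hand-written assembly: B0-B7 hold AES/CTR blocks, ACC0/ACC1/ACCM are the
+// GHASH (Karatsuba) accumulators, and T0-T2 are scratch registers.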
+var (
+ B0 VecPhysical = X0
+ B1 = X1
+ B2 = X2
+ B3 = X3
+ B4 = X4
+ B5 = X5
+ B6 = X6
+ B7 = X7
+
+ ACC0 VecPhysical = X8
+ ACC1 = X9
+ ACCM = X10
+
+ T0 VecPhysical = X11
+ T1 = X12
+ T2 = X13
+ POLY = X14
+ BSWAP = X15
+)
+
+func main() {
+ Package("crypto/aes")
+ ConstraintExpr("!purego")
+
+ gcmAesFinish()
+ gcmAesInit()
+ gcmAesData()
+ gcmAesEnc()
+ gcmAesDec()
+
+ Generate()
+}
+
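+// gcmAesFinish folds the plaintext and additional-data bit lengths into the
+// GHASH state, byte-swaps it, and XORs in the tag mask to produce the final
+// tag at *T.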
+func gcmAesFinish() {
+ Implement("gcmAesFinish")
+ Attributes(NOSPLIT)
+ AllocLocal(0)
+
+ var (
+ pTbl GPPhysical = RDI
+ tMsk = RSI
+ tPtr = RDX
+ plen = RAX
+ dlen = RCX
+ )
+
+ Load(Param("productTable"), pTbl)
+ Load(Param("tagMask"), tMsk)
+ Load(Param("T"), tPtr)
+ Load(Param("pLen"), plen)
+ Load(Param("dLen"), dlen)
+
+ MOVOU(Mem{Base: tPtr}, ACC0)
+ MOVOU(Mem{Base: tMsk}, T2)
+
+ bswapMask := bswapMask_DATA()
+ gcmPoly := gcmPoly_DATA()
+ MOVOU(bswapMask, BSWAP)
+ MOVOU(gcmPoly, POLY)
+
+ SHLQ(Imm(3), plen)
+ SHLQ(Imm(3), dlen)
+
+ MOVQ(plen, B0)
+ PINSRQ(Imm(1), dlen, B0)
+
+ PXOR(ACC0, B0)
+
+ MOVOU(Mem{Base: pTbl}.Offset(16*14), ACC0)
+ MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
+ MOVOU(ACC0, ACC1)
+
+ PCLMULQDQ(Imm(0x00), B0, ACC0)
+ PCLMULQDQ(Imm(0x11), B0, ACC1)
+ PSHUFD(Imm(78), B0, T0)
+ PXOR(B0, T0)
+ PCLMULQDQ(Imm(0x00), T0, ACCM)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+
+ MOVOU(POLY, T0)
+ PCLMULQDQ(Imm(0x01), ACC0, T0)
+ PSHUFD(Imm(78), ACC0, ACC0)
+ PXOR(T0, ACC0)
+
+ MOVOU(POLY, T0)
+ PCLMULQDQ(Imm(0x01), ACC0, T0)
+ PSHUFD(Imm(78), ACC0, ACC0)
+ PXOR(T0, ACC0)
+
+ PXOR(ACC1, ACC0)
+
+ PSHUFB(BSWAP, ACC0)
+ PXOR(T2, ACC0)
+ MOVOU(ACC0, Mem{Base: tPtr})
+
+ RET()
+}
+
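+// gcmAesInit derives the hash key H by encrypting the all-zero block, doubles
+// it in GF(2^128), and fills productTable with H^1..H^8 plus their Karatsuba
+// pre-computations (highest power at the lowest offset).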
+func gcmAesInit() {
+ Implement("gcmAesInit")
+ Attributes(NOSPLIT)
+ AllocLocal(0)
+
+ var (
+ dst GPPhysical = RDI
+ KS = RSI
+ NR = RDX
+ )
+
+ Load(Param("productTable"), dst)
+ Load(Param("ks").Base(), KS)
+ Load(Param("ks").Len(), NR)
+
+ SHRQ(Imm(2), NR)
+ DECQ(NR)
+
+ bswapMask := bswapMask_DATA()
+ gcmPoly := gcmPoly_DATA()
+ MOVOU(bswapMask, BSWAP)
+ MOVOU(gcmPoly, POLY)
+
+ Comment("Encrypt block 0, with the AES key to generate the hash key H")
+ MOVOU(Mem{Base: KS}.Offset(16*0), B0)
+ MOVOU(Mem{Base: KS}.Offset(16*1), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*2), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*3), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*4), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*5), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*6), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*7), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*8), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*9), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*10), T0)
+ CMPQ(NR, Imm(12))
+ JB(LabelRef("initEncLast"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*11), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*12), T0)
+ JE(LabelRef("initEncLast"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*13), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: KS}.Offset(16*14), T0)
+
+ initEncLast(dst)
+ initLoop(dst)
+
+ RET()
+}
+
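+// initEncLast applies the final AES round to finish computing H, then
+// performs the "H * 2" doubling and stores H and its Karatsuba
+// pre-computation in the last two table slots.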
+func initEncLast(dst GPPhysical) {
+ Label("initEncLast")
+ AESENCLAST(T0, B0)
+
+ PSHUFB(BSWAP, B0)
+ Comment("H * 2")
+ PSHUFD(Imm(0xff), B0, T0)
+ MOVOU(B0, T1)
+ PSRAL(Imm(31), T0)
+ PAND(POLY, T0)
+ PSRLL(Imm(31), T1)
+ PSLLDQ(Imm(4), T1)
+ PSLLL(Imm(1), B0)
+ PXOR(T0, B0)
+ PXOR(T1, B0)
+ Comment("Karatsuba pre-computations")
+ MOVOU(B0, Mem{Base: dst}.Offset(16*14))
+ PSHUFD(Imm(78), B0, B1)
+ PXOR(B0, B1)
+ MOVOU(B1, Mem{Base: dst}.Offset(16*15))
+
+ MOVOU(B0, B2)
+ MOVOU(B1, B3)
+ Comment("Now prepare powers of H and pre-computations for them")
+ MOVQ(U32(7), RAX)
+}
+
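+// initLoop multiplies the running power in B2/B3 by H (B0/B1) seven times,
+// storing each new power and its Karatsuba pre-computation two slots further
+// down the table.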
+func initLoop(dst GPPhysical) {
+ Label("initLoop")
+ MOVOU(B2, T0)
+ MOVOU(B2, T1)
+ MOVOU(B3, T2)
+ PCLMULQDQ(Imm(0x00), B0, T0)
+ PCLMULQDQ(Imm(0x11), B0, T1)
+ PCLMULQDQ(Imm(0x00), B1, T2)
+
+ PXOR(T0, T2)
+ PXOR(T1, T2)
+ MOVOU(T2, B4)
+ PSLLDQ(Imm(8), B4)
+ PSRLDQ(Imm(8), T2)
+ PXOR(B4, T0)
+ PXOR(T2, T1)
+
+ MOVOU(POLY, B2)
+ PCLMULQDQ(Imm(0x01), T0, B2)
+ PSHUFD(Imm(78), T0, T0)
+ PXOR(B2, T0)
+ MOVOU(POLY, B2)
+ PCLMULQDQ(Imm(0x01), T0, B2)
+ PSHUFD(Imm(78), T0, T0)
+ PXOR(T0, B2)
+ PXOR(T1, B2)
+
+ MOVOU(B2, Mem{Base: dst}.Offset(16*12))
+ PSHUFD(Imm(78), B2, B3)
+ PXOR(B2, B3)
+ MOVOU(B3, Mem{Base: dst}.Offset(16*13))
+
+ DECQ(RAX)
+ LEAQ(Mem{Base: dst}.Offset(-16*2), dst)
+ JNE(LabelRef("initLoop"))
+}
+
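+// gcmAesData absorbs the additional authenticated data into a fresh GHASH
+// accumulator, eight blocks at a time where possible, and writes the result
+// to *T.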
+func gcmAesData() {
+ Implement("gcmAesData")
+ Attributes(NOSPLIT)
+ AllocLocal(0)
+
+ var (
+ pTbl GPPhysical = RDI
+ aut = RSI
+ tPtr = RCX
+ autLen = RDX
+ )
+
+ Load(Param("productTable"), pTbl)
+ Load(Param("data").Base(), aut)
+ Load(Param("data").Len(), autLen)
+ Load(Param("T"), tPtr)
+
+ bswapMask := bswapMask_DATA()
+ gcmPoly := gcmPoly_DATA()
+ PXOR(ACC0, ACC0)
+ MOVOU(bswapMask, BSWAP)
+ MOVOU(gcmPoly, POLY)
+
+ TESTQ(autLen, autLen)
+ JEQ(LabelRef("dataBail"))
+
+ CMPQ(autLen, Imm(13)) // optimize the TLS case
+ JE(LabelRef("dataTLS"))
+ CMPQ(autLen, Imm(128))
+ JB(LabelRef("startSinglesLoop"))
+ JMP(LabelRef("dataOctaLoop"))
+
+ dataTLS(pTbl, aut, autLen)
+ dataOctaLoop(pTbl, aut, autLen)
+ startSinglesLoop(pTbl)
+ dataSinglesLoop(aut, autLen)
+ dataMul(aut)
+ dataEnd(aut, autLen)
+ dataLoadLoop(aut, autLen)
+ dataBail(tPtr)
+}
+
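+// reduceRound emits one of the two folding steps of the fast reduction modulo
+// the GHASH polynomial (POLY holds x^128 + x^7 + x^2 + x + 1 in bit-reflected
+// form), as described in [1].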
+func reduceRound(a VecPhysical) {
+ MOVOU(POLY, T0)
+ PCLMULQDQ(Imm(0x01), a, T0)
+ PSHUFD(Imm(78), a, a)
+ PXOR(T0, a)
+}
+
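+// mulRoundAAD multiplies AAD block X by the i-th stored power of H and
+// accumulates the three Karatsuba partial products into ACC0, ACC1 and ACCM.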
+func mulRoundAAD(X VecPhysical, i int, pTbl GPPhysical) {
+ MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1)
+ MOVOU(T1, T2)
+ PCLMULQDQ(Imm(0x00), X, T1)
+ PXOR(T1, ACC0)
+ PCLMULQDQ(Imm(0x11), X, T2)
+ PXOR(T2, ACC1)
+ PSHUFD(Imm(78), X, T1)
+ PXOR(T1, X)
+ MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T1)
+ PCLMULQDQ(Imm(0x00), X, T1)
+ PXOR(T1, ACCM)
+}
+
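+// dataTLS loads the common 13-byte TLS additional-data block with three
+// precise loads instead of falling into the byte-by-byte tail loop.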
+func dataTLS(pTbl, aut, autLen GPPhysical) {
+ Label("dataTLS")
+ MOVOU(Mem{Base: pTbl}.Offset(16*14), T1)
+ MOVOU(Mem{Base: pTbl}.Offset(16*15), T2)
+ PXOR(B0, B0)
+ MOVQ(Mem{Base: aut}, B0)
+ PINSRD(Imm(2), Mem{Base: aut}.Offset(8), B0)
+ PINSRB(Imm(12), Mem{Base: aut}.Offset(12), B0)
+ XORQ(autLen, autLen)
+ JMP(LabelRef("dataMul"))
+}
+
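+// dataOctaLoop hashes eight AAD blocks per iteration with the aggregated
+// Karatsuba multiplication, deferring the reduction to the end of each pass.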
+func dataOctaLoop(pTbl, aut, autLen GPPhysical) {
+ Label("dataOctaLoop")
+ CMPQ(autLen, Imm(128))
+ JB(LabelRef("startSinglesLoop"))
+ SUBQ(Imm(128), autLen)
+
+ MOVOU(Mem{Base: aut}.Offset(16*0), X0)
+ MOVOU(Mem{Base: aut}.Offset(16*1), X1)
+ MOVOU(Mem{Base: aut}.Offset(16*2), X2)
+ MOVOU(Mem{Base: aut}.Offset(16*3), X3)
+ MOVOU(Mem{Base: aut}.Offset(16*4), X4)
+ MOVOU(Mem{Base: aut}.Offset(16*5), X5)
+ MOVOU(Mem{Base: aut}.Offset(16*6), X6)
+ MOVOU(Mem{Base: aut}.Offset(16*7), X7)
+ LEAQ(Mem{Base: aut}.Offset(16*8), aut)
+ PSHUFB(BSWAP, X0)
+ PSHUFB(BSWAP, X1)
+ PSHUFB(BSWAP, X2)
+ PSHUFB(BSWAP, X3)
+ PSHUFB(BSWAP, X4)
+ PSHUFB(BSWAP, X5)
+ PSHUFB(BSWAP, X6)
+ PSHUFB(BSWAP, X7)
+ PXOR(ACC0, X0)
+
+ MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0)
+ MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM)
+ MOVOU(ACC0, ACC1)
+ PSHUFD(Imm(78), X0, T1)
+ PXOR(X0, T1)
+ PCLMULQDQ(Imm(0x00), X0, ACC0)
+ PCLMULQDQ(Imm(0x11), X0, ACC1)
+ PCLMULQDQ(Imm(0x00), T1, ACCM)
+
+ mulRoundAAD(X1, 1, pTbl)
+ mulRoundAAD(X2, 2, pTbl)
+ mulRoundAAD(X3, 3, pTbl)
+ mulRoundAAD(X4, 4, pTbl)
+ mulRoundAAD(X5, 5, pTbl)
+ mulRoundAAD(X6, 6, pTbl)
+ mulRoundAAD(X7, 7, pTbl)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR(ACC1, ACC0)
+ JMP(LabelRef("dataOctaLoop"))
+}
+
+func startSinglesLoop(pTbl GPPhysical) {
+ Label("startSinglesLoop")
+ MOVOU(Mem{Base: pTbl}.Offset(16*14), T1)
+ MOVOU(Mem{Base: pTbl}.Offset(16*15), T2)
+}
+
+func dataSinglesLoop(aut, autLen GPPhysical) {
+ Label("dataSinglesLoop")
+
+ CMPQ(autLen, Imm(16))
+ JB(LabelRef("dataEnd"))
+ SUBQ(Imm(16), autLen)
+
+ MOVOU(Mem{Base: aut}, B0)
+}
+
+func dataMul(aut GPPhysical) {
+ Label("dataMul")
+ PSHUFB(BSWAP, B0)
+ PXOR(ACC0, B0)
+
+ MOVOU(T1, ACC0)
+ MOVOU(T2, ACCM)
+ MOVOU(T1, ACC1)
+
+ PSHUFD(Imm(78), B0, T0)
+ PXOR(B0, T0)
+ PCLMULQDQ(Imm(0x00), B0, ACC0)
+ PCLMULQDQ(Imm(0x11), B0, ACC1)
+ PCLMULQDQ(Imm(0x00), T0, ACCM)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+
+ MOVOU(POLY, T0)
+ PCLMULQDQ(Imm(0x01), ACC0, T0)
+ PSHUFD(Imm(78), ACC0, ACC0)
+ PXOR(T0, ACC0)
+
+ MOVOU(POLY, T0)
+ PCLMULQDQ(Imm(0x01), ACC0, T0)
+ PSHUFD(Imm(78), ACC0, ACC0)
+ PXOR(T0, ACC0)
+ PXOR(ACC1, ACC0)
+
+ LEAQ(Mem{Base: aut}.Offset(16), aut)
+
+ JMP(LabelRef("dataSinglesLoop"))
+}
+
+func dataEnd(aut, autLen GPPhysical) {
+ Label("dataEnd")
+
+ TESTQ(autLen, autLen)
+ JEQ(LabelRef("dataBail"))
+
+ PXOR(B0, B0)
+ // LEAQ -1(aut)(autLen*1), aut
+ LEAQ(Mem{Base: aut, Index: autLen, Scale: 1}.Offset(-1), aut)
+}
+
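+// dataLoadLoop shifts the trailing AAD bytes into B0 one at a time, last byte
+// first, yielding a zero-padded final block for dataMul.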
+func dataLoadLoop(aut, autLen GPPhysical) {
+ Label("dataLoadLoop")
+
+ PSLLDQ(Imm(1), B0)
+ PINSRB(Imm(0), Mem{Base: aut}, B0)
+
+ LEAQ(Mem{Base: aut}.Offset(-1), aut)
+ DECQ(autLen)
+ JNE(LabelRef("dataLoadLoop"))
+
+ JMP(LabelRef("dataMul"))
+}
+
+func dataBail(tPtr GPPhysical) {
+ Label("dataBail")
+ MOVOU(ACC0, Mem{Base: tPtr})
+ RET()
+}
+
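+// gcmAesEnc counter-mode encrypts src into dst while folding the produced
+// ciphertext into the GHASH state. Its 256-byte frame holds eight saved
+// ciphertext blocks (for hashing) followed by eight pre-computed counter
+// blocks.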
+func gcmAesEnc() {
+ Implement("gcmAesEnc")
+ Attributes(0)
+ AllocLocal(256)
+
+ var (
+ pTbl GPPhysical = RDI
+ ctx = RDX
+ ctrPtr = RCX
+ ptx = RSI
+ ks = RAX
+ tPtr = R8
+ ptxLen = R9
+ aluCTR = R10L
+ aluTMP = R11L
+ aluK = R12L
+ NR = R13
+ )
+
+ Load(Param("productTable"), pTbl)
+ Load(Param("dst").Base(), ctx)
+ Load(Param("src").Base(), ptx)
+ Load(Param("src").Len(), ptxLen)
+ Load(Param("ctr"), ctrPtr)
+ Load(Param("T"), tPtr)
+ Load(Param("ks").Base(), ks)
+ Load(Param("ks").Len(), NR)
+
+ SHRQ(Imm(2), NR)
+ DECQ(NR)
+
+ bswapMask := bswapMask_DATA()
+ gcmPoly := gcmPoly_DATA()
+ MOVOU(bswapMask, BSWAP)
+ MOVOU(gcmPoly, POLY)
+
+ MOVOU(Mem{Base: tPtr}, ACC0)
+ PXOR(ACC1, ACC1)
+ PXOR(ACCM, ACCM)
+ MOVOU(Mem{Base: ctrPtr}, B0)
+ MOVL(Mem{Base: ctrPtr}.Offset(3*4), aluCTR)
+ MOVOU(Mem{Base: ks}, T0)
+ MOVL(Mem{Base: ks}.Offset(3*4), aluK)
+ BSWAPL(aluCTR)
+ BSWAPL(aluK)
+
+ PXOR(B0, T0)
+ MOVOU(T0, Mem{Base: SP}.Offset(8*16+0*16))
+ incrementEnc(0, aluCTR, aluTMP, aluK)
+
+ CMPQ(ptxLen, Imm(128))
+ JB(LabelRef("gcmAesEncSingles"))
+ SUBQ(Imm(128), ptxLen)
+
+ Comment("We have at least 8 blocks to encrypt, prepare the rest of the counters")
+ MOVOU(T0, Mem{Base: SP}.Offset(8*16+1*16))
+ incrementEnc(1, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(8*16+2*16))
+ incrementEnc(2, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(8*16+3*16))
+ incrementEnc(3, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(8*16+4*16))
+ incrementEnc(4, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(8*16+5*16))
+ incrementEnc(5, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(8*16+6*16))
+ incrementEnc(6, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(8*16+7*16))
+ incrementEnc(7, aluCTR, aluTMP, aluK)
+
+ MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0)
+ MOVOU(Mem{Base: SP}.Offset(8*16+1*16), B1)
+ MOVOU(Mem{Base: SP}.Offset(8*16+2*16), B2)
+ MOVOU(Mem{Base: SP}.Offset(8*16+3*16), B3)
+ MOVOU(Mem{Base: SP}.Offset(8*16+4*16), B4)
+ MOVOU(Mem{Base: SP}.Offset(8*16+5*16), B5)
+ MOVOU(Mem{Base: SP}.Offset(8*16+6*16), B6)
+ MOVOU(Mem{Base: SP}.Offset(8*16+7*16), B7)
+
+ aesRound(1, ks)
+ incrementEnc(0, aluCTR, aluTMP, aluK)
+ aesRound(2, ks)
+ incrementEnc(1, aluCTR, aluTMP, aluK)
+ aesRound(3, ks)
+ incrementEnc(2, aluCTR, aluTMP, aluK)
+ aesRound(4, ks)
+ incrementEnc(3, aluCTR, aluTMP, aluK)
+ aesRound(5, ks)
+ incrementEnc(4, aluCTR, aluTMP, aluK)
+ aesRound(6, ks)
+ incrementEnc(5, aluCTR, aluTMP, aluK)
+ aesRound(7, ks)
+ incrementEnc(6, aluCTR, aluTMP, aluK)
+ aesRound(8, ks)
+ incrementEnc(7, aluCTR, aluTMP, aluK)
+ aesRound(9, ks)
+ MOVOU(Mem{Base: ks}.Offset(16*10), T0)
+ CMPQ(NR, Imm(12))
+ JB(LabelRef("encLast1"))
+ aesRnd(T0)
+ aesRound(11, ks)
+ MOVOU(Mem{Base: ks}.Offset(16*12), T0)
+ JE(LabelRef("encLast1"))
+ aesRnd(T0)
+ aesRound(13, ks)
+ MOVOU(Mem{Base: ks}.Offset(16*14), T0)
+
+ encLast1(ctx, ptx)
+ gcmAesEncOctetsLoop(pTbl, ks, ptxLen, aluCTR, aluTMP, aluK, NR)
+ encLast2(ctx, ptx)
+ gcmAesEncOctetsEnd(pTbl, ptxLen, aluCTR)
+ gcmAesEncSingles(pTbl, ks)
+ gcmAesEncSinglesLoop(ks, ptxLen, aluCTR, aluTMP, aluK, NR)
+ encLast3(pTbl, ctx, ptx)
+ gcmAesEncTail(ks, ptxLen, NR)
+ encLast4(ptx, ptxLen, aluCTR, aluTMP)
+ ptxLoadLoop(pTbl, ctx, ptx, ptxLen)
+ gcmAesEncDone(tPtr)
+}
+
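+// incrementEnc advances the counter for block i. The counter's last word is
+// kept byte-swapped in aluCTR and pre-XORed with the matching round-key word
+// in aluK, so each new key-whitened counter block needs only a 4-byte store.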
+func incrementEnc(i int, aluCTR, aluTMP, aluK GPPhysical) {
+ ADDL(Imm(1), aluCTR)
+ MOVL(aluCTR, aluTMP)
+ XORL(aluK, aluTMP)
+ BSWAPL(aluTMP)
+ MOVL(aluTMP, Mem{Base: SP}.Offset(3*4+8*16+i*16))
+}
+
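+// aesRnd applies one AES round with key k to all eight blocks in flight.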
+func aesRnd(k VecPhysical) {
+ AESENC(k, B0)
+ AESENC(k, B1)
+ AESENC(k, B2)
+ AESENC(k, B3)
+ AESENC(k, B4)
+ AESENC(k, B5)
+ AESENC(k, B6)
+ AESENC(k, B7)
+}
+
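+// aesRound loads round key i from the key schedule and applies it to all
+// eight blocks.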
+func aesRound(i int, ks GPPhysical) {
+ // MOVOU (16*i)(ks), T0
+ MOVOU(Mem{Base: ks}.Offset(16*i), T0)
+ AESENC(T0, B0)
+ AESENC(T0, B1)
+ AESENC(T0, B2)
+ AESENC(T0, B3)
+ AESENC(T0, B4)
+ AESENC(T0, B5)
+ AESENC(T0, B6)
+ AESENC(T0, B7)
+}
+
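+// aesRndLast applies the final AESENCLAST round with key k to all eight
+// blocks.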
+func aesRndLast(k VecPhysical) {
+ AESENCLAST(k, B0)
+ AESENCLAST(k, B1)
+ AESENCLAST(k, B2)
+ AESENCLAST(k, B3)
+ AESENCLAST(k, B4)
+ AESENCLAST(k, B5)
+ AESENCLAST(k, B6)
+ AESENCLAST(k, B7)
+}
+
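+// combinedRound interleaves AES round i over the eight counter blocks with
+// one GHASH multiplication of the i-th saved ciphertext block, so the AESENC
+// and PCLMULQDQ work overlaps.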
+func combinedRound(i int, pTbl, ks GPPhysical) {
+ MOVOU(Mem{Base: ks}.Offset(16*i), T0)
+ AESENC(T0, B0)
+ AESENC(T0, B1)
+ AESENC(T0, B2)
+ AESENC(T0, B3)
+ MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1)
+ MOVOU(T1, T2)
+ AESENC(T0, B4)
+ AESENC(T0, B5)
+ AESENC(T0, B6)
+ AESENC(T0, B7)
+ MOVOU(Mem{Base: SP}.Offset(16*i), T0)
+ PCLMULQDQ(Imm(0x00), T0, T1)
+ PXOR(T1, ACC0)
+ PSHUFD(Imm(78), T0, T1)
+ PCLMULQDQ(Imm(0x11), T0, T2)
+ PXOR(T1, T0)
+ PXOR(T2, ACC1)
+ MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T2)
+ PCLMULQDQ(Imm(0x00), T2, T0)
+ PXOR(T0, ACCM)
+}
+
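+// mulRound is the GHASH-only variant of combinedRound, multiplying the i-th
+// saved ciphertext block by the matching power of H.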
+func mulRound(i int, pTbl GPPhysical) {
+ MOVOU(Mem{Base: SP}.Offset(16*i), T0)
+ MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1)
+ MOVOU(T1, T2)
+ PCLMULQDQ(Imm(0x00), T0, T1)
+ PXOR(T1, ACC0)
+ PCLMULQDQ(Imm(0x11), T0, T2)
+ PXOR(T2, ACC1)
+ PSHUFD(Imm(78), T0, T1)
+ PXOR(T1, T0)
+ MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T1)
+ PCLMULQDQ(Imm(0x00), T0, T1)
+ PXOR(T1, ACCM)
+}
+
+func encLast1(ctx, ptx GPPhysical) {
+ Label("encLast1")
+ aesRndLast(T0)
+
+ MOVOU(Mem{Base: ptx}.Offset(16*0), T0)
+ PXOR(T0, B0)
+ MOVOU(Mem{Base: ptx}.Offset(16*1), T0)
+ PXOR(T0, B1)
+ MOVOU(Mem{Base: ptx}.Offset(16*2), T0)
+ PXOR(T0, B2)
+ MOVOU(Mem{Base: ptx}.Offset(16*3), T0)
+ PXOR(T0, B3)
+ MOVOU(Mem{Base: ptx}.Offset(16*4), T0)
+ PXOR(T0, B4)
+ MOVOU(Mem{Base: ptx}.Offset(16*5), T0)
+ PXOR(T0, B5)
+ MOVOU(Mem{Base: ptx}.Offset(16*6), T0)
+ PXOR(T0, B6)
+ MOVOU(Mem{Base: ptx}.Offset(16*7), T0)
+ PXOR(T0, B7)
+
+ MOVOU(B0, Mem{Base: ctx}.Offset(16*0))
+ PSHUFB(BSWAP, B0)
+ PXOR(ACC0, B0)
+ MOVOU(B1, Mem{Base: ctx}.Offset(16*1))
+ PSHUFB(BSWAP, B1)
+ MOVOU(B2, Mem{Base: ctx}.Offset(16*2))
+ PSHUFB(BSWAP, B2)
+ MOVOU(B3, Mem{Base: ctx}.Offset(16*3))
+ PSHUFB(BSWAP, B3)
+ MOVOU(B4, Mem{Base: ctx}.Offset(16*4))
+ PSHUFB(BSWAP, B4)
+ MOVOU(B5, Mem{Base: ctx}.Offset(16*5))
+ PSHUFB(BSWAP, B5)
+ MOVOU(B6, Mem{Base: ctx}.Offset(16*6))
+ PSHUFB(BSWAP, B6)
+ MOVOU(B7, Mem{Base: ctx}.Offset(16*7))
+ PSHUFB(BSWAP, B7)
+
+ MOVOU(B0, Mem{Base: SP}.Offset(16*0))
+ MOVOU(B1, Mem{Base: SP}.Offset(16*1))
+ MOVOU(B2, Mem{Base: SP}.Offset(16*2))
+ MOVOU(B3, Mem{Base: SP}.Offset(16*3))
+ MOVOU(B4, Mem{Base: SP}.Offset(16*4))
+ MOVOU(B5, Mem{Base: SP}.Offset(16*5))
+ MOVOU(B6, Mem{Base: SP}.Offset(16*6))
+ MOVOU(B7, Mem{Base: SP}.Offset(16*7))
+
+ LEAQ(Mem{Base: ptx}.Offset(128), ptx)
+ LEAQ(Mem{Base: ctx}.Offset(128), ctx)
+}
+
+func gcmAesEncOctetsLoop(pTbl, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
+ Label("gcmAesEncOctetsLoop")
+
+ CMPQ(ptxLen, Imm(128))
+ JB(LabelRef("gcmAesEncOctetsEnd"))
+ SUBQ(Imm(128), ptxLen)
+
+ MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0)
+ MOVOU(Mem{Base: SP}.Offset(8*16+1*16), B1)
+ MOVOU(Mem{Base: SP}.Offset(8*16+2*16), B2)
+ MOVOU(Mem{Base: SP}.Offset(8*16+3*16), B3)
+ MOVOU(Mem{Base: SP}.Offset(8*16+4*16), B4)
+ MOVOU(Mem{Base: SP}.Offset(8*16+5*16), B5)
+ MOVOU(Mem{Base: SP}.Offset(8*16+6*16), B6)
+ MOVOU(Mem{Base: SP}.Offset(8*16+7*16), B7)
+
+ MOVOU(Mem{Base: SP}.Offset(16*0), T0)
+ PSHUFD(Imm(78), T0, T1)
+ PXOR(T0, T1)
+
+ MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0)
+ MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM)
+ MOVOU(ACC0, ACC1)
+
+ PCLMULQDQ(Imm(0x00), T1, ACCM)
+ PCLMULQDQ(Imm(0x00), T0, ACC0)
+ PCLMULQDQ(Imm(0x11), T0, ACC1)
+
+ combinedRound(1, pTbl, ks)
+ incrementEnc(0, aluCTR, aluTMP, aluK)
+ combinedRound(2, pTbl, ks)
+ incrementEnc(1, aluCTR, aluTMP, aluK)
+ combinedRound(3, pTbl, ks)
+ incrementEnc(2, aluCTR, aluTMP, aluK)
+ combinedRound(4, pTbl, ks)
+ incrementEnc(3, aluCTR, aluTMP, aluK)
+ combinedRound(5, pTbl, ks)
+ incrementEnc(4, aluCTR, aluTMP, aluK)
+ combinedRound(6, pTbl, ks)
+ incrementEnc(5, aluCTR, aluTMP, aluK)
+ combinedRound(7, pTbl, ks)
+ incrementEnc(6, aluCTR, aluTMP, aluK)
+
+ aesRound(8, ks)
+ incrementEnc(7, aluCTR, aluTMP, aluK)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+
+ reduceRound(ACC0)
+ aesRound(9, ks)
+
+ reduceRound(ACC0)
+ PXOR(ACC1, ACC0)
+
+ MOVOU(Mem{Base: ks}.Offset(16*10), T0)
+ CMPQ(NR, Imm(12))
+ JB(LabelRef("encLast2"))
+ aesRnd(T0)
+ aesRound(11, ks)
+ MOVOU(Mem{Base: ks}.Offset(16*12), T0)
+ JE(LabelRef("encLast2"))
+ aesRnd(T0)
+ aesRound(13, ks)
+ MOVOU(Mem{Base: ks}.Offset(16*14), T0)
+}
+
+func encLast2(ctx, ptx GPPhysical) {
+ Label("encLast2")
+ aesRndLast(T0)
+
+ MOVOU(Mem{Base: ptx}.Offset(16*0), T0)
+ PXOR(T0, B0)
+ MOVOU(Mem{Base: ptx}.Offset(16*1), T0)
+ PXOR(T0, B1)
+ MOVOU(Mem{Base: ptx}.Offset(16*2), T0)
+ PXOR(T0, B2)
+ MOVOU(Mem{Base: ptx}.Offset(16*3), T0)
+ PXOR(T0, B3)
+ MOVOU(Mem{Base: ptx}.Offset(16*4), T0)
+ PXOR(T0, B4)
+ MOVOU(Mem{Base: ptx}.Offset(16*5), T0)
+ PXOR(T0, B5)
+ MOVOU(Mem{Base: ptx}.Offset(16*6), T0)
+ PXOR(T0, B6)
+ MOVOU(Mem{Base: ptx}.Offset(16*7), T0)
+ PXOR(T0, B7)
+
+ MOVOU(B0, Mem{Base: ctx}.Offset(16*0))
+ PSHUFB(BSWAP, B0)
+ PXOR(ACC0, B0)
+ MOVOU(B1, Mem{Base: ctx}.Offset(16*1))
+ PSHUFB(BSWAP, B1)
+ MOVOU(B2, Mem{Base: ctx}.Offset(16*2))
+ PSHUFB(BSWAP, B2)
+ MOVOU(B3, Mem{Base: ctx}.Offset(16*3))
+ PSHUFB(BSWAP, B3)
+ MOVOU(B4, Mem{Base: ctx}.Offset(16*4))
+ PSHUFB(BSWAP, B4)
+ MOVOU(B5, Mem{Base: ctx}.Offset(16*5))
+ PSHUFB(BSWAP, B5)
+ MOVOU(B6, Mem{Base: ctx}.Offset(16*6))
+ PSHUFB(BSWAP, B6)
+ MOVOU(B7, Mem{Base: ctx}.Offset(16*7))
+ PSHUFB(BSWAP, B7)
+
+ MOVOU(B0, Mem{Base: SP}.Offset(16*0))
+ MOVOU(B1, Mem{Base: SP}.Offset(16*1))
+ MOVOU(B2, Mem{Base: SP}.Offset(16*2))
+ MOVOU(B3, Mem{Base: SP}.Offset(16*3))
+ MOVOU(B4, Mem{Base: SP}.Offset(16*4))
+ MOVOU(B5, Mem{Base: SP}.Offset(16*5))
+ MOVOU(B6, Mem{Base: SP}.Offset(16*6))
+ MOVOU(B7, Mem{Base: SP}.Offset(16*7))
+
+ LEAQ(Mem{Base: ptx}.Offset(128), ptx)
+ LEAQ(Mem{Base: ctx}.Offset(128), ctx)
+
+ JMP(LabelRef("gcmAesEncOctetsLoop"))
+}
+
+func gcmAesEncOctetsEnd(pTbl, ptxLen, aluCTR GPPhysical) {
+ Label("gcmAesEncOctetsEnd")
+
+ MOVOU(Mem{Base: SP}.Offset(16*0), T0)
+ MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0)
+ MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM)
+ MOVOU(ACC0, ACC1)
+ PSHUFD(Imm(78), T0, T1)
+ PXOR(T0, T1)
+ PCLMULQDQ(Imm(0x00), T0, ACC0)
+ PCLMULQDQ(Imm(0x11), T0, ACC1)
+ PCLMULQDQ(Imm(0x00), T1, ACCM)
+
+ mulRound(1, pTbl)
+ mulRound(2, pTbl)
+ mulRound(3, pTbl)
+ mulRound(4, pTbl)
+ mulRound(5, pTbl)
+ mulRound(6, pTbl)
+ mulRound(7, pTbl)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR(ACC1, ACC0)
+
+ TESTQ(ptxLen, ptxLen)
+ JE(LabelRef("gcmAesEncDone"))
+
+ // Hack to get Avo to emit:
+	// 	SUBQ $7, aluCTR
+ Instruction(&ir.Instruction{Opcode: "SUBQ", Operands: []Op{Imm(7), aluCTR}})
+}
+
+func gcmAesEncSingles(pTbl, ks GPPhysical) {
+ Label("gcmAesEncSingles")
+
+ MOVOU(Mem{Base: ks}.Offset(16*1), B1)
+ MOVOU(Mem{Base: ks}.Offset(16*2), B2)
+ MOVOU(Mem{Base: ks}.Offset(16*3), B3)
+ MOVOU(Mem{Base: ks}.Offset(16*4), B4)
+ MOVOU(Mem{Base: ks}.Offset(16*5), B5)
+ MOVOU(Mem{Base: ks}.Offset(16*6), B6)
+ MOVOU(Mem{Base: ks}.Offset(16*7), B7)
+
+ MOVOU(Mem{Base: pTbl}.Offset(16*14), T2)
+}
+
+func gcmAesEncSinglesLoop(ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
+ Label("gcmAesEncSinglesLoop")
+
+ CMPQ(ptxLen, Imm(16))
+ JB(LabelRef("gcmAesEncTail"))
+ SUBQ(Imm(16), ptxLen)
+
+ MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0)
+ incrementEnc(0, aluCTR, aluTMP, aluK)
+
+ AESENC(B1, B0)
+ AESENC(B2, B0)
+ AESENC(B3, B0)
+ AESENC(B4, B0)
+ AESENC(B5, B0)
+ AESENC(B6, B0)
+ AESENC(B7, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*8), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*9), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*10), T0)
+ CMPQ(NR, Imm(12))
+ JB(LabelRef("encLast3"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*11), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*12), T0)
+ JE(LabelRef("encLast3"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*13), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*14), T0)
+}
+
+func encLast3(pTbl, ctx, ptx GPPhysical) {
+ Label("encLast3")
+ AESENCLAST(T0, B0)
+
+ MOVOU(Mem{Base: ptx}, T0)
+ PXOR(T0, B0)
+ MOVOU(B0, Mem{Base: ctx})
+
+ PSHUFB(BSWAP, B0)
+ PXOR(ACC0, B0)
+
+ MOVOU(T2, ACC0)
+ MOVOU(T2, ACC1)
+ MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
+
+ PSHUFD(Imm(78), B0, T0)
+ PXOR(B0, T0)
+ PCLMULQDQ(Imm(0x00), B0, ACC0)
+ PCLMULQDQ(Imm(0x11), B0, ACC1)
+ PCLMULQDQ(Imm(0x00), T0, ACCM)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR(ACC1, ACC0)
+
+ LEAQ(Mem{Base: ptx}.Offset(16*1), ptx)
+ LEAQ(Mem{Base: ctx}.Offset(16*1), ctx)
+
+ JMP(LabelRef("gcmAesEncSinglesLoop"))
+}
+
+func gcmAesEncTail(ks, ptxLen, NR GPPhysical) {
+ Label("gcmAesEncTail")
+ TESTQ(ptxLen, ptxLen)
+ JE(LabelRef("gcmAesEncDone"))
+
+ MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0)
+ AESENC(B1, B0)
+ AESENC(B2, B0)
+ AESENC(B3, B0)
+ AESENC(B4, B0)
+ AESENC(B5, B0)
+ AESENC(B6, B0)
+ AESENC(B7, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*8), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*9), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*10), T0)
+ CMPQ(NR, Imm(12))
+ JB(LabelRef("encLast4"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*11), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*12), T0)
+ JE(LabelRef("encLast4"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*13), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*14), T0)
+}
+
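+// encLast4 computes the final keystream block into T0 and loads the andMask
+// entry that keeps the low ptxLen bytes; B0 is cleared so ptxLoadLoop can
+// assemble the partial plaintext block.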
+func encLast4(ptx, ptxLen, aluCTR, aluTMP GPPhysical) {
+ Label("encLast4")
+ AESENCLAST(T0, B0)
+ MOVOU(B0, T0)
+
+ LEAQ(Mem{Base: ptx, Index: ptxLen, Scale: 1}.Offset(-1), ptx)
+
+ // Hack to get Avo to emit:
+ // MOVQ ptxLen, aluTMP
+ Instruction(&ir.Instruction{Opcode: "MOVQ", Operands: []Op{ptxLen, aluTMP}})
+ // Hack to get Avo to emit:
+ // SHLQ $4, aluTMP
+ Instruction(&ir.Instruction{Opcode: "SHLQ", Operands: []Op{Imm(4), aluTMP}})
+
+ andMask := andMask_DATA()
+ // Hack to get Avo to emit:
+ // LEAQ andMask<>(SB), aluCTR
+ Instruction(&ir.Instruction{Opcode: "LEAQ", Operands: []Op{andMask, aluCTR}})
+ MOVOU(Mem{Base: aluCTR, Index: aluTMP, Scale: 1}.Offset(-16), T1)
+
+ PXOR(B0, B0)
+}
+
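+// ptxLoadLoop gathers the trailing partial plaintext block byte by byte, then
+// XORs it with the keystream, masks it, stores the ciphertext, and folds it
+// into the GHASH state.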
+func ptxLoadLoop(pTbl, ctx, ptx, ptxLen GPPhysical) {
+ Label("ptxLoadLoop")
+ PSLLDQ(Imm(1), B0)
+ PINSRB(Imm(0), Mem{Base: ptx}, B0)
+ LEAQ(Mem{Base: ptx}.Offset(-1), ptx)
+ DECQ(ptxLen)
+ JNE(LabelRef("ptxLoadLoop"))
+
+ PXOR(T0, B0)
+ PAND(T1, B0)
+ MOVOU(B0, Mem{Base: ctx})
+
+ PSHUFB(BSWAP, B0)
+ PXOR(ACC0, B0)
+
+ MOVOU(T2, ACC0)
+ MOVOU(T2, ACC1)
+ MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
+
+ PSHUFD(Imm(78), B0, T0)
+ PXOR(B0, T0)
+ PCLMULQDQ(Imm(0x00), B0, ACC0)
+ PCLMULQDQ(Imm(0x11), B0, ACC1)
+ PCLMULQDQ(Imm(0x00), T0, ACCM)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR(ACC1, ACC0)
+}
+
+func gcmAesEncDone(tPtr GPPhysical) {
+ Label("gcmAesEncDone")
+ MOVOU(ACC0, Mem{Base: tPtr})
+ RET()
+}
+
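+// gcmAesDec mirrors gcmAesEnc, but hashes the ciphertext as it is read from
+// src, so its frame only needs 128 bytes for the eight counter blocks.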
+func gcmAesDec() {
+ Implement("gcmAesDec")
+ Attributes(0)
+ AllocLocal(128)
+
+ var (
+ pTbl GPPhysical = RDI
+ ctx = RDX
+ ctrPtr = RCX
+ ptx = RSI
+ ks = RAX
+ tPtr = R8
+ ptxLen = R9
+ aluCTR = R10L
+ aluTMP = R11L
+ aluK = R12L
+ NR = R13
+ )
+
+ Load(Param("productTable"), pTbl)
+ Load(Param("dst").Base(), ptx)
+ Load(Param("src").Base(), ctx)
+ Load(Param("src").Len(), ptxLen)
+ Load(Param("ctr"), ctrPtr)
+ Load(Param("T"), tPtr)
+ Load(Param("ks").Base(), ks)
+ Load(Param("ks").Len(), NR)
+
+ SHRQ(Imm(2), NR)
+ DECQ(NR)
+
+ bswapMask := bswapMask_DATA()
+ gcmPoly := gcmPoly_DATA()
+ MOVOU(bswapMask, BSWAP)
+ MOVOU(gcmPoly, POLY)
+
+ MOVOU(Mem{Base: tPtr}, ACC0)
+ PXOR(ACC1, ACC1)
+ PXOR(ACCM, ACCM)
+ MOVOU(Mem{Base: ctrPtr}, B0)
+ MOVL(Mem{Base: ctrPtr}.Offset(3*4), aluCTR)
+ MOVOU(Mem{Base: ks}, T0)
+ MOVL(Mem{Base: ks}.Offset(3*4), aluK)
+ BSWAPL(aluCTR)
+ BSWAPL(aluK)
+
+ PXOR(B0, T0)
+ MOVOU(T0, Mem{Base: SP}.Offset(0*16))
+ incrementDec(0, aluCTR, aluTMP, aluK)
+
+ CMPQ(ptxLen, Imm(128))
+ JB(LabelRef("gcmAesDecSingles"))
+
+ MOVOU(T0, Mem{Base: SP}.Offset(1*16))
+ incrementDec(1, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(2*16))
+ incrementDec(2, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(3*16))
+ incrementDec(3, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(4*16))
+ incrementDec(4, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(5*16))
+ incrementDec(5, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(6*16))
+ incrementDec(6, aluCTR, aluTMP, aluK)
+ MOVOU(T0, Mem{Base: SP}.Offset(7*16))
+ incrementDec(7, aluCTR, aluTMP, aluK)
+
+ gcmAesDecOctetsLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR)
+ decLast1(ctx, ptx)
+ gcmAesDecEndOctets(aluCTR)
+ gcmAesDecSingles(pTbl, ks)
+ gcmAesDecSinglesLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR)
+ decLast2(ctx, ptx)
+ gcmAesDecTail(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR)
+ decLast3()
+ ptxStoreLoop(ptx, ptxLen)
+ gcmAesDecDone(tPtr)
+}
+
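+// incrementDec is the decrypt-path analogue of incrementEnc; the counter
+// blocks start at the bottom of the smaller frame.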
+func incrementDec(i int, aluCTR, aluTMP, aluK GPPhysical) {
+ ADDL(Imm(1), aluCTR)
+ MOVL(aluCTR, aluTMP)
+ XORL(aluK, aluTMP)
+ BSWAPL(aluTMP)
+ MOVL(aluTMP, Mem{Base: SP}.Offset(3*4+i*16))
+}
+
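+// combinedDecRound mirrors combinedRound, hashing the ciphertext block read
+// (and byte-swapped) directly from the input rather than from the stack.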
+func combinedDecRound(i int, pTbl, ctx, ks GPPhysical) {
+ MOVOU(Mem{Base: ks}.Offset(16*i), T0)
+ AESENC(T0, B0)
+ AESENC(T0, B1)
+ AESENC(T0, B2)
+ AESENC(T0, B3)
+ MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1)
+ MOVOU(T1, T2)
+ AESENC(T0, B4)
+ AESENC(T0, B5)
+ AESENC(T0, B6)
+ AESENC(T0, B7)
+ MOVOU(Mem{Base: ctx}.Offset(16*i), T0)
+ PSHUFB(BSWAP, T0)
+ PCLMULQDQ(Imm(0x00), T0, T1)
+ PXOR(T1, ACC0)
+ PSHUFD(Imm(78), T0, T1)
+ PCLMULQDQ(Imm(0x11), T0, T2)
+ PXOR(T1, T0)
+ PXOR(T2, ACC1)
+ MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T2)
+ PCLMULQDQ(Imm(0x00), T2, T0)
+ PXOR(T0, ACCM)
+}
+
+func gcmAesDecOctetsLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
+ Label("gcmAesDecOctetsLoop")
+
+ CMPQ(ptxLen, Imm(128))
+ JB(LabelRef("gcmAesDecEndOctets"))
+ SUBQ(Imm(128), ptxLen)
+
+ MOVOU(Mem{Base: SP}.Offset(0*16), B0)
+ MOVOU(Mem{Base: SP}.Offset(1*16), B1)
+ MOVOU(Mem{Base: SP}.Offset(2*16), B2)
+ MOVOU(Mem{Base: SP}.Offset(3*16), B3)
+ MOVOU(Mem{Base: SP}.Offset(4*16), B4)
+ MOVOU(Mem{Base: SP}.Offset(5*16), B5)
+ MOVOU(Mem{Base: SP}.Offset(6*16), B6)
+ MOVOU(Mem{Base: SP}.Offset(7*16), B7)
+
+ MOVOU(Mem{Base: ctx}.Offset(16*0), T0)
+ PSHUFB(BSWAP, T0)
+ PXOR(ACC0, T0)
+ PSHUFD(Imm(78), T0, T1)
+ PXOR(T0, T1)
+
+ MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0)
+ MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM)
+ MOVOU(ACC0, ACC1)
+
+ PCLMULQDQ(Imm(0x00), T1, ACCM)
+ PCLMULQDQ(Imm(0x00), T0, ACC0)
+ PCLMULQDQ(Imm(0x11), T0, ACC1)
+
+ combinedDecRound(1, pTbl, ctx, ks)
+ incrementDec(0, aluCTR, aluTMP, aluK)
+ combinedDecRound(2, pTbl, ctx, ks)
+ incrementDec(1, aluCTR, aluTMP, aluK)
+ combinedDecRound(3, pTbl, ctx, ks)
+ incrementDec(2, aluCTR, aluTMP, aluK)
+ combinedDecRound(4, pTbl, ctx, ks)
+ incrementDec(3, aluCTR, aluTMP, aluK)
+ combinedDecRound(5, pTbl, ctx, ks)
+ incrementDec(4, aluCTR, aluTMP, aluK)
+ combinedDecRound(6, pTbl, ctx, ks)
+ incrementDec(5, aluCTR, aluTMP, aluK)
+ combinedDecRound(7, pTbl, ctx, ks)
+ incrementDec(6, aluCTR, aluTMP, aluK)
+
+ aesRound(8, ks)
+ incrementDec(7, aluCTR, aluTMP, aluK)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+
+ reduceRound(ACC0)
+ aesRound(9, ks)
+
+ reduceRound(ACC0)
+ PXOR(ACC1, ACC0)
+
+ MOVOU(Mem{Base: ks}.Offset(16*10), T0)
+ CMPQ(NR, Imm(12))
+ JB(LabelRef("decLast1"))
+ aesRnd(T0)
+ aesRound(11, ks)
+ MOVOU(Mem{Base: ks}.Offset(16*12), T0)
+ JE(LabelRef("decLast1"))
+ aesRnd(T0)
+ aesRound(13, ks)
+ MOVOU(Mem{Base: ks}.Offset(16*14), T0)
+}
+
+func decLast1(ctx, ptx GPPhysical) {
+ Label("decLast1")
+ aesRndLast(T0)
+
+ MOVOU(Mem{Base: ctx}.Offset(16*0), T0)
+ PXOR(T0, B0)
+ MOVOU(Mem{Base: ctx}.Offset(16*1), T0)
+ PXOR(T0, B1)
+ MOVOU(Mem{Base: ctx}.Offset(16*2), T0)
+ PXOR(T0, B2)
+ MOVOU(Mem{Base: ctx}.Offset(16*3), T0)
+ PXOR(T0, B3)
+ MOVOU(Mem{Base: ctx}.Offset(16*4), T0)
+ PXOR(T0, B4)
+ MOVOU(Mem{Base: ctx}.Offset(16*5), T0)
+ PXOR(T0, B5)
+ MOVOU(Mem{Base: ctx}.Offset(16*6), T0)
+ PXOR(T0, B6)
+ MOVOU(Mem{Base: ctx}.Offset(16*7), T0)
+ PXOR(T0, B7)
+
+ MOVOU(B0, Mem{Base: ptx}.Offset(16*0))
+ MOVOU(B1, Mem{Base: ptx}.Offset(16*1))
+ MOVOU(B2, Mem{Base: ptx}.Offset(16*2))
+ MOVOU(B3, Mem{Base: ptx}.Offset(16*3))
+ MOVOU(B4, Mem{Base: ptx}.Offset(16*4))
+ MOVOU(B5, Mem{Base: ptx}.Offset(16*5))
+ MOVOU(B6, Mem{Base: ptx}.Offset(16*6))
+ MOVOU(B7, Mem{Base: ptx}.Offset(16*7))
+
+ LEAQ(Mem{Base: ptx}.Offset(128), ptx)
+ LEAQ(Mem{Base: ctx}.Offset(128), ctx)
+
+ JMP(LabelRef("gcmAesDecOctetsLoop"))
+}
+
+func gcmAesDecEndOctets(aluCTR GPPhysical) {
+ Label("gcmAesDecEndOctets")
+	// Hack to get Avo to emit:
+ // SUBQ $7, aluCTR
+ Instruction(&ir.Instruction{Opcode: "SUBQ", Operands: []Op{Imm(7), aluCTR}})
+}
+
+func gcmAesDecSingles(pTbl, ks GPPhysical) {
+ Label("gcmAesDecSingles")
+
+ MOVOU(Mem{Base: ks}.Offset(16*1), B1)
+ MOVOU(Mem{Base: ks}.Offset(16*2), B2)
+ MOVOU(Mem{Base: ks}.Offset(16*3), B3)
+ MOVOU(Mem{Base: ks}.Offset(16*4), B4)
+ MOVOU(Mem{Base: ks}.Offset(16*5), B5)
+ MOVOU(Mem{Base: ks}.Offset(16*6), B6)
+ MOVOU(Mem{Base: ks}.Offset(16*7), B7)
+
+ MOVOU(Mem{Base: pTbl}.Offset(16*14), T2)
+}
+
+func gcmAesDecSinglesLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
+ Label("gcmAesDecSinglesLoop")
+
+ CMPQ(ptxLen, Imm(16))
+ JB(LabelRef("gcmAesDecTail"))
+ SUBQ(Imm(16), ptxLen)
+
+ MOVOU(Mem{Base: ctx}, B0)
+ MOVOU(B0, T1)
+ PSHUFB(BSWAP, B0)
+ PXOR(ACC0, B0)
+
+ MOVOU(T2, ACC0)
+ MOVOU(T2, ACC1)
+ MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
+
+ PCLMULQDQ(Imm(0x00), B0, ACC0)
+ PCLMULQDQ(Imm(0x11), B0, ACC1)
+ PSHUFD(Imm(78), B0, T0)
+ PXOR(B0, T0)
+ PCLMULQDQ(Imm(0x00), T0, ACCM)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR(ACC1, ACC0)
+
+ MOVOU(Mem{Base: SP}.Offset(0*16), B0)
+ incrementDec(0, aluCTR, aluTMP, aluK)
+ AESENC(B1, B0)
+ AESENC(B2, B0)
+ AESENC(B3, B0)
+ AESENC(B4, B0)
+ AESENC(B5, B0)
+ AESENC(B6, B0)
+ AESENC(B7, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*8), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*9), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*10), T0)
+ CMPQ(NR, Imm(12))
+ JB(LabelRef("decLast2"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*11), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*12), T0)
+ JE(LabelRef("decLast2"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*13), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*14), T0)
+}
+
+func decLast2(ctx, ptx GPPhysical) {
+ Label("decLast2")
+ AESENCLAST(T0, B0)
+
+ PXOR(T1, B0)
+ MOVOU(B0, Mem{Base: ptx})
+
+ LEAQ(Mem{Base: ptx}.Offset(16*1), ptx)
+ LEAQ(Mem{Base: ctx}.Offset(16*1), ctx)
+
+ JMP(LabelRef("gcmAesDecSinglesLoop"))
+}
+
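+// gcmAesDecTail masks the trailing partial ciphertext block with the andMask
+// entry for ptxLen, hashes it, then encrypts the final counter block to
+// decrypt it.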
+func gcmAesDecTail(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
+ Label("gcmAesDecTail")
+
+ TESTQ(ptxLen, ptxLen)
+ JE(LabelRef("gcmAesDecDone"))
+
+ // Hack to get Avo to emit:
+ // MOVQ ptxLen, aluTMP
+ Instruction(&ir.Instruction{Opcode: "MOVQ", Operands: []Op{ptxLen, aluTMP}})
+ // Hack to get Avo to emit:
+ // SHLQ $4, aluTMP
+ Instruction(&ir.Instruction{Opcode: "SHLQ", Operands: []Op{Imm(4), aluTMP}})
+
+ andMask := andMask_DATA()
+ // Hack to get Avo to emit:
+ // LEAQ andMask<>(SB), aluCTR
+ Instruction(&ir.Instruction{Opcode: "LEAQ", Operands: []Op{andMask, aluCTR}})
+ MOVOU(Mem{Base: aluCTR, Index: aluTMP, Scale: 1}.Offset(-16), T1)
+
+ MOVOU(Mem{Base: ctx}, B0)
+ PAND(T1, B0)
+
+ MOVOU(B0, T1)
+ PSHUFB(BSWAP, B0)
+ PXOR(ACC0, B0)
+
+ MOVOU(Mem{Base: pTbl}.Offset(16*14), ACC0)
+ MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
+ MOVOU(ACC0, ACC1)
+
+ PCLMULQDQ(Imm(0x00), B0, ACC0)
+ PCLMULQDQ(Imm(0x11), B0, ACC1)
+ PSHUFD(Imm(78), B0, T0)
+ PXOR(B0, T0)
+ PCLMULQDQ(Imm(0x00), T0, ACCM)
+
+ PXOR(ACC0, ACCM)
+ PXOR(ACC1, ACCM)
+ MOVOU(ACCM, T0)
+ PSRLDQ(Imm(8), ACCM)
+ PSLLDQ(Imm(8), T0)
+ PXOR(ACCM, ACC1)
+ PXOR(T0, ACC0)
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR(ACC1, ACC0)
+
+ MOVOU(Mem{Base: SP}.Offset(0*16), B0)
+ incrementDec(0, aluCTR, aluTMP, aluK)
+ AESENC(B1, B0)
+ AESENC(B2, B0)
+ AESENC(B3, B0)
+ AESENC(B4, B0)
+ AESENC(B5, B0)
+ AESENC(B6, B0)
+ AESENC(B7, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*8), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*9), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*10), T0)
+ CMPQ(NR, Imm(12))
+ JB(LabelRef("decLast3"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*11), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*12), T0)
+ JE(LabelRef("decLast3"))
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*13), T0)
+ AESENC(T0, B0)
+ MOVOU(Mem{Base: ks}.Offset(16*14), T0)
+}
+
+func decLast3() {
+ Label("decLast3")
+ AESENCLAST(T0, B0)
+ PXOR(T1, B0)
+}
+
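+// ptxStoreLoop writes the decrypted partial block back one byte at a time.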
+func ptxStoreLoop(ptx, ptxLen GPPhysical) {
+ Label("ptxStoreLoop")
+ PEXTRB(Imm(0), B0, Mem{Base: ptx})
+ PSRLDQ(Imm(1), B0)
+ LEAQ(Mem{Base: ptx}.Offset(1), ptx)
+ DECQ(ptxLen)
+
+ JNE(LabelRef("ptxStoreLoop"))
+}
+
+func gcmAesDecDone(tPtr GPPhysical) {
+ Label("gcmAesDecDone")
+ MOVOU(ACC0, Mem{Base: tPtr})
+ RET()
+}
+
+// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
+
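+// The *_DATA helpers below memoize their GLOBL references so repeated call
+// sites share a single emitted data section.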
+var bswapMask_DATA_ptr, gcmPoly_DATA_ptr, andMask_DATA_ptr *Mem
+
+func bswapMask_DATA() Mem {
+ if bswapMask_DATA_ptr != nil {
+ return *bswapMask_DATA_ptr
+ }
+
+ bswapMask := GLOBL("bswapMask", NOPTR|RODATA)
+ bswapMask_DATA_ptr = &bswapMask
+ DATA(0x00, U64(0x08090a0b0c0d0e0f))
+ DATA(0x08, U64(0x0001020304050607))
+
+ return bswapMask
+}
+
+func gcmPoly_DATA() Mem {
+ if gcmPoly_DATA_ptr != nil {
+ return *gcmPoly_DATA_ptr
+ }
+
+ gcmPoly := GLOBL("gcmPoly", NOPTR|RODATA)
+ gcmPoly_DATA_ptr = &gcmPoly
+ DATA(0x00, U64(0x0000000000000001))
+ DATA(0x08, U64(0xc200000000000000))
+
+ return gcmPoly
+}
+
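+// andMask_K encodes, as pairs of 64-bit words, the masks that keep the low
+// 1 through 15 bytes of a 16-byte block; the tail code indexes it by the
+// number of remaining bytes.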
+var andMask_K = [30]uint64{
+ 0x00000000000000ff,
+ 0x0000000000000000,
+ 0x000000000000ffff,
+ 0x0000000000000000,
+ 0x0000000000ffffff,
+ 0x0000000000000000,
+ 0x00000000ffffffff,
+ 0x0000000000000000,
+ 0x000000ffffffffff,
+ 0x0000000000000000,
+ 0x0000ffffffffffff,
+ 0x0000000000000000,
+ 0x00ffffffffffffff,
+ 0x0000000000000000,
+ 0xffffffffffffffff,
+ 0x0000000000000000,
+ 0xffffffffffffffff,
+ 0x00000000000000ff,
+ 0xffffffffffffffff,
+ 0x000000000000ffff,
+ 0xffffffffffffffff,
+ 0x0000000000ffffff,
+ 0xffffffffffffffff,
+ 0x00000000ffffffff,
+ 0xffffffffffffffff,
+ 0x000000ffffffffff,
+ 0xffffffffffffffff,
+ 0x0000ffffffffffff,
+ 0xffffffffffffffff,
+ 0x00ffffffffffffff,
+}
+
+func andMask_DATA() Mem {
+ if andMask_DATA_ptr != nil {
+ return *andMask_DATA_ptr
+ }
+ andMask := GLOBL("andMask", NOPTR|RODATA)
+ andMask_DATA_ptr = &andMask
+
+ for i, k := range andMask_K {
+ DATA(i*8, U64(k))
+ }
+
+ return andMask
+}
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run gcm_amd64_asm.go -out ../../gcm_amd64.s -pkg aes. DO NOT EDIT.
//go:build !purego
-// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
-// The implementation uses some optimization as described in:
-// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
-// Instruction and its Usage for Computing the GCM Mode rev. 2.02
-// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
-// Hardware
-
#include "textflag.h"
-#define B0 X0
-#define B1 X1
-#define B2 X2
-#define B3 X3
-#define B4 X4
-#define B5 X5
-#define B6 X6
-#define B7 X7
-
-#define ACC0 X8
-#define ACC1 X9
-#define ACCM X10
-
-#define T0 X11
-#define T1 X12
-#define T2 X13
-#define POLY X14
-#define BSWAP X15
-
-DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
-DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
-
-DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
-DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
-
-DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
-DATA andMask<>+0x08(SB)/8, $0x0000000000000000
-DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
-DATA andMask<>+0x18(SB)/8, $0x0000000000000000
-DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
-DATA andMask<>+0x28(SB)/8, $0x0000000000000000
-DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
-DATA andMask<>+0x38(SB)/8, $0x0000000000000000
-DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
-DATA andMask<>+0x48(SB)/8, $0x0000000000000000
-DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
-DATA andMask<>+0x58(SB)/8, $0x0000000000000000
-DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
-DATA andMask<>+0x68(SB)/8, $0x0000000000000000
-DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0x78(SB)/8, $0x0000000000000000
-DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
-DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
-DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
-DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
-DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
-DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
-DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
-
-GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
-GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
-GLOBL andMask<>(SB), (NOPTR+RODATA), $240
-
-// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
-TEXT ·gcmAesFinish(SB),NOSPLIT,$0
-#define pTbl DI
-#define tMsk SI
-#define tPtr DX
-#define plen AX
-#define dlen CX
-
- MOVQ productTable+0(FP), pTbl
- MOVQ tagMask+8(FP), tMsk
- MOVQ T+16(FP), tPtr
- MOVQ pLen+24(FP), plen
- MOVQ dLen+32(FP), dlen
-
- MOVOU (tPtr), ACC0
- MOVOU (tMsk), T2
-
- MOVOU bswapMask<>(SB), BSWAP
- MOVOU gcmPoly<>(SB), POLY
-
- SHLQ $3, plen
- SHLQ $3, dlen
-
- MOVQ plen, B0
- PINSRQ $1, dlen, B0
-
- PXOR ACC0, B0
-
- MOVOU (16*14)(pTbl), ACC0
- MOVOU (16*15)(pTbl), ACCM
- MOVOU ACC0, ACC1
-
- PCLMULQDQ $0x00, B0, ACC0
- PCLMULQDQ $0x11, B0, ACC1
- PSHUFD $78, B0, T0
- PXOR B0, T0
- PCLMULQDQ $0x00, T0, ACCM
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
-
- MOVOU POLY, T0
- PCLMULQDQ $0x01, ACC0, T0
- PSHUFD $78, ACC0, ACC0
- PXOR T0, ACC0
-
- MOVOU POLY, T0
- PCLMULQDQ $0x01, ACC0, T0
- PSHUFD $78, ACC0, ACC0
- PXOR T0, ACC0
-
- PXOR ACC1, ACC0
-
- PSHUFB BSWAP, ACC0
- PXOR T2, ACC0
- MOVOU ACC0, (tPtr)
-
+// func gcmAesFinish(productTable *[256]byte, tagMask *[16]byte, T *[16]byte, pLen uint64, dLen uint64)
+// Requires: PCLMULQDQ, SSE2, SSE4.1, SSSE3
+TEXT ·gcmAesFinish(SB), NOSPLIT, $0-40
+ MOVQ productTable+0(FP), DI
+ MOVQ tagMask+8(FP), SI
+ MOVQ T+16(FP), DX
+ MOVQ pLen+24(FP), AX
+ MOVQ dLen+32(FP), CX
+ MOVOU (DX), X8
+ MOVOU (SI), X13
+ MOVOU bswapMask<>+0(SB), X15
+ MOVOU gcmPoly<>+0(SB), X14
+ SHLQ $0x03, AX
+ SHLQ $0x03, CX
+ MOVQ AX, X0
+ PINSRQ $0x01, CX, X0
+ PXOR X8, X0
+ MOVOU 224(DI), X8
+ MOVOU 240(DI), X10
+ MOVOU X8, X9
+ PCLMULQDQ $0x00, X0, X8
+ PCLMULQDQ $0x11, X0, X9
+ PSHUFD $0x4e, X0, X11
+ PXOR X0, X11
+ PCLMULQDQ $0x00, X11, X10
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
+ PSHUFB X15, X8
+ PXOR X13, X8
+ MOVOU X8, (DX)
RET
-#undef pTbl
-#undef tMsk
-#undef tPtr
-#undef plen
-#undef dlen
-
-// func gcmAesInit(productTable *[256]byte, ks []uint32)
-TEXT ·gcmAesInit(SB),NOSPLIT,$0
-#define dst DI
-#define KS SI
-#define NR DX
- MOVQ productTable+0(FP), dst
- MOVQ ks_base+8(FP), KS
- MOVQ ks_len+16(FP), NR
+DATA bswapMask<>+0(SB)/8, $0x08090a0b0c0d0e0f
+DATA bswapMask<>+8(SB)/8, $0x0001020304050607
+GLOBL bswapMask<>(SB), RODATA|NOPTR, $16
- SHRQ $2, NR
- DECQ NR
+DATA gcmPoly<>+0(SB)/8, $0x0000000000000001
+DATA gcmPoly<>+8(SB)/8, $0xc200000000000000
+GLOBL gcmPoly<>(SB), RODATA|NOPTR, $16
- MOVOU bswapMask<>(SB), BSWAP
- MOVOU gcmPoly<>(SB), POLY
+// func gcmAesInit(productTable *[256]byte, ks []uint32)
+// Requires: AES, PCLMULQDQ, SSE2, SSSE3
+TEXT ·gcmAesInit(SB), NOSPLIT, $0-32
+ MOVQ productTable+0(FP), DI
+ MOVQ ks_base+8(FP), SI
+ MOVQ ks_len+16(FP), DX
+ SHRQ $0x02, DX
+ DECQ DX
+ MOVOU bswapMask<>+0(SB), X15
+ MOVOU gcmPoly<>+0(SB), X14
// Encrypt block 0, with the AES key to generate the hash key H
- MOVOU (16*0)(KS), B0
- MOVOU (16*1)(KS), T0
- AESENC T0, B0
- MOVOU (16*2)(KS), T0
- AESENC T0, B0
- MOVOU (16*3)(KS), T0
- AESENC T0, B0
- MOVOU (16*4)(KS), T0
- AESENC T0, B0
- MOVOU (16*5)(KS), T0
- AESENC T0, B0
- MOVOU (16*6)(KS), T0
- AESENC T0, B0
- MOVOU (16*7)(KS), T0
- AESENC T0, B0
- MOVOU (16*8)(KS), T0
- AESENC T0, B0
- MOVOU (16*9)(KS), T0
- AESENC T0, B0
- MOVOU (16*10)(KS), T0
- CMPQ NR, $12
- JB initEncLast
- AESENC T0, B0
- MOVOU (16*11)(KS), T0
- AESENC T0, B0
- MOVOU (16*12)(KS), T0
- JE initEncLast
- AESENC T0, B0
- MOVOU (16*13)(KS), T0
- AESENC T0, B0
- MOVOU (16*14)(KS), T0
+ MOVOU (SI), X0
+ MOVOU 16(SI), X11
+ AESENC X11, X0
+ MOVOU 32(SI), X11
+ AESENC X11, X0
+ MOVOU 48(SI), X11
+ AESENC X11, X0
+ MOVOU 64(SI), X11
+ AESENC X11, X0
+ MOVOU 80(SI), X11
+ AESENC X11, X0
+ MOVOU 96(SI), X11
+ AESENC X11, X0
+ MOVOU 112(SI), X11
+ AESENC X11, X0
+ MOVOU 128(SI), X11
+ AESENC X11, X0
+ MOVOU 144(SI), X11
+ AESENC X11, X0
+ MOVOU 160(SI), X11
+ CMPQ DX, $0x0c
+ JB initEncLast
+ AESENC X11, X0
+ MOVOU 176(SI), X11
+ AESENC X11, X0
+ MOVOU 192(SI), X11
+ JE initEncLast
+ AESENC X11, X0
+ MOVOU 208(SI), X11
+ AESENC X11, X0
+ MOVOU 224(SI), X11
+
initEncLast:
- AESENCLAST T0, B0
+ AESENCLAST X11, X0
+ PSHUFB X15, X0
- PSHUFB BSWAP, B0
// H * 2
- PSHUFD $0xff, B0, T0
- MOVOU B0, T1
- PSRAL $31, T0
- PAND POLY, T0
- PSRLL $31, T1
- PSLLDQ $4, T1
- PSLLL $1, B0
- PXOR T0, B0
- PXOR T1, B0
+ PSHUFD $0xff, X0, X11
+ MOVOU X0, X12
+ PSRAL $0x1f, X11
+ PAND X14, X11
+ PSRLL $0x1f, X12
+ PSLLDQ $0x04, X12
+ PSLLL $0x01, X0
+ PXOR X11, X0
+ PXOR X12, X0
+
// Karatsuba pre-computations
- MOVOU B0, (16*14)(dst)
- PSHUFD $78, B0, B1
- PXOR B0, B1
- MOVOU B1, (16*15)(dst)
+ MOVOU X0, 224(DI)
+ PSHUFD $0x4e, X0, X1
+ PXOR X0, X1
+ MOVOU X1, 240(DI)
+ MOVOU X0, X2
+ MOVOU X1, X3
- MOVOU B0, B2
- MOVOU B1, B3
// Now prepare powers of H and pre-computations for them
- MOVQ $7, AX
+ MOVQ $0x00000007, AX
initLoop:
- MOVOU B2, T0
- MOVOU B2, T1
- MOVOU B3, T2
- PCLMULQDQ $0x00, B0, T0
- PCLMULQDQ $0x11, B0, T1
- PCLMULQDQ $0x00, B1, T2
-
- PXOR T0, T2
- PXOR T1, T2
- MOVOU T2, B4
- PSLLDQ $8, B4
- PSRLDQ $8, T2
- PXOR B4, T0
- PXOR T2, T1
-
- MOVOU POLY, B2
- PCLMULQDQ $0x01, T0, B2
- PSHUFD $78, T0, T0
- PXOR B2, T0
- MOVOU POLY, B2
- PCLMULQDQ $0x01, T0, B2
- PSHUFD $78, T0, T0
- PXOR T0, B2
- PXOR T1, B2
-
- MOVOU B2, (16*12)(dst)
- PSHUFD $78, B2, B3
- PXOR B2, B3
- MOVOU B3, (16*13)(dst)
-
- DECQ AX
- LEAQ (-16*2)(dst), dst
- JNE initLoop
-
+ MOVOU X2, X11
+ MOVOU X2, X12
+ MOVOU X3, X13
+ PCLMULQDQ $0x00, X0, X11
+ PCLMULQDQ $0x11, X0, X12
+ PCLMULQDQ $0x00, X1, X13
+ PXOR X11, X13
+ PXOR X12, X13
+ MOVOU X13, X4
+ PSLLDQ $0x08, X4
+ PSRLDQ $0x08, X13
+ PXOR X4, X11
+ PXOR X13, X12
+ MOVOU X14, X2
+ PCLMULQDQ $0x01, X11, X2
+ PSHUFD $0x4e, X11, X11
+ PXOR X2, X11
+ MOVOU X14, X2
+ PCLMULQDQ $0x01, X11, X2
+ PSHUFD $0x4e, X11, X11
+ PXOR X11, X2
+ PXOR X12, X2
+ MOVOU X2, 192(DI)
+ PSHUFD $0x4e, X2, X3
+ PXOR X2, X3
+ MOVOU X3, 208(DI)
+ DECQ AX
+ LEAQ -32(DI), DI
+ JNE initLoop
RET
-#undef NR
-#undef KS
-#undef dst
// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
-TEXT ·gcmAesData(SB),NOSPLIT,$0
-#define pTbl DI
-#define aut SI
-#define tPtr CX
-#define autLen DX
-
-#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
-#define mulRoundAAD(X ,i) \
- MOVOU (16*(i*2))(pTbl), T1;\
- MOVOU T1, T2;\
- PCLMULQDQ $0x00, X, T1;\
- PXOR T1, ACC0;\
- PCLMULQDQ $0x11, X, T2;\
- PXOR T2, ACC1;\
- PSHUFD $78, X, T1;\
- PXOR T1, X;\
- MOVOU (16*(i*2+1))(pTbl), T1;\
- PCLMULQDQ $0x00, X, T1;\
- PXOR T1, ACCM
-
- MOVQ productTable+0(FP), pTbl
- MOVQ data_base+8(FP), aut
- MOVQ data_len+16(FP), autLen
- MOVQ T+32(FP), tPtr
-
- PXOR ACC0, ACC0
- MOVOU bswapMask<>(SB), BSWAP
- MOVOU gcmPoly<>(SB), POLY
-
- TESTQ autLen, autLen
- JEQ dataBail
-
- CMPQ autLen, $13 // optimize the TLS case
- JE dataTLS
- CMPQ autLen, $128
- JB startSinglesLoop
- JMP dataOctaLoop
+// Requires: PCLMULQDQ, SSE2, SSE4.1, SSSE3
+TEXT ·gcmAesData(SB), NOSPLIT, $0-40
+ MOVQ productTable+0(FP), DI
+ MOVQ data_base+8(FP), SI
+ MOVQ data_len+16(FP), DX
+ MOVQ T+32(FP), CX
+ PXOR X8, X8
+ MOVOU bswapMask<>+0(SB), X15
+ MOVOU gcmPoly<>+0(SB), X14
+ TESTQ DX, DX
+ JEQ dataBail
+ CMPQ DX, $0x0d
+ JE dataTLS
+ CMPQ DX, $0x80
+ JB startSinglesLoop
+ JMP dataOctaLoop
dataTLS:
- MOVOU (16*14)(pTbl), T1
- MOVOU (16*15)(pTbl), T2
- PXOR B0, B0
- MOVQ (aut), B0
- PINSRD $2, 8(aut), B0
- PINSRB $12, 12(aut), B0
- XORQ autLen, autLen
- JMP dataMul
+ MOVOU 224(DI), X12
+ MOVOU 240(DI), X13
+ PXOR X0, X0
+ MOVQ (SI), X0
+ PINSRD $0x02, 8(SI), X0
+ PINSRB $0x0c, 12(SI), X0
+ XORQ DX, DX
+ JMP dataMul
dataOctaLoop:
- CMPQ autLen, $128
- JB startSinglesLoop
- SUBQ $128, autLen
-
- MOVOU (16*0)(aut), X0
- MOVOU (16*1)(aut), X1
- MOVOU (16*2)(aut), X2
- MOVOU (16*3)(aut), X3
- MOVOU (16*4)(aut), X4
- MOVOU (16*5)(aut), X5
- MOVOU (16*6)(aut), X6
- MOVOU (16*7)(aut), X7
- LEAQ (16*8)(aut), aut
- PSHUFB BSWAP, X0
- PSHUFB BSWAP, X1
- PSHUFB BSWAP, X2
- PSHUFB BSWAP, X3
- PSHUFB BSWAP, X4
- PSHUFB BSWAP, X5
- PSHUFB BSWAP, X6
- PSHUFB BSWAP, X7
- PXOR ACC0, X0
-
- MOVOU (16*0)(pTbl), ACC0
- MOVOU (16*1)(pTbl), ACCM
- MOVOU ACC0, ACC1
- PSHUFD $78, X0, T1
- PXOR X0, T1
- PCLMULQDQ $0x00, X0, ACC0
- PCLMULQDQ $0x11, X0, ACC1
- PCLMULQDQ $0x00, T1, ACCM
-
- mulRoundAAD(X1, 1)
- mulRoundAAD(X2, 2)
- mulRoundAAD(X3, 3)
- mulRoundAAD(X4, 4)
- mulRoundAAD(X5, 5)
- mulRoundAAD(X6, 6)
- mulRoundAAD(X7, 7)
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
- reduceRound(ACC0)
- reduceRound(ACC0)
- PXOR ACC1, ACC0
- JMP dataOctaLoop
+ CMPQ DX, $0x80
+ JB startSinglesLoop
+ SUBQ $0x80, DX
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU 32(SI), X2
+ MOVOU 48(SI), X3
+ MOVOU 64(SI), X4
+ MOVOU 80(SI), X5
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X7
+ LEAQ 128(SI), SI
+ PSHUFB X15, X0
+ PSHUFB X15, X1
+ PSHUFB X15, X2
+ PSHUFB X15, X3
+ PSHUFB X15, X4
+ PSHUFB X15, X5
+ PSHUFB X15, X6
+ PSHUFB X15, X7
+ PXOR X8, X0
+ MOVOU (DI), X8
+ MOVOU 16(DI), X10
+ MOVOU X8, X9
+ PSHUFD $0x4e, X0, X12
+ PXOR X0, X12
+ PCLMULQDQ $0x00, X0, X8
+ PCLMULQDQ $0x11, X0, X9
+ PCLMULQDQ $0x00, X12, X10
+ MOVOU 32(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X1, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X1, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X1, X12
+ PXOR X12, X1
+ MOVOU 48(DI), X12
+ PCLMULQDQ $0x00, X1, X12
+ PXOR X12, X10
+ MOVOU 64(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X2, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X2, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X2, X12
+ PXOR X12, X2
+ MOVOU 80(DI), X12
+ PCLMULQDQ $0x00, X2, X12
+ PXOR X12, X10
+ MOVOU 96(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X3, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X3, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X3, X12
+ PXOR X12, X3
+ MOVOU 112(DI), X12
+ PCLMULQDQ $0x00, X3, X12
+ PXOR X12, X10
+ MOVOU 128(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X4, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X4, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X4, X12
+ PXOR X12, X4
+ MOVOU 144(DI), X12
+ PCLMULQDQ $0x00, X4, X12
+ PXOR X12, X10
+ MOVOU 160(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X5, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X5, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X5, X12
+ PXOR X12, X5
+ MOVOU 176(DI), X12
+ PCLMULQDQ $0x00, X5, X12
+ PXOR X12, X10
+ MOVOU 192(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X6, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X6, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X6, X12
+ PXOR X12, X6
+ MOVOU 208(DI), X12
+ PCLMULQDQ $0x00, X6, X12
+ PXOR X12, X10
+ MOVOU 224(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X7, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X7, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X7, X12
+ PXOR X12, X7
+ MOVOU 240(DI), X12
+ PCLMULQDQ $0x00, X7, X12
+ PXOR X12, X10
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
+ JMP dataOctaLoop
startSinglesLoop:
- MOVOU (16*14)(pTbl), T1
- MOVOU (16*15)(pTbl), T2
+ MOVOU 224(DI), X12
+ MOVOU 240(DI), X13
dataSinglesLoop:
+ CMPQ DX, $0x10
+ JB dataEnd
+ SUBQ $0x10, DX
+ MOVOU (SI), X0
- CMPQ autLen, $16
- JB dataEnd
- SUBQ $16, autLen
-
- MOVOU (aut), B0
dataMul:
- PSHUFB BSWAP, B0
- PXOR ACC0, B0
-
- MOVOU T1, ACC0
- MOVOU T2, ACCM
- MOVOU T1, ACC1
-
- PSHUFD $78, B0, T0
- PXOR B0, T0
- PCLMULQDQ $0x00, B0, ACC0
- PCLMULQDQ $0x11, B0, ACC1
- PCLMULQDQ $0x00, T0, ACCM
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
-
- MOVOU POLY, T0
- PCLMULQDQ $0x01, ACC0, T0
- PSHUFD $78, ACC0, ACC0
- PXOR T0, ACC0
-
- MOVOU POLY, T0
- PCLMULQDQ $0x01, ACC0, T0
- PSHUFD $78, ACC0, ACC0
- PXOR T0, ACC0
- PXOR ACC1, ACC0
-
- LEAQ 16(aut), aut
-
- JMP dataSinglesLoop
+ PSHUFB X15, X0
+ PXOR X8, X0
+ MOVOU X12, X8
+ MOVOU X13, X10
+ MOVOU X12, X9
+ PSHUFD $0x4e, X0, X11
+ PXOR X0, X11
+ PCLMULQDQ $0x00, X0, X8
+ PCLMULQDQ $0x11, X0, X9
+ PCLMULQDQ $0x00, X11, X10
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
+ LEAQ 16(SI), SI
+ JMP dataSinglesLoop
dataEnd:
-
- TESTQ autLen, autLen
- JEQ dataBail
-
- PXOR B0, B0
- LEAQ -1(aut)(autLen*1), aut
+ TESTQ DX, DX
+ JEQ dataBail
+ PXOR X0, X0
+ LEAQ -1(SI)(DX*1), SI
dataLoadLoop:
-
- PSLLDQ $1, B0
- PINSRB $0, (aut), B0
-
- LEAQ -1(aut), aut
- DECQ autLen
- JNE dataLoadLoop
-
- JMP dataMul
+ PSLLDQ $0x01, X0
+ PINSRB $0x00, (SI), X0
+ LEAQ -1(SI), SI
+ DECQ DX
+ JNE dataLoadLoop
+ JMP dataMul
dataBail:
- MOVOU ACC0, (tPtr)
+ MOVOU X8, (CX)
RET
-#undef pTbl
-#undef aut
-#undef tPtr
-#undef autLen
-
-// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
-TEXT ·gcmAesEnc(SB),0,$256-96
-#define pTbl DI
-#define ctx DX
-#define ctrPtr CX
-#define ptx SI
-#define ks AX
-#define tPtr R8
-#define ptxLen R9
-#define aluCTR R10
-#define aluTMP R11
-#define aluK R12
-#define NR R13
-
-#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
-#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
-#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
-#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
-#define combinedRound(i) \
- MOVOU (16*i)(ks), T0;\
- AESENC T0, B0;\
- AESENC T0, B1;\
- AESENC T0, B2;\
- AESENC T0, B3;\
- MOVOU (16*(i*2))(pTbl), T1;\
- MOVOU T1, T2;\
- AESENC T0, B4;\
- AESENC T0, B5;\
- AESENC T0, B6;\
- AESENC T0, B7;\
- MOVOU (16*i)(SP), T0;\
- PCLMULQDQ $0x00, T0, T1;\
- PXOR T1, ACC0;\
- PSHUFD $78, T0, T1;\
- PCLMULQDQ $0x11, T0, T2;\
- PXOR T1, T0;\
- PXOR T2, ACC1;\
- MOVOU (16*(i*2+1))(pTbl), T2;\
- PCLMULQDQ $0x00, T2, T0;\
- PXOR T0, ACCM
-#define mulRound(i) \
- MOVOU (16*i)(SP), T0;\
- MOVOU (16*(i*2))(pTbl), T1;\
- MOVOU T1, T2;\
- PCLMULQDQ $0x00, T0, T1;\
- PXOR T1, ACC0;\
- PCLMULQDQ $0x11, T0, T2;\
- PXOR T2, ACC1;\
- PSHUFD $78, T0, T1;\
- PXOR T1, T0;\
- MOVOU (16*(i*2+1))(pTbl), T1;\
- PCLMULQDQ $0x00, T0, T1;\
- PXOR T1, ACCM
-
- MOVQ productTable+0(FP), pTbl
- MOVQ dst+8(FP), ctx
- MOVQ src_base+32(FP), ptx
- MOVQ src_len+40(FP), ptxLen
- MOVQ ctr+56(FP), ctrPtr
- MOVQ T+64(FP), tPtr
- MOVQ ks_base+72(FP), ks
- MOVQ ks_len+80(FP), NR
- SHRQ $2, NR
- DECQ NR
-
- MOVOU bswapMask<>(SB), BSWAP
- MOVOU gcmPoly<>(SB), POLY
-
- MOVOU (tPtr), ACC0
- PXOR ACC1, ACC1
- PXOR ACCM, ACCM
- MOVOU (ctrPtr), B0
- MOVL (3*4)(ctrPtr), aluCTR
- MOVOU (ks), T0
- MOVL (3*4)(ks), aluK
- BSWAPL aluCTR
- BSWAPL aluK
-
- PXOR B0, T0
- MOVOU T0, (8*16 + 0*16)(SP)
- increment(0)
-
- CMPQ ptxLen, $128
- JB gcmAesEncSingles
- SUBQ $128, ptxLen
+// func gcmAesEnc(productTable *[256]byte, dst []byte, src []byte, ctr *[16]byte, T *[16]byte, ks []uint32)
+// Requires: AES, PCLMULQDQ, SSE2, SSE4.1, SSSE3
+TEXT ·gcmAesEnc(SB), $256-96
+ MOVQ productTable+0(FP), DI
+ MOVQ dst_base+8(FP), DX
+ MOVQ src_base+32(FP), SI
+ MOVQ src_len+40(FP), R9
+ MOVQ ctr+56(FP), CX
+ MOVQ T+64(FP), R8
+ MOVQ ks_base+72(FP), AX
+ MOVQ ks_len+80(FP), R13
+ SHRQ $0x02, R13
+ DECQ R13
+ MOVOU bswapMask<>+0(SB), X15
+ MOVOU gcmPoly<>+0(SB), X14
+ MOVOU (R8), X8
+ PXOR X9, X9
+ PXOR X10, X10
+ MOVOU (CX), X0
+ MOVL 12(CX), R10
+ MOVOU (AX), X11
+ MOVL 12(AX), R12
+ BSWAPL R10
+ BSWAPL R12
+ PXOR X0, X11
+ MOVOU X11, 128(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 140(SP)
+ CMPQ R9, $0x80
+ JB gcmAesEncSingles
+ SUBQ $0x80, R9
// We have at least 8 blocks to encrypt, prepare the rest of the counters
- MOVOU T0, (8*16 + 1*16)(SP)
- increment(1)
- MOVOU T0, (8*16 + 2*16)(SP)
- increment(2)
- MOVOU T0, (8*16 + 3*16)(SP)
- increment(3)
- MOVOU T0, (8*16 + 4*16)(SP)
- increment(4)
- MOVOU T0, (8*16 + 5*16)(SP)
- increment(5)
- MOVOU T0, (8*16 + 6*16)(SP)
- increment(6)
- MOVOU T0, (8*16 + 7*16)(SP)
- increment(7)
-
- MOVOU (8*16 + 0*16)(SP), B0
- MOVOU (8*16 + 1*16)(SP), B1
- MOVOU (8*16 + 2*16)(SP), B2
- MOVOU (8*16 + 3*16)(SP), B3
- MOVOU (8*16 + 4*16)(SP), B4
- MOVOU (8*16 + 5*16)(SP), B5
- MOVOU (8*16 + 6*16)(SP), B6
- MOVOU (8*16 + 7*16)(SP), B7
+ MOVOU X11, 144(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 156(SP)
+ MOVOU X11, 160(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 172(SP)
+ MOVOU X11, 176(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 188(SP)
+ MOVOU X11, 192(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 204(SP)
+ MOVOU X11, 208(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 220(SP)
+ MOVOU X11, 224(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 236(SP)
+ MOVOU X11, 240(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 252(SP)
+ MOVOU 128(SP), X0
+ MOVOU 144(SP), X1
+ MOVOU 160(SP), X2
+ MOVOU 176(SP), X3
+ MOVOU 192(SP), X4
+ MOVOU 208(SP), X5
+ MOVOU 224(SP), X6
+ MOVOU 240(SP), X7
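+	// AES rounds 1..9 over all eight blocks, with the counter-word updates
+	// for the next batch interleaved between rounds.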
+ MOVOU 16(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 140(SP)
+ MOVOU 32(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 156(SP)
+ MOVOU 48(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 172(SP)
+ MOVOU 64(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 188(SP)
+ MOVOU 80(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 204(SP)
+ MOVOU 96(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 220(SP)
+ MOVOU 112(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 236(SP)
+ MOVOU 128(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 252(SP)
+ MOVOU 144(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 160(AX), X11
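+	// Dispatch on the round count in R13: AES-128 (10 rounds) jumps straight
+	// to the final round, AES-192 (12) takes two more, AES-256 (14) four more.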
+ CMPQ R13, $0x0c
+ JB encLast1
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 176(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 192(AX), X11
+ JE encLast1
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 208(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 224(AX), X11
- aesRound(1)
- increment(0)
- aesRound(2)
- increment(1)
- aesRound(3)
- increment(2)
- aesRound(4)
- increment(3)
- aesRound(5)
- increment(4)
- aesRound(6)
- increment(5)
- aesRound(7)
- increment(6)
- aesRound(8)
- increment(7)
- aesRound(9)
- MOVOU (16*10)(ks), T0
- CMPQ NR, $12
- JB encLast1
- aesRnd(T0)
- aesRound(11)
- MOVOU (16*12)(ks), T0
- JE encLast1
- aesRnd(T0)
- aesRound(13)
- MOVOU (16*14)(ks), T0
encLast1:
- aesRndLast(T0)
-
- MOVOU (16*0)(ptx), T0
- PXOR T0, B0
- MOVOU (16*1)(ptx), T0
- PXOR T0, B1
- MOVOU (16*2)(ptx), T0
- PXOR T0, B2
- MOVOU (16*3)(ptx), T0
- PXOR T0, B3
- MOVOU (16*4)(ptx), T0
- PXOR T0, B4
- MOVOU (16*5)(ptx), T0
- PXOR T0, B5
- MOVOU (16*6)(ptx), T0
- PXOR T0, B6
- MOVOU (16*7)(ptx), T0
- PXOR T0, B7
-
- MOVOU B0, (16*0)(ctx)
- PSHUFB BSWAP, B0
- PXOR ACC0, B0
- MOVOU B1, (16*1)(ctx)
- PSHUFB BSWAP, B1
- MOVOU B2, (16*2)(ctx)
- PSHUFB BSWAP, B2
- MOVOU B3, (16*3)(ctx)
- PSHUFB BSWAP, B3
- MOVOU B4, (16*4)(ctx)
- PSHUFB BSWAP, B4
- MOVOU B5, (16*5)(ctx)
- PSHUFB BSWAP, B5
- MOVOU B6, (16*6)(ctx)
- PSHUFB BSWAP, B6
- MOVOU B7, (16*7)(ctx)
- PSHUFB BSWAP, B7
-
- MOVOU B0, (16*0)(SP)
- MOVOU B1, (16*1)(SP)
- MOVOU B2, (16*2)(SP)
- MOVOU B3, (16*3)(SP)
- MOVOU B4, (16*4)(SP)
- MOVOU B5, (16*5)(SP)
- MOVOU B6, (16*6)(SP)
- MOVOU B7, (16*7)(SP)
-
- LEAQ 128(ptx), ptx
- LEAQ 128(ctx), ctx
+ AESENCLAST X11, X0
+ AESENCLAST X11, X1
+ AESENCLAST X11, X2
+ AESENCLAST X11, X3
+ AESENCLAST X11, X4
+ AESENCLAST X11, X5
+ AESENCLAST X11, X6
+ AESENCLAST X11, X7
+ MOVOU (SI), X11
+ PXOR X11, X0
+ MOVOU 16(SI), X11
+ PXOR X11, X1
+ MOVOU 32(SI), X11
+ PXOR X11, X2
+ MOVOU 48(SI), X11
+ PXOR X11, X3
+ MOVOU 64(SI), X11
+ PXOR X11, X4
+ MOVOU 80(SI), X11
+ PXOR X11, X5
+ MOVOU 96(SI), X11
+ PXOR X11, X6
+ MOVOU 112(SI), X11
+ PXOR X11, X7
+ MOVOU X0, (DX)
+ PSHUFB X15, X0
+ PXOR X8, X0
+ MOVOU X1, 16(DX)
+ PSHUFB X15, X1
+ MOVOU X2, 32(DX)
+ PSHUFB X15, X2
+ MOVOU X3, 48(DX)
+ PSHUFB X15, X3
+ MOVOU X4, 64(DX)
+ PSHUFB X15, X4
+ MOVOU X5, 80(DX)
+ PSHUFB X15, X5
+ MOVOU X6, 96(DX)
+ PSHUFB X15, X6
+ MOVOU X7, 112(DX)
+ PSHUFB X15, X7
+ MOVOU X0, (SP)
+ MOVOU X1, 16(SP)
+ MOVOU X2, 32(SP)
+ MOVOU X3, 48(SP)
+ MOVOU X4, 64(SP)
+ MOVOU X5, 80(SP)
+ MOVOU X6, 96(SP)
+ MOVOU X7, 112(SP)
+ LEAQ 128(SI), SI
+ LEAQ 128(DX), DX
gcmAesEncOctetsLoop:
+ CMPQ R9, $0x80
+ JB gcmAesEncOctetsEnd
+ SUBQ $0x80, R9
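+	// Combined loop: run the AES rounds on the next eight counter blocks
+	// while multiplying the eight previous ciphertext blocks (saved on the
+	// stack) by precomputed powers of H, accumulating the low/high/middle
+	// Karatsuba products in X8/X9/X10.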
+ MOVOU 128(SP), X0
+ MOVOU 144(SP), X1
+ MOVOU 160(SP), X2
+ MOVOU 176(SP), X3
+ MOVOU 192(SP), X4
+ MOVOU 208(SP), X5
+ MOVOU 224(SP), X6
+ MOVOU 240(SP), X7
+ MOVOU (SP), X11
+ PSHUFD $0x4e, X11, X12
+ PXOR X11, X12
+ MOVOU (DI), X8
+ MOVOU 16(DI), X10
+ MOVOU X8, X9
+ PCLMULQDQ $0x00, X12, X10
+ PCLMULQDQ $0x00, X11, X8
+ PCLMULQDQ $0x11, X11, X9
+ MOVOU 16(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 32(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 16(SP), X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 48(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 140(SP)
+ MOVOU 32(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 64(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 32(SP), X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 80(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 156(SP)
+ MOVOU 48(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 96(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 48(SP), X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 112(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 172(SP)
+ MOVOU 64(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 128(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 64(SP), X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 144(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 188(SP)
+ MOVOU 80(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 160(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 80(SP), X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 176(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 204(SP)
+ MOVOU 96(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 192(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 96(SP), X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 208(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 220(SP)
+ MOVOU 112(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 224(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 112(SP), X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 240(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 236(SP)
+ MOVOU 128(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 252(SP)
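+	// Fold the middle Karatsuba term into the 256-bit product in X8:X9, then
+	// reduce modulo the GCM polynomial (X14) in two folding steps that are
+	// interleaved with the remaining AES rounds.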
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU 144(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
+ MOVOU 160(AX), X11
+ CMPQ R13, $0x0c
+ JB encLast2
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 176(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 192(AX), X11
+ JE encLast2
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 208(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 224(AX), X11
- CMPQ ptxLen, $128
- JB gcmAesEncOctetsEnd
- SUBQ $128, ptxLen
-
- MOVOU (8*16 + 0*16)(SP), B0
- MOVOU (8*16 + 1*16)(SP), B1
- MOVOU (8*16 + 2*16)(SP), B2
- MOVOU (8*16 + 3*16)(SP), B3
- MOVOU (8*16 + 4*16)(SP), B4
- MOVOU (8*16 + 5*16)(SP), B5
- MOVOU (8*16 + 6*16)(SP), B6
- MOVOU (8*16 + 7*16)(SP), B7
-
- MOVOU (16*0)(SP), T0
- PSHUFD $78, T0, T1
- PXOR T0, T1
-
- MOVOU (16*0)(pTbl), ACC0
- MOVOU (16*1)(pTbl), ACCM
- MOVOU ACC0, ACC1
-
- PCLMULQDQ $0x00, T1, ACCM
- PCLMULQDQ $0x00, T0, ACC0
- PCLMULQDQ $0x11, T0, ACC1
-
- combinedRound(1)
- increment(0)
- combinedRound(2)
- increment(1)
- combinedRound(3)
- increment(2)
- combinedRound(4)
- increment(3)
- combinedRound(5)
- increment(4)
- combinedRound(6)
- increment(5)
- combinedRound(7)
- increment(6)
-
- aesRound(8)
- increment(7)
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
-
- reduceRound(ACC0)
- aesRound(9)
-
- reduceRound(ACC0)
- PXOR ACC1, ACC0
-
- MOVOU (16*10)(ks), T0
- CMPQ NR, $12
- JB encLast2
- aesRnd(T0)
- aesRound(11)
- MOVOU (16*12)(ks), T0
- JE encLast2
- aesRnd(T0)
- aesRound(13)
- MOVOU (16*14)(ks), T0
encLast2:
- aesRndLast(T0)
-
- MOVOU (16*0)(ptx), T0
- PXOR T0, B0
- MOVOU (16*1)(ptx), T0
- PXOR T0, B1
- MOVOU (16*2)(ptx), T0
- PXOR T0, B2
- MOVOU (16*3)(ptx), T0
- PXOR T0, B3
- MOVOU (16*4)(ptx), T0
- PXOR T0, B4
- MOVOU (16*5)(ptx), T0
- PXOR T0, B5
- MOVOU (16*6)(ptx), T0
- PXOR T0, B6
- MOVOU (16*7)(ptx), T0
- PXOR T0, B7
-
- MOVOU B0, (16*0)(ctx)
- PSHUFB BSWAP, B0
- PXOR ACC0, B0
- MOVOU B1, (16*1)(ctx)
- PSHUFB BSWAP, B1
- MOVOU B2, (16*2)(ctx)
- PSHUFB BSWAP, B2
- MOVOU B3, (16*3)(ctx)
- PSHUFB BSWAP, B3
- MOVOU B4, (16*4)(ctx)
- PSHUFB BSWAP, B4
- MOVOU B5, (16*5)(ctx)
- PSHUFB BSWAP, B5
- MOVOU B6, (16*6)(ctx)
- PSHUFB BSWAP, B6
- MOVOU B7, (16*7)(ctx)
- PSHUFB BSWAP, B7
-
- MOVOU B0, (16*0)(SP)
- MOVOU B1, (16*1)(SP)
- MOVOU B2, (16*2)(SP)
- MOVOU B3, (16*3)(SP)
- MOVOU B4, (16*4)(SP)
- MOVOU B5, (16*5)(SP)
- MOVOU B6, (16*6)(SP)
- MOVOU B7, (16*7)(SP)
-
- LEAQ 128(ptx), ptx
- LEAQ 128(ctx), ctx
-
- JMP gcmAesEncOctetsLoop
+ AESENCLAST X11, X0
+ AESENCLAST X11, X1
+ AESENCLAST X11, X2
+ AESENCLAST X11, X3
+ AESENCLAST X11, X4
+ AESENCLAST X11, X5
+ AESENCLAST X11, X6
+ AESENCLAST X11, X7
+ MOVOU (SI), X11
+ PXOR X11, X0
+ MOVOU 16(SI), X11
+ PXOR X11, X1
+ MOVOU 32(SI), X11
+ PXOR X11, X2
+ MOVOU 48(SI), X11
+ PXOR X11, X3
+ MOVOU 64(SI), X11
+ PXOR X11, X4
+ MOVOU 80(SI), X11
+ PXOR X11, X5
+ MOVOU 96(SI), X11
+ PXOR X11, X6
+ MOVOU 112(SI), X11
+ PXOR X11, X7
+ MOVOU X0, (DX)
+ PSHUFB X15, X0
+ PXOR X8, X0
+ MOVOU X1, 16(DX)
+ PSHUFB X15, X1
+ MOVOU X2, 32(DX)
+ PSHUFB X15, X2
+ MOVOU X3, 48(DX)
+ PSHUFB X15, X3
+ MOVOU X4, 64(DX)
+ PSHUFB X15, X4
+ MOVOU X5, 80(DX)
+ PSHUFB X15, X5
+ MOVOU X6, 96(DX)
+ PSHUFB X15, X6
+ MOVOU X7, 112(DX)
+ PSHUFB X15, X7
+ MOVOU X0, (SP)
+ MOVOU X1, 16(SP)
+ MOVOU X2, 32(SP)
+ MOVOU X3, 48(SP)
+ MOVOU X4, 64(SP)
+ MOVOU X5, 80(SP)
+ MOVOU X6, 96(SP)
+ MOVOU X7, 112(SP)
+ LEAQ 128(SI), SI
+ LEAQ 128(DX), DX
+ JMP gcmAesEncOctetsLoop
gcmAesEncOctetsEnd:
-
- MOVOU (16*0)(SP), T0
- MOVOU (16*0)(pTbl), ACC0
- MOVOU (16*1)(pTbl), ACCM
- MOVOU ACC0, ACC1
- PSHUFD $78, T0, T1
- PXOR T0, T1
- PCLMULQDQ $0x00, T0, ACC0
- PCLMULQDQ $0x11, T0, ACC1
- PCLMULQDQ $0x00, T1, ACCM
-
- mulRound(1)
- mulRound(2)
- mulRound(3)
- mulRound(4)
- mulRound(5)
- mulRound(6)
- mulRound(7)
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
-
- reduceRound(ACC0)
- reduceRound(ACC0)
- PXOR ACC1, ACC0
-
- TESTQ ptxLen, ptxLen
- JE gcmAesEncDone
-
- SUBQ $7, aluCTR
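+	// GHASH the final eight ciphertext blocks saved on the stack; no new
+	// keystream is generated here.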
+ MOVOU (SP), X11
+ MOVOU (DI), X8
+ MOVOU 16(DI), X10
+ MOVOU X8, X9
+ PSHUFD $0x4e, X11, X12
+ PXOR X11, X12
+ PCLMULQDQ $0x00, X11, X8
+ PCLMULQDQ $0x11, X11, X9
+ PCLMULQDQ $0x00, X12, X10
+ MOVOU 16(SP), X11
+ MOVOU 32(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X11, X12
+ PXOR X12, X11
+ MOVOU 48(DI), X12
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X10
+ MOVOU 32(SP), X11
+ MOVOU 64(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X11, X12
+ PXOR X12, X11
+ MOVOU 80(DI), X12
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X10
+ MOVOU 48(SP), X11
+ MOVOU 96(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X11, X12
+ PXOR X12, X11
+ MOVOU 112(DI), X12
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X10
+ MOVOU 64(SP), X11
+ MOVOU 128(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X11, X12
+ PXOR X12, X11
+ MOVOU 144(DI), X12
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X10
+ MOVOU 80(SP), X11
+ MOVOU 160(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X11, X12
+ PXOR X12, X11
+ MOVOU 176(DI), X12
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X10
+ MOVOU 96(SP), X11
+ MOVOU 192(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X11, X12
+ PXOR X12, X11
+ MOVOU 208(DI), X12
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X10
+ MOVOU 112(SP), X11
+ MOVOU 224(DI), X12
+ MOVOU X12, X13
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X13, X9
+ PSHUFD $0x4e, X11, X12
+ PXOR X12, X11
+ MOVOU 240(DI), X12
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X10
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
+ TESTQ R9, R9
+ JE gcmAesEncDone
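+	// Seven of the prepared counter blocks were never consumed, so back the
+	// counter up before the single-block path.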
+ SUBQ $0x07, R10
gcmAesEncSingles:
-
- MOVOU (16*1)(ks), B1
- MOVOU (16*2)(ks), B2
- MOVOU (16*3)(ks), B3
- MOVOU (16*4)(ks), B4
- MOVOU (16*5)(ks), B5
- MOVOU (16*6)(ks), B6
- MOVOU (16*7)(ks), B7
-
- MOVOU (16*14)(pTbl), T2
+ MOVOU 16(AX), X1
+ MOVOU 32(AX), X2
+ MOVOU 48(AX), X3
+ MOVOU 64(AX), X4
+ MOVOU 80(AX), X5
+ MOVOU 96(AX), X6
+ MOVOU 112(AX), X7
+ MOVOU 224(DI), X13
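+	// Single-block path: round keys 1..7 stay cached in X1..X7 and the
+	// productTable entries for H at 224(DI)/240(DI) are used per block;
+	// each iteration encrypts one counter block and folds one ciphertext
+	// block into the GHASH state.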
gcmAesEncSinglesLoop:
+ CMPQ R9, $0x10
+ JB gcmAesEncTail
+ SUBQ $0x10, R9
+ MOVOU 128(SP), X0
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 140(SP)
+ AESENC X1, X0
+ AESENC X2, X0
+ AESENC X3, X0
+ AESENC X4, X0
+ AESENC X5, X0
+ AESENC X6, X0
+ AESENC X7, X0
+ MOVOU 128(AX), X11
+ AESENC X11, X0
+ MOVOU 144(AX), X11
+ AESENC X11, X0
+ MOVOU 160(AX), X11
+ CMPQ R13, $0x0c
+ JB encLast3
+ AESENC X11, X0
+ MOVOU 176(AX), X11
+ AESENC X11, X0
+ MOVOU 192(AX), X11
+ JE encLast3
+ AESENC X11, X0
+ MOVOU 208(AX), X11
+ AESENC X11, X0
+ MOVOU 224(AX), X11
- CMPQ ptxLen, $16
- JB gcmAesEncTail
- SUBQ $16, ptxLen
-
- MOVOU (8*16 + 0*16)(SP), B0
- increment(0)
-
- AESENC B1, B0
- AESENC B2, B0
- AESENC B3, B0
- AESENC B4, B0
- AESENC B5, B0
- AESENC B6, B0
- AESENC B7, B0
- MOVOU (16*8)(ks), T0
- AESENC T0, B0
- MOVOU (16*9)(ks), T0
- AESENC T0, B0
- MOVOU (16*10)(ks), T0
- CMPQ NR, $12
- JB encLast3
- AESENC T0, B0
- MOVOU (16*11)(ks), T0
- AESENC T0, B0
- MOVOU (16*12)(ks), T0
- JE encLast3
- AESENC T0, B0
- MOVOU (16*13)(ks), T0
- AESENC T0, B0
- MOVOU (16*14)(ks), T0
encLast3:
- AESENCLAST T0, B0
-
- MOVOU (ptx), T0
- PXOR T0, B0
- MOVOU B0, (ctx)
-
- PSHUFB BSWAP, B0
- PXOR ACC0, B0
-
- MOVOU T2, ACC0
- MOVOU T2, ACC1
- MOVOU (16*15)(pTbl), ACCM
-
- PSHUFD $78, B0, T0
- PXOR B0, T0
- PCLMULQDQ $0x00, B0, ACC0
- PCLMULQDQ $0x11, B0, ACC1
- PCLMULQDQ $0x00, T0, ACCM
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
-
- reduceRound(ACC0)
- reduceRound(ACC0)
- PXOR ACC1, ACC0
-
- LEAQ (16*1)(ptx), ptx
- LEAQ (16*1)(ctx), ctx
-
- JMP gcmAesEncSinglesLoop
+ AESENCLAST X11, X0
+ MOVOU (SI), X11
+ PXOR X11, X0
+ MOVOU X0, (DX)
+ PSHUFB X15, X0
+ PXOR X8, X0
+ MOVOU X13, X8
+ MOVOU X13, X9
+ MOVOU 240(DI), X10
+ PSHUFD $0x4e, X0, X11
+ PXOR X0, X11
+ PCLMULQDQ $0x00, X0, X8
+ PCLMULQDQ $0x11, X0, X9
+ PCLMULQDQ $0x00, X11, X10
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
+ LEAQ 16(SI), SI
+ LEAQ 16(DX), DX
+ JMP gcmAesEncSinglesLoop
gcmAesEncTail:
- TESTQ ptxLen, ptxLen
- JE gcmAesEncDone
+ TESTQ R9, R9
+ JE gcmAesEncDone
+ MOVOU 128(SP), X0
+ AESENC X1, X0
+ AESENC X2, X0
+ AESENC X3, X0
+ AESENC X4, X0
+ AESENC X5, X0
+ AESENC X6, X0
+ AESENC X7, X0
+ MOVOU 128(AX), X11
+ AESENC X11, X0
+ MOVOU 144(AX), X11
+ AESENC X11, X0
+ MOVOU 160(AX), X11
+ CMPQ R13, $0x0c
+ JB encLast4
+ AESENC X11, X0
+ MOVOU 176(AX), X11
+ AESENC X11, X0
+ MOVOU 192(AX), X11
+ JE encLast4
+ AESENC X11, X0
+ MOVOU 208(AX), X11
+ AESENC X11, X0
+ MOVOU 224(AX), X11
- MOVOU (8*16 + 0*16)(SP), B0
- AESENC B1, B0
- AESENC B2, B0
- AESENC B3, B0
- AESENC B4, B0
- AESENC B5, B0
- AESENC B6, B0
- AESENC B7, B0
- MOVOU (16*8)(ks), T0
- AESENC T0, B0
- MOVOU (16*9)(ks), T0
- AESENC T0, B0
- MOVOU (16*10)(ks), T0
- CMPQ NR, $12
- JB encLast4
- AESENC T0, B0
- MOVOU (16*11)(ks), T0
- AESENC T0, B0
- MOVOU (16*12)(ks), T0
- JE encLast4
- AESENC T0, B0
- MOVOU (16*13)(ks), T0
- AESENC T0, B0
- MOVOU (16*14)(ks), T0
encLast4:
- AESENCLAST T0, B0
- MOVOU B0, T0
-
- LEAQ -1(ptx)(ptxLen*1), ptx
+ AESENCLAST X11, X0
+ MOVOU X0, X11
+ LEAQ -1(SI)(R9*1), SI
+ MOVQ R9, R11
+ SHLQ $0x04, R11
+ LEAQ andMask<>+0(SB), R10
+ MOVOU -16(R10)(R11*1), X12
+ PXOR X0, X0
- MOVQ ptxLen, aluTMP
- SHLQ $4, aluTMP
-
- LEAQ andMask<>(SB), aluCTR
- MOVOU -16(aluCTR)(aluTMP*1), T1
-
- PXOR B0, B0
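+	// Gather the remaining 1..15 plaintext bytes into X0 one at a time,
+	// last byte first, then XOR with the keystream and mask off the unused
+	// lanes; the full 16-byte store relies on the tag space that follows
+	// the ciphertext.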
ptxLoadLoop:
- PSLLDQ $1, B0
- PINSRB $0, (ptx), B0
- LEAQ -1(ptx), ptx
- DECQ ptxLen
- JNE ptxLoadLoop
-
- PXOR T0, B0
- PAND T1, B0
- MOVOU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT
-
- PSHUFB BSWAP, B0
- PXOR ACC0, B0
-
- MOVOU T2, ACC0
- MOVOU T2, ACC1
- MOVOU (16*15)(pTbl), ACCM
-
- PSHUFD $78, B0, T0
- PXOR B0, T0
- PCLMULQDQ $0x00, B0, ACC0
- PCLMULQDQ $0x11, B0, ACC1
- PCLMULQDQ $0x00, T0, ACCM
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
-
- reduceRound(ACC0)
- reduceRound(ACC0)
- PXOR ACC1, ACC0
+ PSLLDQ $0x01, X0
+ PINSRB $0x00, (SI), X0
+ LEAQ -1(SI), SI
+ DECQ R9
+ JNE ptxLoadLoop
+ PXOR X11, X0
+ PAND X12, X0
+ MOVOU X0, (DX)
+ PSHUFB X15, X0
+ PXOR X8, X0
+ MOVOU X13, X8
+ MOVOU X13, X9
+ MOVOU 240(DI), X10
+ PSHUFD $0x4e, X0, X11
+ PXOR X0, X11
+ PCLMULQDQ $0x00, X0, X8
+ PCLMULQDQ $0x11, X0, X9
+ PCLMULQDQ $0x00, X11, X10
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
gcmAesEncDone:
- MOVOU ACC0, (tPtr)
+ MOVOU X8, (R8)
RET
-#undef increment
-
-// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
-TEXT ·gcmAesDec(SB),0,$128-96
-#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
-#define combinedDecRound(i) \
- MOVOU (16*i)(ks), T0;\
- AESENC T0, B0;\
- AESENC T0, B1;\
- AESENC T0, B2;\
- AESENC T0, B3;\
- MOVOU (16*(i*2))(pTbl), T1;\
- MOVOU T1, T2;\
- AESENC T0, B4;\
- AESENC T0, B5;\
- AESENC T0, B6;\
- AESENC T0, B7;\
- MOVOU (16*i)(ctx), T0;\
- PSHUFB BSWAP, T0;\
- PCLMULQDQ $0x00, T0, T1;\
- PXOR T1, ACC0;\
- PSHUFD $78, T0, T1;\
- PCLMULQDQ $0x11, T0, T2;\
- PXOR T1, T0;\
- PXOR T2, ACC1;\
- MOVOU (16*(i*2+1))(pTbl), T2;\
- PCLMULQDQ $0x00, T2, T0;\
- PXOR T0, ACCM
-
- MOVQ productTable+0(FP), pTbl
- MOVQ dst+8(FP), ptx
- MOVQ src_base+32(FP), ctx
- MOVQ src_len+40(FP), ptxLen
- MOVQ ctr+56(FP), ctrPtr
- MOVQ T+64(FP), tPtr
- MOVQ ks_base+72(FP), ks
- MOVQ ks_len+80(FP), NR
-
- SHRQ $2, NR
- DECQ NR
-
- MOVOU bswapMask<>(SB), BSWAP
- MOVOU gcmPoly<>(SB), POLY
-
- MOVOU (tPtr), ACC0
- PXOR ACC1, ACC1
- PXOR ACCM, ACCM
- MOVOU (ctrPtr), B0
- MOVL (3*4)(ctrPtr), aluCTR
- MOVOU (ks), T0
- MOVL (3*4)(ks), aluK
- BSWAPL aluCTR
- BSWAPL aluK
-
- PXOR B0, T0
- MOVOU T0, (0*16)(SP)
- increment(0)
- CMPQ ptxLen, $128
- JB gcmAesDecSingles
-
- MOVOU T0, (1*16)(SP)
- increment(1)
- MOVOU T0, (2*16)(SP)
- increment(2)
- MOVOU T0, (3*16)(SP)
- increment(3)
- MOVOU T0, (4*16)(SP)
- increment(4)
- MOVOU T0, (5*16)(SP)
- increment(5)
- MOVOU T0, (6*16)(SP)
- increment(6)
- MOVOU T0, (7*16)(SP)
- increment(7)
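+// andMask<> holds fifteen 16-byte masks; entry i (counting from 1) keeps
+// the low i bytes of a block and zeroes the rest.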
+DATA andMask<>+0(SB)/8, $0x00000000000000ff
+DATA andMask<>+8(SB)/8, $0x0000000000000000
+DATA andMask<>+16(SB)/8, $0x000000000000ffff
+DATA andMask<>+24(SB)/8, $0x0000000000000000
+DATA andMask<>+32(SB)/8, $0x0000000000ffffff
+DATA andMask<>+40(SB)/8, $0x0000000000000000
+DATA andMask<>+48(SB)/8, $0x00000000ffffffff
+DATA andMask<>+56(SB)/8, $0x0000000000000000
+DATA andMask<>+64(SB)/8, $0x000000ffffffffff
+DATA andMask<>+72(SB)/8, $0x0000000000000000
+DATA andMask<>+80(SB)/8, $0x0000ffffffffffff
+DATA andMask<>+88(SB)/8, $0x0000000000000000
+DATA andMask<>+96(SB)/8, $0x00ffffffffffffff
+DATA andMask<>+104(SB)/8, $0x0000000000000000
+DATA andMask<>+112(SB)/8, $0xffffffffffffffff
+DATA andMask<>+120(SB)/8, $0x0000000000000000
+DATA andMask<>+128(SB)/8, $0xffffffffffffffff
+DATA andMask<>+136(SB)/8, $0x00000000000000ff
+DATA andMask<>+144(SB)/8, $0xffffffffffffffff
+DATA andMask<>+152(SB)/8, $0x000000000000ffff
+DATA andMask<>+160(SB)/8, $0xffffffffffffffff
+DATA andMask<>+168(SB)/8, $0x0000000000ffffff
+DATA andMask<>+176(SB)/8, $0xffffffffffffffff
+DATA andMask<>+184(SB)/8, $0x00000000ffffffff
+DATA andMask<>+192(SB)/8, $0xffffffffffffffff
+DATA andMask<>+200(SB)/8, $0x000000ffffffffff
+DATA andMask<>+208(SB)/8, $0xffffffffffffffff
+DATA andMask<>+216(SB)/8, $0x0000ffffffffffff
+DATA andMask<>+224(SB)/8, $0xffffffffffffffff
+DATA andMask<>+232(SB)/8, $0x00ffffffffffffff
+GLOBL andMask<>(SB), RODATA|NOPTR, $240
+
+// func gcmAesDec(productTable *[256]byte, dst []byte, src []byte, ctr *[16]byte, T *[16]byte, ks []uint32)
+// Requires: AES, PCLMULQDQ, SSE2, SSE4.1, SSSE3
+TEXT ·gcmAesDec(SB), $128-96
+ MOVQ productTable+0(FP), DI
+ MOVQ dst_base+8(FP), SI
+ MOVQ src_base+32(FP), DX
+ MOVQ src_len+40(FP), R9
+ MOVQ ctr+56(FP), CX
+ MOVQ T+64(FP), R8
+ MOVQ ks_base+72(FP), AX
+ MOVQ ks_len+80(FP), R13
+ SHRQ $0x02, R13
+ DECQ R13
+ MOVOU bswapMask<>+0(SB), X15
+ MOVOU gcmPoly<>+0(SB), X14
+ MOVOU (R8), X8
+ PXOR X9, X9
+ PXOR X10, X10
+ MOVOU (CX), X0
+ MOVL 12(CX), R10
+ MOVOU (AX), X11
+ MOVL 12(AX), R12
+ BSWAPL R10
+ BSWAPL R12
+ PXOR X0, X11
+ MOVOU X11, (SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 12(SP)
+ CMPQ R9, $0x80
+ JB gcmAesDecSingles
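+	// As on the encrypt side, prepare eight counter blocks at (SP).. with
+	// round key 0 pre-XORed, patching only the counter word per increment.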
+ MOVOU X11, 16(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 28(SP)
+ MOVOU X11, 32(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 44(SP)
+ MOVOU X11, 48(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 60(SP)
+ MOVOU X11, 64(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 76(SP)
+ MOVOU X11, 80(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 92(SP)
+ MOVOU X11, 96(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 108(SP)
+ MOVOU X11, 112(SP)
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 124(SP)
gcmAesDecOctetsLoop:
+ CMPQ R9, $0x80
+ JB gcmAesDecEndOctets
+ SUBQ $0x80, R9
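+	// Combined decrypt loop: GHASH reads the ciphertext blocks directly from
+	// src (byte-swapping each), so no stack staging is needed; the multiplies
+	// by powers of H are interleaved with the AES rounds as in encryption.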
+ MOVOU (SP), X0
+ MOVOU 16(SP), X1
+ MOVOU 32(SP), X2
+ MOVOU 48(SP), X3
+ MOVOU 64(SP), X4
+ MOVOU 80(SP), X5
+ MOVOU 96(SP), X6
+ MOVOU 112(SP), X7
+ MOVOU (DX), X11
+ PSHUFB X15, X11
+ PXOR X8, X11
+ PSHUFD $0x4e, X11, X12
+ PXOR X11, X12
+ MOVOU (DI), X8
+ MOVOU 16(DI), X10
+ MOVOU X8, X9
+ PCLMULQDQ $0x00, X12, X10
+ PCLMULQDQ $0x00, X11, X8
+ PCLMULQDQ $0x11, X11, X9
+ MOVOU 16(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 32(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 16(DX), X11
+ PSHUFB X15, X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 48(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 12(SP)
+ MOVOU 32(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 64(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 32(DX), X11
+ PSHUFB X15, X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 80(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 28(SP)
+ MOVOU 48(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 96(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 48(DX), X11
+ PSHUFB X15, X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 112(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 44(SP)
+ MOVOU 64(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 128(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 64(DX), X11
+ PSHUFB X15, X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 144(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 60(SP)
+ MOVOU 80(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 160(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 80(DX), X11
+ PSHUFB X15, X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 176(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 76(SP)
+ MOVOU 96(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 192(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 96(DX), X11
+ PSHUFB X15, X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 208(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 92(SP)
+ MOVOU 112(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ MOVOU 224(DI), X12
+ MOVOU X12, X13
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 112(DX), X11
+ PSHUFB X15, X11
+ PCLMULQDQ $0x00, X11, X12
+ PXOR X12, X8
+ PSHUFD $0x4e, X11, X12
+ PCLMULQDQ $0x11, X11, X13
+ PXOR X12, X11
+ PXOR X13, X9
+ MOVOU 240(DI), X13
+ PCLMULQDQ $0x00, X13, X11
+ PXOR X11, X10
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 108(SP)
+ MOVOU 128(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 124(SP)
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU 144(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
+ MOVOU 160(AX), X11
+ CMPQ R13, $0x0c
+ JB decLast1
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 176(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 192(AX), X11
+ JE decLast1
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 208(AX), X11
+ AESENC X11, X0
+ AESENC X11, X1
+ AESENC X11, X2
+ AESENC X11, X3
+ AESENC X11, X4
+ AESENC X11, X5
+ AESENC X11, X6
+ AESENC X11, X7
+ MOVOU 224(AX), X11
- CMPQ ptxLen, $128
- JB gcmAesDecEndOctets
- SUBQ $128, ptxLen
-
- MOVOU (0*16)(SP), B0
- MOVOU (1*16)(SP), B1
- MOVOU (2*16)(SP), B2
- MOVOU (3*16)(SP), B3
- MOVOU (4*16)(SP), B4
- MOVOU (5*16)(SP), B5
- MOVOU (6*16)(SP), B6
- MOVOU (7*16)(SP), B7
-
- MOVOU (16*0)(ctx), T0
- PSHUFB BSWAP, T0
- PXOR ACC0, T0
- PSHUFD $78, T0, T1
- PXOR T0, T1
-
- MOVOU (16*0)(pTbl), ACC0
- MOVOU (16*1)(pTbl), ACCM
- MOVOU ACC0, ACC1
-
- PCLMULQDQ $0x00, T1, ACCM
- PCLMULQDQ $0x00, T0, ACC0
- PCLMULQDQ $0x11, T0, ACC1
-
- combinedDecRound(1)
- increment(0)
- combinedDecRound(2)
- increment(1)
- combinedDecRound(3)
- increment(2)
- combinedDecRound(4)
- increment(3)
- combinedDecRound(5)
- increment(4)
- combinedDecRound(6)
- increment(5)
- combinedDecRound(7)
- increment(6)
-
- aesRound(8)
- increment(7)
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
-
- reduceRound(ACC0)
- aesRound(9)
-
- reduceRound(ACC0)
- PXOR ACC1, ACC0
-
- MOVOU (16*10)(ks), T0
- CMPQ NR, $12
- JB decLast1
- aesRnd(T0)
- aesRound(11)
- MOVOU (16*12)(ks), T0
- JE decLast1
- aesRnd(T0)
- aesRound(13)
- MOVOU (16*14)(ks), T0
decLast1:
- aesRndLast(T0)
-
- MOVOU (16*0)(ctx), T0
- PXOR T0, B0
- MOVOU (16*1)(ctx), T0
- PXOR T0, B1
- MOVOU (16*2)(ctx), T0
- PXOR T0, B2
- MOVOU (16*3)(ctx), T0
- PXOR T0, B3
- MOVOU (16*4)(ctx), T0
- PXOR T0, B4
- MOVOU (16*5)(ctx), T0
- PXOR T0, B5
- MOVOU (16*6)(ctx), T0
- PXOR T0, B6
- MOVOU (16*7)(ctx), T0
- PXOR T0, B7
-
- MOVOU B0, (16*0)(ptx)
- MOVOU B1, (16*1)(ptx)
- MOVOU B2, (16*2)(ptx)
- MOVOU B3, (16*3)(ptx)
- MOVOU B4, (16*4)(ptx)
- MOVOU B5, (16*5)(ptx)
- MOVOU B6, (16*6)(ptx)
- MOVOU B7, (16*7)(ptx)
-
- LEAQ 128(ptx), ptx
- LEAQ 128(ctx), ctx
-
- JMP gcmAesDecOctetsLoop
+ AESENCLAST X11, X0
+ AESENCLAST X11, X1
+ AESENCLAST X11, X2
+ AESENCLAST X11, X3
+ AESENCLAST X11, X4
+ AESENCLAST X11, X5
+ AESENCLAST X11, X6
+ AESENCLAST X11, X7
+ MOVOU (DX), X11
+ PXOR X11, X0
+ MOVOU 16(DX), X11
+ PXOR X11, X1
+ MOVOU 32(DX), X11
+ PXOR X11, X2
+ MOVOU 48(DX), X11
+ PXOR X11, X3
+ MOVOU 64(DX), X11
+ PXOR X11, X4
+ MOVOU 80(DX), X11
+ PXOR X11, X5
+ MOVOU 96(DX), X11
+ PXOR X11, X6
+ MOVOU 112(DX), X11
+ PXOR X11, X7
+ MOVOU X0, (SI)
+ MOVOU X1, 16(SI)
+ MOVOU X2, 32(SI)
+ MOVOU X3, 48(SI)
+ MOVOU X4, 64(SI)
+ MOVOU X5, 80(SI)
+ MOVOU X6, 96(SI)
+ MOVOU X7, 112(SI)
+ LEAQ 128(SI), SI
+ LEAQ 128(DX), DX
+ JMP gcmAesDecOctetsLoop
gcmAesDecEndOctets:
-
- SUBQ $7, aluCTR
+ SUBQ $0x07, R10
gcmAesDecSingles:
-
- MOVOU (16*1)(ks), B1
- MOVOU (16*2)(ks), B2
- MOVOU (16*3)(ks), B3
- MOVOU (16*4)(ks), B4
- MOVOU (16*5)(ks), B5
- MOVOU (16*6)(ks), B6
- MOVOU (16*7)(ks), B7
-
- MOVOU (16*14)(pTbl), T2
+ MOVOU 16(AX), X1
+ MOVOU 32(AX), X2
+ MOVOU 48(AX), X3
+ MOVOU 64(AX), X4
+ MOVOU 80(AX), X5
+ MOVOU 96(AX), X6
+ MOVOU 112(AX), X7
+ MOVOU 224(DI), X13
gcmAesDecSinglesLoop:
+ CMPQ R9, $0x10
+ JB gcmAesDecTail
+ SUBQ $0x10, R9
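+	// GHASH the ciphertext block first, then XOR it with the encrypted
+	// counter block to recover the plaintext.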
+ MOVOU (DX), X0
+ MOVOU X0, X12
+ PSHUFB X15, X0
+ PXOR X8, X0
+ MOVOU X13, X8
+ MOVOU X13, X9
+ MOVOU 240(DI), X10
+ PCLMULQDQ $0x00, X0, X8
+ PCLMULQDQ $0x11, X0, X9
+ PSHUFD $0x4e, X0, X11
+ PXOR X0, X11
+ PCLMULQDQ $0x00, X11, X10
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
+ MOVOU (SP), X0
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 12(SP)
+ AESENC X1, X0
+ AESENC X2, X0
+ AESENC X3, X0
+ AESENC X4, X0
+ AESENC X5, X0
+ AESENC X6, X0
+ AESENC X7, X0
+ MOVOU 128(AX), X11
+ AESENC X11, X0
+ MOVOU 144(AX), X11
+ AESENC X11, X0
+ MOVOU 160(AX), X11
+ CMPQ R13, $0x0c
+ JB decLast2
+ AESENC X11, X0
+ MOVOU 176(AX), X11
+ AESENC X11, X0
+ MOVOU 192(AX), X11
+ JE decLast2
+ AESENC X11, X0
+ MOVOU 208(AX), X11
+ AESENC X11, X0
+ MOVOU 224(AX), X11
- CMPQ ptxLen, $16
- JB gcmAesDecTail
- SUBQ $16, ptxLen
-
- MOVOU (ctx), B0
- MOVOU B0, T1
- PSHUFB BSWAP, B0
- PXOR ACC0, B0
-
- MOVOU T2, ACC0
- MOVOU T2, ACC1
- MOVOU (16*15)(pTbl), ACCM
-
- PCLMULQDQ $0x00, B0, ACC0
- PCLMULQDQ $0x11, B0, ACC1
- PSHUFD $78, B0, T0
- PXOR B0, T0
- PCLMULQDQ $0x00, T0, ACCM
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
-
- reduceRound(ACC0)
- reduceRound(ACC0)
- PXOR ACC1, ACC0
-
- MOVOU (0*16)(SP), B0
- increment(0)
- AESENC B1, B0
- AESENC B2, B0
- AESENC B3, B0
- AESENC B4, B0
- AESENC B5, B0
- AESENC B6, B0
- AESENC B7, B0
- MOVOU (16*8)(ks), T0
- AESENC T0, B0
- MOVOU (16*9)(ks), T0
- AESENC T0, B0
- MOVOU (16*10)(ks), T0
- CMPQ NR, $12
- JB decLast2
- AESENC T0, B0
- MOVOU (16*11)(ks), T0
- AESENC T0, B0
- MOVOU (16*12)(ks), T0
- JE decLast2
- AESENC T0, B0
- MOVOU (16*13)(ks), T0
- AESENC T0, B0
- MOVOU (16*14)(ks), T0
decLast2:
- AESENCLAST T0, B0
-
- PXOR T1, B0
- MOVOU B0, (ptx)
-
- LEAQ (16*1)(ptx), ptx
- LEAQ (16*1)(ctx), ctx
-
- JMP gcmAesDecSinglesLoop
+ AESENCLAST X11, X0
+ PXOR X12, X0
+ MOVOU X0, (SI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DX), DX
+ JMP gcmAesDecSinglesLoop
gcmAesDecTail:
+ TESTQ R9, R9
+ JE gcmAesDecDone
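+	// Partial final block: mask off the trailing tag bytes before GHASH
+	// (the 16-byte load runs into the tag), decrypt, and store the plaintext
+	// one byte at a time below.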
+ MOVQ R9, R11
+ SHLQ $0x04, R11
+ LEAQ andMask<>+0(SB), R10
+ MOVOU -16(R10)(R11*1), X12
+ MOVOU (DX), X0
+ PAND X12, X0
+ MOVOU X0, X12
+ PSHUFB X15, X0
+ PXOR X8, X0
+ MOVOU 224(DI), X8
+ MOVOU 240(DI), X10
+ MOVOU X8, X9
+ PCLMULQDQ $0x00, X0, X8
+ PCLMULQDQ $0x11, X0, X9
+ PSHUFD $0x4e, X0, X11
+ PXOR X0, X11
+ PCLMULQDQ $0x00, X11, X10
+ PXOR X8, X10
+ PXOR X9, X10
+ MOVOU X10, X11
+ PSRLDQ $0x08, X10
+ PSLLDQ $0x08, X11
+ PXOR X10, X9
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ MOVOU X14, X11
+ PCLMULQDQ $0x01, X8, X11
+ PSHUFD $0x4e, X8, X8
+ PXOR X11, X8
+ PXOR X9, X8
+ MOVOU (SP), X0
+ ADDL $0x01, R10
+ MOVL R10, R11
+ XORL R12, R11
+ BSWAPL R11
+ MOVL R11, 12(SP)
+ AESENC X1, X0
+ AESENC X2, X0
+ AESENC X3, X0
+ AESENC X4, X0
+ AESENC X5, X0
+ AESENC X6, X0
+ AESENC X7, X0
+ MOVOU 128(AX), X11
+ AESENC X11, X0
+ MOVOU 144(AX), X11
+ AESENC X11, X0
+ MOVOU 160(AX), X11
+ CMPQ R13, $0x0c
+ JB decLast3
+ AESENC X11, X0
+ MOVOU 176(AX), X11
+ AESENC X11, X0
+ MOVOU 192(AX), X11
+ JE decLast3
+ AESENC X11, X0
+ MOVOU 208(AX), X11
+ AESENC X11, X0
+ MOVOU 224(AX), X11
- TESTQ ptxLen, ptxLen
- JE gcmAesDecDone
-
- MOVQ ptxLen, aluTMP
- SHLQ $4, aluTMP
- LEAQ andMask<>(SB), aluCTR
- MOVOU -16(aluCTR)(aluTMP*1), T1
-
- MOVOU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow
- PAND T1, B0
-
- MOVOU B0, T1
- PSHUFB BSWAP, B0
- PXOR ACC0, B0
-
- MOVOU (16*14)(pTbl), ACC0
- MOVOU (16*15)(pTbl), ACCM
- MOVOU ACC0, ACC1
-
- PCLMULQDQ $0x00, B0, ACC0
- PCLMULQDQ $0x11, B0, ACC1
- PSHUFD $78, B0, T0
- PXOR B0, T0
- PCLMULQDQ $0x00, T0, ACCM
-
- PXOR ACC0, ACCM
- PXOR ACC1, ACCM
- MOVOU ACCM, T0
- PSRLDQ $8, ACCM
- PSLLDQ $8, T0
- PXOR ACCM, ACC1
- PXOR T0, ACC0
-
- reduceRound(ACC0)
- reduceRound(ACC0)
- PXOR ACC1, ACC0
-
- MOVOU (0*16)(SP), B0
- increment(0)
- AESENC B1, B0
- AESENC B2, B0
- AESENC B3, B0
- AESENC B4, B0
- AESENC B5, B0
- AESENC B6, B0
- AESENC B7, B0
- MOVOU (16*8)(ks), T0
- AESENC T0, B0
- MOVOU (16*9)(ks), T0
- AESENC T0, B0
- MOVOU (16*10)(ks), T0
- CMPQ NR, $12
- JB decLast3
- AESENC T0, B0
- MOVOU (16*11)(ks), T0
- AESENC T0, B0
- MOVOU (16*12)(ks), T0
- JE decLast3
- AESENC T0, B0
- MOVOU (16*13)(ks), T0
- AESENC T0, B0
- MOVOU (16*14)(ks), T0
decLast3:
- AESENCLAST T0, B0
- PXOR T1, B0
+ AESENCLAST X11, X0
+ PXOR X12, X0
ptxStoreLoop:
- PEXTRB $0, B0, (ptx)
- PSRLDQ $1, B0
- LEAQ 1(ptx), ptx
- DECQ ptxLen
-
- JNE ptxStoreLoop
+ PEXTRB $0x00, X0, (SI)
+ PSRLDQ $0x01, X0
+ LEAQ 1(SI), SI
+ DECQ R9
+ JNE ptxStoreLoop
gcmAesDecDone:
-
- MOVOU ACC0, (tPtr)
+ MOVOU X8, (R8)
RET