From: Garrett Bodley Date: Tue, 23 Jul 2024 03:39:44 +0000 (-0400) Subject: crypto/aes: Avo port of asm_amd64.s X-Git-Tag: go1.24rc1~1007 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=d61b73c1d1a74ef3132c04e5fd389a551dd58cf9;p=gostls13.git crypto/aes: Avo port of asm_amd64.s This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. The reference assembly file does not specify a frame size for some of the defined assembly functions. Avo automatically infers the frame size when generating TEXT directives, leading to a diff on those lines. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="src/crypto/aes/asm_amd64.s" REFERENCE="54fe0fd43fcf8609666c16ae6d15ed92873b1564" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) 1c1 < TEXT .encryptBlockAsm(SB), NOSPLIT, $0 --- > TEXT .encryptBlockAsm(SB), NOSPLIT, $0-32 45c45 < TEXT .decryptBlockAsm(SB), NOSPLIT, $0 --- > TEXT .decryptBlockAsm(SB), NOSPLIT, $0-32 89c89 < TEXT .expandKeyAsm(SB), NOSPLIT, $0 --- > TEXT .expandKeyAsm(SB), NOSPLIT, $0-32 Change-Id: If647584df4137146d355f91ac0f6a8285d07c932 Reviewed-on: https://go-review.googlesource.com/c/go/+/600375 Reviewed-by: Dmitri Shuralyov Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda --- diff --git a/src/cmd/compile/internal/types2/stdlib_test.go b/src/cmd/compile/internal/types2/stdlib_test.go index 7a89556bb3..4d7e9b1ae0 100644 --- a/src/cmd/compile/internal/types2/stdlib_test.go +++ b/src/cmd/compile/internal/types2/stdlib_test.go @@ -356,6 +356,7 @@ var excluded = map[string]bool{ // go.dev/issue/46027: some imports are missing for this submodule. "crypto/aes/_asm/gcm": true, + "crypto/aes/_asm/standard": true, "crypto/internal/bigmod/_asm": true, "crypto/internal/edwards25519/field/_asm": true, "crypto/md5/_asm": true, diff --git a/src/crypto/aes/_asm/standard/asm_amd64.go b/src/crypto/aes/_asm/standard/asm_amd64.go new file mode 100644 index 0000000000..ed23e31097 --- /dev/null +++ b/src/crypto/aes/_asm/standard/asm_amd64.go @@ -0,0 +1,385 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "os" + "strings" + + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" +) + +//go:generate go run . -out ../../asm_amd64.s -pkg aes + +func main() { + Package("crypto/aes") + ConstraintExpr("!purego") + encryptBlockAsm() + decryptBlockAsm() + expandKeyAsm() + _expand_key_128() + _expand_key_192a() + _expand_key_192b() + _expand_key_256a() + _expand_key_256b() + Generate() + + var internalFunctions []string = []string{ + "·_expand_key_128<>", + "·_expand_key_192a<>", + "·_expand_key_192b<>", + "·_expand_key_256a<>", + "·_expand_key_256b<>", + } + removePeskyUnicodeDot(internalFunctions, "../../asm_amd64.s") +} + +func encryptBlockAsm() { + Implement("encryptBlockAsm") + Attributes(NOSPLIT) + AllocLocal(0) + + Load(Param("nr"), RCX) + Load(Param("xk"), RAX) + Load(Param("dst"), RDX) + Load(Param("src"), RBX) + MOVUPS(Mem{Base: AX}.Offset(0), X1) + MOVUPS(Mem{Base: BX}.Offset(0), X0) + ADDQ(Imm(16), RAX) + PXOR(X1, X0) + SUBQ(Imm(12), RCX) + JE(LabelRef("Lenc192")) + JB(LabelRef("Lenc128")) + + Label("Lenc256") + MOVUPS(Mem{Base: AX}.Offset(0), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(16), X1) + AESENC(X1, X0) + ADDQ(Imm(32), RAX) + + Label("Lenc192") + MOVUPS(Mem{Base: AX}.Offset(0), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(16), X1) + AESENC(X1, X0) + ADDQ(Imm(32), RAX) + + Label("Lenc128") + MOVUPS(Mem{Base: AX}.Offset(0), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(16), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(32), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(48), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(64), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(80), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(96), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(112), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(128), X1) + AESENC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(144), X1) + AESENCLAST(X1, X0) + MOVUPS(X0, Mem{Base: DX}.Offset(0)) + RET() +} + +func decryptBlockAsm() { + Implement("decryptBlockAsm") + Attributes(NOSPLIT) + AllocLocal(0) + + Load(Param("nr"), RCX) + Load(Param("xk"), RAX) + Load(Param("dst"), RDX) + Load(Param("src"), RBX) + + MOVUPS(Mem{Base: AX}.Offset(0), X1) + MOVUPS(Mem{Base: BX}.Offset(0), X0) + ADDQ(Imm(16), RAX) + PXOR(X1, X0) + SUBQ(Imm(12), RCX) + JE(LabelRef("Ldec192")) + JB(LabelRef("Ldec128")) + + Label("Ldec256") + MOVUPS(Mem{Base: AX}.Offset(0), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(16), X1) + AESDEC(X1, X0) + ADDQ(Imm(32), RAX) + + Label("Ldec192") + MOVUPS(Mem{Base: AX}.Offset(0), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(16), X1) + AESDEC(X1, X0) + ADDQ(Imm(32), RAX) + + Label("Ldec128") + MOVUPS(Mem{Base: AX}.Offset(0), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(16), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(32), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(48), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(64), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(80), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(96), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(112), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(128), X1) + AESDEC(X1, X0) + MOVUPS(Mem{Base: AX}.Offset(144), X1) + AESDECLAST(X1, X0) + MOVUPS(X0, Mem{Base: DX}.Offset(0)) + RET() +} + +// Note that round keys are stored in uint128 format, not uint32 +func expandKeyAsm() { + Implement("expandKeyAsm") + Attributes(NOSPLIT) + AllocLocal(0) + + Load(Param("nr"), RCX) + Load(Param("key"), RAX) + Load(Param("enc"), RBX) + Load(Param("dec"), RDX) + + MOVUPS(Mem{Base: AX}, X0) + Comment("enc") + MOVUPS(X0, Mem{Base: BX}) + ADDQ(Imm(16), RBX) + PXOR(X4, X4) // _expand_key_* expect X4 to be zero + CMPL(ECX, Imm(12)) + JE(LabelRef("Lexp_enc192")) + JB(LabelRef("Lexp_enc128")) + + Lexp_enc256() + Lexp_enc192() + Lexp_enc128() + Lexp_dec() + Lexp_dec_loop() +} + +func Lexp_enc256() { + Label("Lexp_enc256") + MOVUPS(Mem{Base: AX}.Offset(16), X2) + MOVUPS(X2, Mem{Base: BX}) + ADDQ(Imm(16), RBX) + + var rcon uint64 = 1 + for i := 0; i < 6; i++ { + AESKEYGENASSIST(Imm(rcon), X2, X1) + CALL(LabelRef("_expand_key_256a<>(SB)")) + AESKEYGENASSIST(Imm(rcon), X0, X1) + CALL(LabelRef("_expand_key_256b<>(SB)")) + rcon <<= 1 + } + AESKEYGENASSIST(Imm(0x40), X2, X1) + CALL(LabelRef("_expand_key_256a<>(SB)")) + JMP(LabelRef("Lexp_dec")) +} + +func Lexp_enc192() { + Label("Lexp_enc192") + MOVQ(Mem{Base: AX}.Offset(16), X2) + + var rcon uint64 = 1 + for i := 0; i < 8; i++ { + AESKEYGENASSIST(Imm(rcon), X2, X1) + if i%2 == 0 { + CALL(LabelRef("_expand_key_192a<>(SB)")) + } else { + CALL(LabelRef("_expand_key_192b<>(SB)")) + } + rcon <<= 1 + } + JMP(LabelRef("Lexp_dec")) +} + +func Lexp_enc128() { + Label("Lexp_enc128") + var rcon uint64 = 1 + for i := 0; i < 8; i++ { + AESKEYGENASSIST(Imm(rcon), X0, X1) + CALL(LabelRef("_expand_key_128<>(SB)")) + rcon <<= 1 + } + AESKEYGENASSIST(Imm(0x1b), X0, X1) + CALL(LabelRef("_expand_key_128<>(SB)")) + AESKEYGENASSIST(Imm(0x36), X0, X1) + CALL(LabelRef("_expand_key_128<>(SB)")) +} + +func Lexp_dec() { + Label("Lexp_dec") + Comment("dec") + SUBQ(Imm(16), RBX) + MOVUPS(Mem{Base: BX}, X1) + MOVUPS(X1, Mem{Base: DX}) + DECQ(RCX) +} + +func Lexp_dec_loop() { + Label("Lexp_dec_loop") + MOVUPS(Mem{Base: BX}.Offset(-16), X1) + AESIMC(X1, X0) + MOVUPS(X0, Mem{Base: DX}.Offset(16)) + SUBQ(Imm(16), RBX) + ADDQ(Imm(16), RDX) + DECQ(RCX) + JNZ(LabelRef("Lexp_dec_loop")) + MOVUPS(Mem{Base: BX}.Offset(-16), X0) + MOVUPS(X0, Mem{Base: DX}.Offset(16)) + RET() +} + +func _expand_key_128() { + Function("_expand_key_128<>") + Attributes(NOSPLIT) + AllocLocal(0) + + PSHUFD(Imm(0xff), X1, X1) + SHUFPS(Imm(0x10), X0, X4) + PXOR(X4, X0) + SHUFPS(Imm(0x8c), X0, X4) + PXOR(X4, X0) + PXOR(X1, X0) + MOVUPS(X0, Mem{Base: BX}) + ADDQ(Imm(16), RBX) + RET() +} + +func _expand_key_192a() { + Function("_expand_key_192a<>") + Attributes(NOSPLIT) + AllocLocal(0) + + PSHUFD(Imm(0x55), X1, X1) + SHUFPS(Imm(0x10), X0, X4) + PXOR(X4, X0) + SHUFPS(Imm(0x8c), X0, X4) + PXOR(X4, X0) + PXOR(X1, X0) + + MOVAPS(X2, X5) + MOVAPS(X2, X6) + PSLLDQ(Imm(0x4), X5) + PSHUFD(Imm(0xff), X0, X3) + PXOR(X3, X2) + PXOR(X5, X2) + + MOVAPS(X0, X1) + SHUFPS(Imm(0x44), X0, X6) + MOVUPS(X6, Mem{Base: BX}) + SHUFPS(Imm(0x4e), X2, X1) + MOVUPS(X1, Mem{Base: BX}.Offset(16)) + ADDQ(Imm(32), RBX) + RET() +} + +func _expand_key_192b() { + Function("_expand_key_192b<>") + Attributes(NOSPLIT) + AllocLocal(0) + + PSHUFD(Imm(0x55), X1, X1) + SHUFPS(Imm(0x10), X0, X4) + PXOR(X4, X0) + SHUFPS(Imm(0x8c), X0, X4) + PXOR(X4, X0) + PXOR(X1, X0) + + MOVAPS(X2, X5) + PSLLDQ(Imm(0x4), X5) + PSHUFD(Imm(0xff), X0, X3) + PXOR(X3, X2) + PXOR(X5, X2) + + MOVUPS(X0, Mem{Base: BX}) + ADDQ(Imm(16), RBX) + RET() +} + +func _expand_key_256a() { + Function("_expand_key_256a<>") + Attributes(NOSPLIT) + AllocLocal(0) + + // Hack to get Avo to emit: + // JMP _expand_key_128<>(SB) + Instruction(&ir.Instruction{ + Opcode: "JMP", + Operands: []Op{ + LabelRef("_expand_key_128<>(SB)"), + }, + }) +} + +func _expand_key_256b() { + Function("_expand_key_256b<>") + Attributes(NOSPLIT) + AllocLocal(0) + + PSHUFD(Imm(0xaa), X1, X1) + SHUFPS(Imm(0x10), X2, X4) + PXOR(X4, X2) + SHUFPS(Imm(0x8c), X2, X4) + PXOR(X4, X2) + PXOR(X1, X2) + + MOVUPS(X2, Mem{Base: BX}) + ADDQ(Imm(16), RBX) + RET() +} + +const ThatPeskyUnicodeDot = "\u00b7" + +// removePeskyUnicodeDot strips the dot from the relevant TEXT directives such that they +// can exist as internal assembly functions +// +// Avo v0.6.0 does not support the generation of internal assembly functions. Go's unicode +// dot tells the compiler to link a TEXT symbol to a function in the current Go package +// (or another package if specified). Avo unconditionally prepends the unicode dot to all +// TEXT symbols, making it impossible to emit an internal function without this hack. +// +// There is a pending PR to add internal functions to Avo: +// https://github.com/mmcloughlin/avo/pull/443 +// +// If merged it should allow the usage of InternalFunction("NAME") for the specified functions +func removePeskyUnicodeDot(internalFunctions []string, target string) { + bytes, err := os.ReadFile(target) + if err != nil { + panic(err) + } + + content := string(bytes) + + for _, from := range internalFunctions { + to := strings.ReplaceAll(from, ThatPeskyUnicodeDot, "") + content = strings.ReplaceAll(content, from, to) + } + + err = os.WriteFile(target, []byte(content), 0644) + if err != nil { + panic(err) + } +} diff --git a/src/crypto/aes/_asm/standard/go.mod b/src/crypto/aes/_asm/standard/go.mod new file mode 100644 index 0000000000..f9382a9780 --- /dev/null +++ b/src/crypto/aes/_asm/standard/go.mod @@ -0,0 +1,11 @@ +module std/crypto/aes/_asm/standard + +go 1.24 + +require github.com/mmcloughlin/avo v0.6.0 + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/src/crypto/aes/_asm/standard/go.sum b/src/crypto/aes/_asm/standard/go.sum new file mode 100644 index 0000000000..76af484b2e --- /dev/null +++ b/src/crypto/aes/_asm/standard/go.sum @@ -0,0 +1,8 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/src/crypto/aes/asm_amd64.s b/src/crypto/aes/asm_amd64.s index d5e17401ea..d88ccbf765 100644 --- a/src/crypto/aes/asm_amd64.s +++ b/src/crypto/aes/asm_amd64.s @@ -1,276 +1,286 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run asm_amd64.go -out ../../asm_amd64.s -pkg aes. DO NOT EDIT. //go:build !purego #include "textflag.h" -// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) -TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 - MOVQ nr+0(FP), CX - MOVQ xk+8(FP), AX - MOVQ dst+16(FP), DX - MOVQ src+24(FP), BX - MOVUPS 0(AX), X1 - MOVUPS 0(BX), X0 - ADDQ $16, AX - PXOR X1, X0 - SUBQ $12, CX - JE Lenc192 - JB Lenc128 -Lenc256: - MOVUPS 0(AX), X1 +// func encryptBlockAsm(nr int, xk *uint32, dst *byte, src *byte) +// Requires: AES, SSE, SSE2 +TEXT ·encryptBlockAsm(SB), NOSPLIT, $0-32 + MOVQ nr+0(FP), CX + MOVQ xk+8(FP), AX + MOVQ dst+16(FP), DX + MOVQ src+24(FP), BX + MOVUPS (AX), X1 + MOVUPS (BX), X0 + ADDQ $0x10, AX + PXOR X1, X0 + SUBQ $0x0c, CX + JE Lenc192 + JB Lenc128 + MOVUPS (AX), X1 AESENC X1, X0 MOVUPS 16(AX), X1 AESENC X1, X0 - ADDQ $32, AX + ADDQ $0x20, AX + Lenc192: - MOVUPS 0(AX), X1 + MOVUPS (AX), X1 AESENC X1, X0 MOVUPS 16(AX), X1 AESENC X1, X0 - ADDQ $32, AX + ADDQ $0x20, AX + Lenc128: - MOVUPS 0(AX), X1 - AESENC X1, X0 - MOVUPS 16(AX), X1 - AESENC X1, X0 - MOVUPS 32(AX), X1 - AESENC X1, X0 - MOVUPS 48(AX), X1 - AESENC X1, X0 - MOVUPS 64(AX), X1 - AESENC X1, X0 - MOVUPS 80(AX), X1 - AESENC X1, X0 - MOVUPS 96(AX), X1 - AESENC X1, X0 - MOVUPS 112(AX), X1 - AESENC X1, X0 - MOVUPS 128(AX), X1 - AESENC X1, X0 - MOVUPS 144(AX), X1 + MOVUPS (AX), X1 + AESENC X1, X0 + MOVUPS 16(AX), X1 + AESENC X1, X0 + MOVUPS 32(AX), X1 + AESENC X1, X0 + MOVUPS 48(AX), X1 + AESENC X1, X0 + MOVUPS 64(AX), X1 + AESENC X1, X0 + MOVUPS 80(AX), X1 + AESENC X1, X0 + MOVUPS 96(AX), X1 + AESENC X1, X0 + MOVUPS 112(AX), X1 + AESENC X1, X0 + MOVUPS 128(AX), X1 + AESENC X1, X0 + MOVUPS 144(AX), X1 AESENCLAST X1, X0 - MOVUPS X0, 0(DX) + MOVUPS X0, (DX) RET -// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) -TEXT ·decryptBlockAsm(SB),NOSPLIT,$0 - MOVQ nr+0(FP), CX - MOVQ xk+8(FP), AX - MOVQ dst+16(FP), DX - MOVQ src+24(FP), BX - MOVUPS 0(AX), X1 - MOVUPS 0(BX), X0 - ADDQ $16, AX - PXOR X1, X0 - SUBQ $12, CX - JE Ldec192 - JB Ldec128 -Ldec256: - MOVUPS 0(AX), X1 +// func decryptBlockAsm(nr int, xk *uint32, dst *byte, src *byte) +// Requires: AES, SSE, SSE2 +TEXT ·decryptBlockAsm(SB), NOSPLIT, $0-32 + MOVQ nr+0(FP), CX + MOVQ xk+8(FP), AX + MOVQ dst+16(FP), DX + MOVQ src+24(FP), BX + MOVUPS (AX), X1 + MOVUPS (BX), X0 + ADDQ $0x10, AX + PXOR X1, X0 + SUBQ $0x0c, CX + JE Ldec192 + JB Ldec128 + MOVUPS (AX), X1 AESDEC X1, X0 MOVUPS 16(AX), X1 AESDEC X1, X0 - ADDQ $32, AX + ADDQ $0x20, AX + Ldec192: - MOVUPS 0(AX), X1 + MOVUPS (AX), X1 AESDEC X1, X0 MOVUPS 16(AX), X1 AESDEC X1, X0 - ADDQ $32, AX + ADDQ $0x20, AX + Ldec128: - MOVUPS 0(AX), X1 - AESDEC X1, X0 - MOVUPS 16(AX), X1 - AESDEC X1, X0 - MOVUPS 32(AX), X1 - AESDEC X1, X0 - MOVUPS 48(AX), X1 - AESDEC X1, X0 - MOVUPS 64(AX), X1 - AESDEC X1, X0 - MOVUPS 80(AX), X1 - AESDEC X1, X0 - MOVUPS 96(AX), X1 - AESDEC X1, X0 - MOVUPS 112(AX), X1 - AESDEC X1, X0 - MOVUPS 128(AX), X1 - AESDEC X1, X0 - MOVUPS 144(AX), X1 + MOVUPS (AX), X1 + AESDEC X1, X0 + MOVUPS 16(AX), X1 + AESDEC X1, X0 + MOVUPS 32(AX), X1 + AESDEC X1, X0 + MOVUPS 48(AX), X1 + AESDEC X1, X0 + MOVUPS 64(AX), X1 + AESDEC X1, X0 + MOVUPS 80(AX), X1 + AESDEC X1, X0 + MOVUPS 96(AX), X1 + AESDEC X1, X0 + MOVUPS 112(AX), X1 + AESDEC X1, X0 + MOVUPS 128(AX), X1 + AESDEC X1, X0 + MOVUPS 144(AX), X1 AESDECLAST X1, X0 - MOVUPS X0, 0(DX) + MOVUPS X0, (DX) RET -// func expandKeyAsm(nr int, key *byte, enc, dec *uint32) { -// Note that round keys are stored in uint128 format, not uint32 -TEXT ·expandKeyAsm(SB),NOSPLIT,$0 - MOVQ nr+0(FP), CX - MOVQ key+8(FP), AX - MOVQ enc+16(FP), BX - MOVQ dec+24(FP), DX +// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32) +// Requires: AES, SSE, SSE2 +TEXT ·expandKeyAsm(SB), NOSPLIT, $0-32 + MOVQ nr+0(FP), CX + MOVQ key+8(FP), AX + MOVQ enc+16(FP), BX + MOVQ dec+24(FP), DX MOVUPS (AX), X0 + // enc - MOVUPS X0, (BX) - ADDQ $16, BX - PXOR X4, X4 // _expand_key_* expect X4 to be zero - CMPL CX, $12 - JE Lexp_enc192 - JB Lexp_enc128 -Lexp_enc256: - MOVUPS 16(AX), X2 - MOVUPS X2, (BX) - ADDQ $16, BX + MOVUPS X0, (BX) + ADDQ $0x10, BX + PXOR X4, X4 + CMPL CX, $0x0c + JE Lexp_enc192 + JB Lexp_enc128 + MOVUPS 16(AX), X2 + MOVUPS X2, (BX) + ADDQ $0x10, BX AESKEYGENASSIST $0x01, X2, X1 - CALL _expand_key_256a<>(SB) + CALL _expand_key_256a<>(SB) AESKEYGENASSIST $0x01, X0, X1 - CALL _expand_key_256b<>(SB) + CALL _expand_key_256b<>(SB) AESKEYGENASSIST $0x02, X2, X1 - CALL _expand_key_256a<>(SB) + CALL _expand_key_256a<>(SB) AESKEYGENASSIST $0x02, X0, X1 - CALL _expand_key_256b<>(SB) + CALL _expand_key_256b<>(SB) AESKEYGENASSIST $0x04, X2, X1 - CALL _expand_key_256a<>(SB) + CALL _expand_key_256a<>(SB) AESKEYGENASSIST $0x04, X0, X1 - CALL _expand_key_256b<>(SB) + CALL _expand_key_256b<>(SB) AESKEYGENASSIST $0x08, X2, X1 - CALL _expand_key_256a<>(SB) + CALL _expand_key_256a<>(SB) AESKEYGENASSIST $0x08, X0, X1 - CALL _expand_key_256b<>(SB) + CALL _expand_key_256b<>(SB) AESKEYGENASSIST $0x10, X2, X1 - CALL _expand_key_256a<>(SB) + CALL _expand_key_256a<>(SB) AESKEYGENASSIST $0x10, X0, X1 - CALL _expand_key_256b<>(SB) + CALL _expand_key_256b<>(SB) AESKEYGENASSIST $0x20, X2, X1 - CALL _expand_key_256a<>(SB) + CALL _expand_key_256a<>(SB) AESKEYGENASSIST $0x20, X0, X1 - CALL _expand_key_256b<>(SB) + CALL _expand_key_256b<>(SB) AESKEYGENASSIST $0x40, X2, X1 - CALL _expand_key_256a<>(SB) - JMP Lexp_dec + CALL _expand_key_256a<>(SB) + JMP Lexp_dec + Lexp_enc192: - MOVQ 16(AX), X2 + MOVQ 16(AX), X2 AESKEYGENASSIST $0x01, X2, X1 - CALL _expand_key_192a<>(SB) + CALL _expand_key_192a<>(SB) AESKEYGENASSIST $0x02, X2, X1 - CALL _expand_key_192b<>(SB) + CALL _expand_key_192b<>(SB) AESKEYGENASSIST $0x04, X2, X1 - CALL _expand_key_192a<>(SB) + CALL _expand_key_192a<>(SB) AESKEYGENASSIST $0x08, X2, X1 - CALL _expand_key_192b<>(SB) + CALL _expand_key_192b<>(SB) AESKEYGENASSIST $0x10, X2, X1 - CALL _expand_key_192a<>(SB) + CALL _expand_key_192a<>(SB) AESKEYGENASSIST $0x20, X2, X1 - CALL _expand_key_192b<>(SB) + CALL _expand_key_192b<>(SB) AESKEYGENASSIST $0x40, X2, X1 - CALL _expand_key_192a<>(SB) + CALL _expand_key_192a<>(SB) AESKEYGENASSIST $0x80, X2, X1 - CALL _expand_key_192b<>(SB) - JMP Lexp_dec + CALL _expand_key_192b<>(SB) + JMP Lexp_dec + Lexp_enc128: AESKEYGENASSIST $0x01, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) AESKEYGENASSIST $0x02, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) AESKEYGENASSIST $0x04, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) AESKEYGENASSIST $0x08, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) AESKEYGENASSIST $0x10, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) AESKEYGENASSIST $0x20, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) AESKEYGENASSIST $0x40, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) AESKEYGENASSIST $0x80, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) AESKEYGENASSIST $0x1b, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) AESKEYGENASSIST $0x36, X0, X1 - CALL _expand_key_128<>(SB) + CALL _expand_key_128<>(SB) + Lexp_dec: // dec - SUBQ $16, BX + SUBQ $0x10, BX MOVUPS (BX), X1 MOVUPS X1, (DX) - DECQ CX + DECQ CX + Lexp_dec_loop: MOVUPS -16(BX), X1 AESIMC X1, X0 MOVUPS X0, 16(DX) - SUBQ $16, BX - ADDQ $16, DX - DECQ CX - JNZ Lexp_dec_loop + SUBQ $0x10, BX + ADDQ $0x10, DX + DECQ CX + JNZ Lexp_dec_loop MOVUPS -16(BX), X0 MOVUPS X0, 16(DX) RET -TEXT _expand_key_128<>(SB),NOSPLIT,$0 +// func _expand_key_128<>() +// Requires: SSE, SSE2 +TEXT _expand_key_128<>(SB), NOSPLIT, $0 PSHUFD $0xff, X1, X1 SHUFPS $0x10, X0, X4 - PXOR X4, X0 + PXOR X4, X0 SHUFPS $0x8c, X0, X4 - PXOR X4, X0 - PXOR X1, X0 + PXOR X4, X0 + PXOR X1, X0 MOVUPS X0, (BX) - ADDQ $16, BX + ADDQ $0x10, BX RET -TEXT _expand_key_192a<>(SB),NOSPLIT,$0 +// func _expand_key_192a<>() +// Requires: SSE, SSE2 +TEXT _expand_key_192a<>(SB), NOSPLIT, $0 PSHUFD $0x55, X1, X1 SHUFPS $0x10, X0, X4 - PXOR X4, X0 + PXOR X4, X0 SHUFPS $0x8c, X0, X4 - PXOR X4, X0 - PXOR X1, X0 - + PXOR X4, X0 + PXOR X1, X0 MOVAPS X2, X5 MOVAPS X2, X6 - PSLLDQ $0x4, X5 + PSLLDQ $0x04, X5 PSHUFD $0xff, X0, X3 - PXOR X3, X2 - PXOR X5, X2 - + PXOR X3, X2 + PXOR X5, X2 MOVAPS X0, X1 SHUFPS $0x44, X0, X6 MOVUPS X6, (BX) SHUFPS $0x4e, X2, X1 MOVUPS X1, 16(BX) - ADDQ $32, BX + ADDQ $0x20, BX RET -TEXT _expand_key_192b<>(SB),NOSPLIT,$0 +// func _expand_key_192b<>() +// Requires: SSE, SSE2 +TEXT _expand_key_192b<>(SB), NOSPLIT, $0 PSHUFD $0x55, X1, X1 SHUFPS $0x10, X0, X4 - PXOR X4, X0 + PXOR X4, X0 SHUFPS $0x8c, X0, X4 - PXOR X4, X0 - PXOR X1, X0 - + PXOR X4, X0 + PXOR X1, X0 MOVAPS X2, X5 - PSLLDQ $0x4, X5 + PSLLDQ $0x04, X5 PSHUFD $0xff, X0, X3 - PXOR X3, X2 - PXOR X5, X2 - + PXOR X3, X2 + PXOR X5, X2 MOVUPS X0, (BX) - ADDQ $16, BX + ADDQ $0x10, BX RET -TEXT _expand_key_256a<>(SB),NOSPLIT,$0 +// func _expand_key_256a<>() +TEXT _expand_key_256a<>(SB), NOSPLIT, $0 JMP _expand_key_128<>(SB) -TEXT _expand_key_256b<>(SB),NOSPLIT,$0 +// func _expand_key_256b<>() +// Requires: SSE, SSE2 +TEXT _expand_key_256b<>(SB), NOSPLIT, $0 PSHUFD $0xaa, X1, X1 SHUFPS $0x10, X2, X4 - PXOR X4, X2 + PXOR X4, X2 SHUFPS $0x8c, X2, X4 - PXOR X4, X2 - PXOR X1, X2 - + PXOR X4, X2 + PXOR X1, X2 MOVUPS X2, (BX) - ADDQ $16, BX + ADDQ $0x10, BX RET diff --git a/src/go/types/stdlib_test.go b/src/go/types/stdlib_test.go index c98d67e114..549eeba8f3 100644 --- a/src/go/types/stdlib_test.go +++ b/src/go/types/stdlib_test.go @@ -358,6 +358,7 @@ var excluded = map[string]bool{ // See go.dev/issue/46027: some imports are missing for this submodule. "crypto/aes/_asm/gcm": true, + "crypto/aes/_asm/standard": true, "crypto/internal/bigmod/_asm": true, "crypto/internal/edwards25519/field/_asm": true, "crypto/md5/_asm": true,