From: Shenghou Ma Date: Wed, 26 Sep 2012 17:54:10 +0000 (+0800) Subject: crypto/aes: speed up using AES-NI on amd64 X-Git-Tag: go1.1rc2~2313 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=948db4e0919c7ae5553f5ba727bdae1d19fbf8c0;p=gostls13.git crypto/aes: speed up using AES-NI on amd64 This CL requires CL 5970055. benchmark old ns/op new ns/op delta BenchmarkEncrypt 161 23 -85.71% BenchmarkDecrypt 158 24 -84.24% BenchmarkExpand 526 62 -88.21% benchmark old MB/s new MB/s speedup BenchmarkEncrypt 99.32 696.19 7.01x BenchmarkDecrypt 100.93 641.56 6.36x R=golang-dev, bradfitz, dave, rsc CC=golang-dev https://golang.org/cl/6549055 --- diff --git a/src/pkg/crypto/aes/aes_test.go b/src/pkg/crypto/aes/aes_test.go index c30ccf3f18..6261dd09fb 100644 --- a/src/pkg/crypto/aes/aes_test.go +++ b/src/pkg/crypto/aes/aes_test.go @@ -221,7 +221,10 @@ L: if tt.dec != nil { dec = make([]uint32, len(tt.dec)) } - expandKey(tt.key, enc, dec) + // This test could only test Go version of expandKey because asm + // version might use different memory layout for expanded keys + // This is OK because we don't expose expanded keys to the outside + expandKeyGo(tt.key, enc, dec) for j, v := range enc { if v != tt.enc[j] { t.Errorf("key %d: enc[%d] = %#x, want %#x", i, j, v, tt.enc[j]) diff --git a/src/pkg/crypto/aes/asm_amd64.s b/src/pkg/crypto/aes/asm_amd64.s new file mode 100644 index 0000000000..25decf9785 --- /dev/null +++ b/src/pkg/crypto/aes/asm_amd64.s @@ -0,0 +1,287 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// func hasAsm() bool +// returns whether AES-NI is supported +TEXT ·hasAsm(SB),7,$0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $25, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) +TEXT ·encryptBlockAsm(SB),7,$0 + MOVQ nr+0(FP), CX + MOVQ xk+8(FP), AX + MOVQ dst+16(FP), DX + MOVQ src+24(FP), BX + MOVUPS 0(AX), X1 + MOVUPS 0(BX), X0 + ADDQ $16, AX + PXOR X1, X0 + SUBQ $12, CX + JE Lenc196 + JB Lenc128 +Lenc256: + MOVUPS 0(AX), X1 + AESENC X1, X0 + MOVUPS 16(AX), X1 + AESENC X1, X0 + ADDQ $32, AX +Lenc196: + MOVUPS 0(AX), X1 + AESENC X1, X0 + MOVUPS 16(AX), X1 + AESENC X1, X0 + ADDQ $32, AX +Lenc128: + MOVUPS 0(AX), X1 + AESENC X1, X0 + MOVUPS 16(AX), X1 + AESENC X1, X0 + MOVUPS 32(AX), X1 + AESENC X1, X0 + MOVUPS 48(AX), X1 + AESENC X1, X0 + MOVUPS 64(AX), X1 + AESENC X1, X0 + MOVUPS 80(AX), X1 + AESENC X1, X0 + MOVUPS 96(AX), X1 + AESENC X1, X0 + MOVUPS 112(AX), X1 + AESENC X1, X0 + MOVUPS 128(AX), X1 + AESENC X1, X0 + MOVUPS 144(AX), X1 + AESENCLAST X1, X0 + MOVUPS X0, 0(DX) + RET + +// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) +TEXT ·decryptBlockAsm(SB),7,$0 + MOVQ nr+0(FP), CX + MOVQ xk+8(FP), AX + MOVQ dst+16(FP), DX + MOVQ src+24(FP), BX + MOVUPS 0(AX), X1 + MOVUPS 0(BX), X0 + ADDQ $16, AX + PXOR X1, X0 + SUBQ $12, CX + JE Ldec196 + JB Ldec128 +Ldec256: + MOVUPS 0(AX), X1 + AESDEC X1, X0 + MOVUPS 16(AX), X1 + AESDEC X1, X0 + ADDQ $32, AX +Ldec196: + MOVUPS 0(AX), X1 + AESDEC X1, X0 + MOVUPS 16(AX), X1 + AESDEC X1, X0 + ADDQ $32, AX +Ldec128: + MOVUPS 0(AX), X1 + AESDEC X1, X0 + MOVUPS 16(AX), X1 + AESDEC X1, X0 + MOVUPS 32(AX), X1 + AESDEC X1, X0 + MOVUPS 48(AX), X1 + AESDEC X1, X0 + MOVUPS 64(AX), X1 + AESDEC X1, X0 + MOVUPS 80(AX), X1 + AESDEC X1, X0 + MOVUPS 96(AX), X1 + AESDEC X1, X0 + MOVUPS 112(AX), X1 + AESDEC X1, X0 + MOVUPS 128(AX), X1 + AESDEC X1, X0 + MOVUPS 144(AX), X1 + AESDECLAST X1, X0 + MOVUPS X0, 0(DX) + RET + +// func expandKeyAsm(nr int, key *byte, enc, dec *uint32) { +// Note that round keys are stored in uint128 format, not uint32 +TEXT ·expandKeyAsm(SB),7,$0 + MOVQ nr+0(FP), CX + MOVQ key+8(FP), AX + MOVQ enc+16(FP), BX + MOVQ dec+24(FP), DX + MOVUPS (AX), X0 + // enc + MOVUPS X0, (BX) + ADDQ $16, BX + PXOR X4, X4 // _expand_key_* expect X4 to be zero + CMPL CX, $12 + JE Lexp_enc196 + JB Lexp_enc128 +Lexp_enc256: + MOVUPS 16(AX), X2 + MOVUPS X2, (BX) + ADDQ $16, BX + AESKEYGENASSIST $0x01, X2, X1 + CALL _expand_key_256a<>(SB) + AESKEYGENASSIST $0x01, X0, X1 + CALL _expand_key_256b<>(SB) + AESKEYGENASSIST $0x02, X2, X1 + CALL _expand_key_256a<>(SB) + AESKEYGENASSIST $0x02, X0, X1 + CALL _expand_key_256b<>(SB) + AESKEYGENASSIST $0x04, X2, X1 + CALL _expand_key_256a<>(SB) + AESKEYGENASSIST $0x04, X0, X1 + CALL _expand_key_256b<>(SB) + AESKEYGENASSIST $0x08, X2, X1 + CALL _expand_key_256a<>(SB) + AESKEYGENASSIST $0x08, X0, X1 + CALL _expand_key_256b<>(SB) + AESKEYGENASSIST $0x10, X2, X1 + CALL _expand_key_256a<>(SB) + AESKEYGENASSIST $0x10, X0, X1 + CALL _expand_key_256b<>(SB) + AESKEYGENASSIST $0x20, X2, X1 + CALL _expand_key_256a<>(SB) + AESKEYGENASSIST $0x20, X0, X1 + CALL _expand_key_256b<>(SB) + AESKEYGENASSIST $0x40, X2, X1 + CALL _expand_key_256a<>(SB) + JMP Lexp_dec +Lexp_enc196: + MOVQ 16(AX), X2 + AESKEYGENASSIST $0x01, X2, X1 + CALL _expand_key_192a<>(SB) + AESKEYGENASSIST $0x02, X2, X1 + CALL _expand_key_192b<>(SB) + AESKEYGENASSIST $0x04, X2, X1 + CALL _expand_key_192a<>(SB) + AESKEYGENASSIST $0x08, X2, X1 + CALL _expand_key_192b<>(SB) + AESKEYGENASSIST $0x10, X2, X1 + CALL _expand_key_192a<>(SB) + AESKEYGENASSIST $0x20, X2, X1 + CALL _expand_key_192b<>(SB) + AESKEYGENASSIST $0x40, X2, X1 + CALL _expand_key_192a<>(SB) + AESKEYGENASSIST $0x80, X2, X1 + CALL _expand_key_192b<>(SB) + JMP Lexp_dec +Lexp_enc128: + AESKEYGENASSIST $0x01, X0, X1 + CALL _expand_key_128<>(SB) + AESKEYGENASSIST $0x02, X0, X1 + CALL _expand_key_128<>(SB) + AESKEYGENASSIST $0x04, X0, X1 + CALL _expand_key_128<>(SB) + AESKEYGENASSIST $0x08, X0, X1 + CALL _expand_key_128<>(SB) + AESKEYGENASSIST $0x10, X0, X1 + CALL _expand_key_128<>(SB) + AESKEYGENASSIST $0x20, X0, X1 + CALL _expand_key_128<>(SB) + AESKEYGENASSIST $0x40, X0, X1 + CALL _expand_key_128<>(SB) + AESKEYGENASSIST $0x80, X0, X1 + CALL _expand_key_128<>(SB) + AESKEYGENASSIST $0x1b, X0, X1 + CALL _expand_key_128<>(SB) + AESKEYGENASSIST $0x36, X0, X1 + CALL _expand_key_128<>(SB) +Lexp_dec: + // dec + SUBQ $16, BX + MOVUPS (BX), X1 + MOVUPS X1, (DX) + DECQ CX +Lexp_dec_loop: + MOVUPS -16(BX), X1 + AESIMC X1, X0 + MOVUPS X0, 16(DX) + SUBQ $16, BX + ADDQ $16, DX + DECQ CX + JNZ Lexp_dec_loop + MOVUPS -16(BX), X0 + MOVUPS X0, 16(DX) + RET + +#define PSHUFD_X0_X0_ BYTE $0x66; BYTE $0x0f; BYTE $0x70; BYTE $0xc0 +#define PSHUFD_X1_X1_ BYTE $0x66; BYTE $0x0f; BYTE $0x70; BYTE $0xc9 +TEXT _expand_key_128<>(SB),7,$0 + PSHUFD $0xff, X1, X1 + SHUFPS $0x10, X0, X4 + PXOR X4, X0 + SHUFPS $0x8c, X0, X4 + PXOR X4, X0 + PXOR X1, X0 + MOVUPS X0, (BX) + ADDQ $16, BX + RET + +#define PSLLDQ_X5_ BYTE $0x66; BYTE $0x0f; BYTE $0x73; BYTE $0xfd +#define PSHUFD_X0_X3_ BYTE $0x66; BYTE $0x0f; BYTE $0x70; BYTE $0xd8 +TEXT _expand_key_192a<>(SB),7,$0 + PSHUFD $0x55, X1, X1 + SHUFPS $0x10, X0, X4 + PXOR X4, X0 + SHUFPS $0x8c, X0, X4 + PXOR X4, X0 + PXOR X1, X0 + + MOVAPS X2, X5 + MOVAPS X2, X6 + PSLLDQ_X5_; BYTE $0x4 + PSHUFD $0xff, X0, X3 + PXOR X3, X2 + PXOR X5, X2 + + MOVAPS X0, X1 + SHUFPS $0x44, X0, X6 + MOVUPS X6, (BX) + SHUFPS $0x4e, X2, X1 + MOVUPS X1, 16(BX) + ADDQ $32, BX + RET + +TEXT _expand_key_192b<>(SB),7,$0 + PSHUFD $0x55, X1, X1 + SHUFPS $0x10, X0, X4 + PXOR X4, X0 + SHUFPS $0x8c, X0, X4 + PXOR X4, X0 + PXOR X1, X0 + + MOVAPS X2, X5 + PSLLDQ_X5_; BYTE $0x4 + PSHUFD $0xff, X0, X3 + PXOR X3, X2 + PXOR X5, X2 + + MOVUPS X0, (BX) + ADDQ $16, BX + RET + +TEXT _expand_key_256a<>(SB),7,$0 + JMP _expand_key_128<>(SB) + +TEXT _expand_key_256b<>(SB),7,$0 + PSHUFD $0xaa, X1, X1 + SHUFPS $0x10, X2, X4 + PXOR X4, X2 + SHUFPS $0x8c, X2, X4 + PXOR X4, X2 + PXOR X1, X2 + + MOVUPS X2, (BX) + ADDQ $16, BX + RET diff --git a/src/pkg/crypto/aes/block.go b/src/pkg/crypto/aes/block.go index b930787cec..57a7e9e25f 100644 --- a/src/pkg/crypto/aes/block.go +++ b/src/pkg/crypto/aes/block.go @@ -37,7 +37,7 @@ package aes // Encrypt one block from src into dst, using the expanded key xk. -func encryptBlock(xk []uint32, dst, src []byte) { +func encryptBlockGo(xk []uint32, dst, src []byte) { var s0, s1, s2, s3, t0, t1, t2, t3 uint32 s0 = uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3]) @@ -82,7 +82,7 @@ func encryptBlock(xk []uint32, dst, src []byte) { } // Decrypt one block from src into dst, using the expanded key xk. -func decryptBlock(xk []uint32, dst, src []byte) { +func decryptBlockGo(xk []uint32, dst, src []byte) { var s0, s1, s2, s3, t0, t1, t2, t3 uint32 s0 = uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3]) @@ -139,7 +139,7 @@ func rotw(w uint32) uint32 { return w<<8 | w>>24 } // Key expansion algorithm. See FIPS-197, Figure 11. // Their rcon[i] is our powx[i-1] << 24. -func expandKey(key []byte, enc, dec []uint32) { +func expandKeyGo(key []byte, enc, dec []uint32) { // Encryption key setup. var i int nk := len(key) / 4 diff --git a/src/pkg/crypto/aes/cipher.go b/src/pkg/crypto/aes/cipher.go index 7d307c93a0..d931134a70 100644 --- a/src/pkg/crypto/aes/cipher.go +++ b/src/pkg/crypto/aes/cipher.go @@ -45,6 +45,10 @@ func NewCipher(key []byte) (cipher.Block, error) { func (c *aesCipher) BlockSize() int { return BlockSize } -func (c *aesCipher) Encrypt(dst, src []byte) { encryptBlock(c.enc, dst, src) } +func (c *aesCipher) Encrypt(dst, src []byte) { + encryptBlock(c.enc, dst, src) +} -func (c *aesCipher) Decrypt(dst, src []byte) { decryptBlock(c.dec, dst, src) } +func (c *aesCipher) Decrypt(dst, src []byte) { + decryptBlock(c.dec, dst, src) +} diff --git a/src/pkg/crypto/aes/cipher_asm.go b/src/pkg/crypto/aes/cipher_asm.go new file mode 100644 index 0000000000..21369fc382 --- /dev/null +++ b/src/pkg/crypto/aes/cipher_asm.go @@ -0,0 +1,46 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 + +package aes + +// defined in asm_$GOARCH.s +func hasAsm() bool +func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32) + +var useAsm = hasAsm() + +func encryptBlock(xk []uint32, dst, src []byte) { + if useAsm { + encryptBlockAsm(len(xk)/4-1, &xk[0], &dst[0], &src[0]) + } else { + encryptBlockGo(xk, dst, src) + } +} +func decryptBlock(xk []uint32, dst, src []byte) { + if useAsm { + decryptBlockAsm(len(xk)/4-1, &xk[0], &dst[0], &src[0]) + } else { + decryptBlockGo(xk, dst, src) + } +} +func expandKey(key []byte, enc, dec []uint32) { + if useAsm { + rounds := 10 + switch len(key) { + case 128 / 8: + rounds = 10 + case 192 / 8: + rounds = 12 + case 256 / 8: + rounds = 14 + } + expandKeyAsm(rounds, &key[0], &enc[0], &dec[0]) + } else { + expandKeyGo(key, enc, dec) + } +} diff --git a/src/pkg/crypto/aes/cipher_generic.go b/src/pkg/crypto/aes/cipher_generic.go new file mode 100644 index 0000000000..1714e0f1e5 --- /dev/null +++ b/src/pkg/crypto/aes/cipher_generic.go @@ -0,0 +1,19 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 + +package aes + +func encryptBlock(xk []uint32, dst, src []byte) { + encryptBlockGo(xk, dst, src) +} + +func decryptBlock(xk []uint32, dst, src []byte) { + decryptBlockGo(xk, dst, src) +} + +func expandKey(key []byte, enc, dec []uint32) { + expandKeyGo(key, enc, dec) +}