-- amd64 --
On a MacBookPro10,2 (Core i5):
benchmark                       old ns/op    new ns/op     delta
BenchmarkHash8Bytes                   471          524   +11.25%
BenchmarkHash1K                      3018         2220   -26.44%
BenchmarkHash8K                     20634        14604   -29.22%
BenchmarkHash8BytesUnaligned          468          523   +11.75%
BenchmarkHash1KUnaligned             3006         2212   -26.41%
BenchmarkHash8KUnaligned            20820        14652   -29.63%

benchmark                        old MB/s     new MB/s   speedup
BenchmarkHash8Bytes                 16.98        15.26     0.90x
BenchmarkHash1K                    339.26       461.19     1.36x
BenchmarkHash8K                    397.00       560.92     1.41x
BenchmarkHash8BytesUnaligned        17.08        15.27     0.89x
BenchmarkHash1KUnaligned           340.65       462.75     1.36x
BenchmarkHash8KUnaligned           393.45       559.08     1.42x
For comparison, on the same machine, openssl 0.9.8r reports
its md5 speed as 350 MB/s for 1K and 410 MB/s for 8K.
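
(The delta and speedup columns follow directly from the raw numbers; for
example, for BenchmarkHash1K above:

	delta   = (2220 - 3018) / 3018 = -26.44%
	speedup = 461.19 / 339.26      = 1.36x
)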
On an Intel Xeon E5520:
benchmark                       old ns/op    new ns/op     delta
BenchmarkHash8Bytes                   565          607    +7.43%
BenchmarkHash1K                      3753         2475   -34.05%
BenchmarkHash8K                     25945        16250   -37.37%
BenchmarkHash8BytesUnaligned          559          594    +6.26%
BenchmarkHash1KUnaligned             3754         2474   -34.10%
BenchmarkHash8KUnaligned            26011        16359   -37.11%

benchmark                        old MB/s     new MB/s   speedup
BenchmarkHash8Bytes                 14.15        13.17     0.93x
BenchmarkHash1K                    272.83       413.58     1.52x
BenchmarkHash8K                    315.74       504.11     1.60x
BenchmarkHash8BytesUnaligned        14.31        13.46     0.94x
BenchmarkHash1KUnaligned           272.73       413.78     1.52x
BenchmarkHash8KUnaligned           314.93       500.73     1.59x
For comparison, on the same machine, openssl 1.0.1 reports
its md5 speed as 443 MB/s for 1K and 513 MB/s for 8K.
-- 386 --
On a MacBookPro10,2 (Core i5):
benchmark                       old ns/op    new ns/op     delta
BenchmarkHash8Bytes                   602          670   +11.30%
BenchmarkHash1K                      4038         2549   -36.87%
BenchmarkHash8K                     27879        16690   -40.13%
BenchmarkHash8BytesUnaligned          602          670   +11.30%
BenchmarkHash1KUnaligned             4025         2546   -36.75%
BenchmarkHash8KUnaligned            27844        16692   -40.05%

benchmark                        old MB/s     new MB/s   speedup
BenchmarkHash8Bytes                 13.28        11.93     0.90x
BenchmarkHash1K                    253.58       401.69     1.58x
BenchmarkHash8K                    293.83       490.81     1.67x
BenchmarkHash8BytesUnaligned        13.27        11.94     0.90x
BenchmarkHash1KUnaligned           254.40       402.05     1.58x
BenchmarkHash8KUnaligned           294.21       490.77     1.67x
On an Intel Xeon E5520:
benchmark                       old ns/op    new ns/op     delta
BenchmarkHash8Bytes                   752          716    -4.79%
BenchmarkHash1K                      5307         2799   -47.26%
BenchmarkHash8K                     36993        18042   -51.23%
BenchmarkHash8BytesUnaligned          748          730    -2.41%
BenchmarkHash1KUnaligned             5301         2795   -47.27%
BenchmarkHash8KUnaligned            36983        18085   -51.10%

benchmark                        old MB/s     new MB/s   speedup
BenchmarkHash8Bytes                 10.64        11.16     1.05x
BenchmarkHash1K                    192.93       365.80     1.90x
BenchmarkHash8K                    221.44       454.03     2.05x
BenchmarkHash8BytesUnaligned        10.69        10.95     1.02x
BenchmarkHash1KUnaligned           193.15       366.36     1.90x
BenchmarkHash8KUnaligned           221.51       452.96     2.04x
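
(Numbers like these come from the crypto/md5 package benchmarks compared
with the benchcmp tool; a typical workflow, with file names illustrative:

	go test -bench=. crypto/md5 >old.txt
	<apply this CL, rebuild>
	go test -bench=. crypto/md5 >new.txt
	benchcmp old.txt new.txt
)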
R=agl
CC=golang-dev
https://golang.org/cl/7621049
}
var program = `
+// DO NOT EDIT.
+// Generate with: go run gen.go{{if .Full}} -full{{end}} | gofmt >md5block.go
+
+// +build !amd64,!386
+
package md5
import (
}
{{end}}
+const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386"
+
+var littleEndian bool
+
+func init() {
+ x := uint32(0x04030201)
+ y := [4]byte{0x1, 0x2, 0x3, 0x4}
+ littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y
+}
+
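
(The init function above detects byte order by reinterpreting a known
uint32 as bytes. The same probe as a standalone program, a demo sketch
rather than part of this CL:

	package main

	import (
		"fmt"
		"unsafe"
	)

	func main() {
		x := uint32(0x04030201)
		b := *(*[4]byte)(unsafe.Pointer(&x))
		// On a little-endian machine the least significant
		// byte, 0x01, is stored first in memory.
		fmt.Println("little endian:", b == [4]byte{0x01, 0x02, 0x03, 0x04})
	}
)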
func block(dig *digest, p []byte) {
a := dig.s[0]
b := dig.s[1]
	c := dig.s[2]
	d := dig.s[3]
	aa, bb, cc, dd := a, b, c, d
// This is a constant condition - it is not evaluated on each iteration.
- if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" {
+ if x86 {
// MD5 was designed so that x86 processors can just iterate
// over the block data directly as uint32s, and we generate
// less code and run 1.3x faster if we take advantage of that.
// My apologies.
X = (*[16]uint32)(unsafe.Pointer(&p[0]))
- } else if uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
+ } else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
X = (*[16]uint32)(unsafe.Pointer(&p[0]))
} else {
X = &xbuf
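
(When neither fast path applies, each 64-byte chunk must be decoded
portably into the xbuf scratch array before the rounds run; the generated
code amounts to a byte-by-byte little-endian assembly of each word, along
these lines:

	for i := 0; i < 16; i++ {
		j := i * 4
		xbuf[i] = uint32(p[j]) | uint32(p[j+1])<<8 |
			uint32(p[j+2])<<16 | uint32(p[j+3])<<24
	}
)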
+// DO NOT EDIT.
+// Generate with: go run gen.go -full | gofmt >md5block.go
+
+// +build !amd64,!386
+
package md5
import (
--- /dev/null
+// Original source:
+// http://www.zorinaq.com/papers/md5-amd64.html
+// http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 8a assembly, and adjusted for 386,
+// by the Go Authors.
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
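
(For reference, the four ROUND macros below compute MD5's round functions
F, G, H and I via the usual algebraic shortcuts; in Go notation, reference
only, not part of the CL:

	func F(b, c, d uint32) uint32 { return d ^ (b & (c ^ d)) } // == (b&c) | (^b&d)
	func G(b, c, d uint32) uint32 { return (b & d) | (c &^ d) }
	func H(b, c, d uint32) uint32 { return b ^ c ^ d }
	func I(b, c, d uint32) uint32 { return c ^ (b | ^d) }
)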
+#define ROUND1(a, b, c, d, index, const, shift) \
+ XORL c, BP; \
+ LEAL const(a)(DI*1), a; \
+ ANDL b, BP; \
+ XORL d, BP; \
+ MOVL (index*4)(SI), DI; \
+ ADDL BP, a; \
+ ROLL $shift, a; \
+ MOVL c, BP; \
+ ADDL b, a
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+ LEAL const(a)(DI*1),a; \
+ MOVL d, DI; \
+ ANDL b, DI; \
+ MOVL d, BP; \
+ NOTL BP; \
+ ANDL c, BP; \
+ ORL DI, BP; \
+ MOVL (index*4)(SI),DI; \
+ ADDL BP, a; \
+ ROLL $shift, a; \
+ ADDL b, a
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+ LEAL const(a)(DI*1),a; \
+ MOVL (index*4)(SI),DI; \
+ XORL d, BP; \
+ XORL b, BP; \
+ ADDL BP, a; \
+ ROLL $shift, a; \
+ MOVL b, BP; \
+ ADDL b, a
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+ LEAL const(a)(DI*1),a; \
+ ORL b, BP; \
+ XORL c, BP; \
+ ADDL BP, a; \
+ MOVL (index*4)(SI),DI; \
+ MOVL $0xffffffff, BP; \
+ ROLL $shift, a; \
+ XORL c, BP; \
+ ADDL b, a
+
+TEXT ·block(SB),7,$24-16
+ MOVL dig+0(FP), BP
+ MOVL p+4(FP), SI
+ MOVL n+8(FP), DX
+ SHRL $6, DX
+ SHLL $6, DX
+
+ LEAL (SI)(DX*1), DI
+ MOVL (0*4)(BP), AX
+ MOVL (1*4)(BP), BX
+ MOVL (2*4)(BP), CX
+ MOVL (3*4)(BP), DX
+
+ CMPL SI, DI
+ JEQ end
+
+ MOVL DI, 16(SP)
+
+loop:
+ MOVL AX, 0(SP)
+ MOVL BX, 4(SP)
+ MOVL CX, 8(SP)
+ MOVL DX, 12(SP)
+
+ MOVL (0*4)(SI), DI
+ MOVL DX, BP
+
+ ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+ ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+ ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+ ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+ ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+ ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+ ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+ ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+ ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+ ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+ ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+ ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+ ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+ ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+ ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+ ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+ MOVL (1*4)(SI), DI
+ MOVL DX, BP
+
+ ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+ ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+ ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+ ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+ ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+ ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+ ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+ ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+ ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+ ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+ ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+ ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+ ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+ ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+ ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+ ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+
+ MOVL (5*4)(SI), DI
+ MOVL CX, BP
+
+ ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+ ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+ ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+ ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+ ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+ ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+ ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+ ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+ ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+ ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+ ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+ ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+ ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+ ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+ ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+ ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+ MOVL (0*4)(SI), DI
+ MOVL $0xffffffff, BP
+ XORL DX, BP
+
+ ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+ ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+ ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+ ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+ ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+ ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+ ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+ ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+ ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+ ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+ ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+ ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+ ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+ ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+ ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+ ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+ ADDL 0(SP), AX
+ ADDL 4(SP), BX
+ ADDL 8(SP), CX
+ ADDL 12(SP), DX
+
+ ADDL $64, SI
+ CMPL SI, 16(SP)
+ JB loop
+
+end:
+ MOVL dig+0(FP), BP
+ MOVL AX, (0*4)(BP)
+ MOVL BX, (1*4)(BP)
+ MOVL CX, (2*4)(BP)
+ MOVL DX, (3*4)(BP)
+ RET
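
(A note on the TEXT headers: the 386 version above declares $24-16, a
24-byte local frame because a, b, c, d and the end-of-data pointer spill
to 0(SP) through 16(SP), plus 16 argument bytes for the 4-byte *digest
and the 12-byte slice header. The amd64 version below declares $0-32: no
frame, since R12-R15 hold the saved state and DI keeps the end pointer,
with 32 argument bytes for the 8-byte pointer and the 24-byte slice
header.)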
--- /dev/null
+// Original source:
+// http://www.zorinaq.com/papers/md5-amd64.html
+// http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 6a assembly by the Go Authors.
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
+TEXT ·block(SB),7,$0-32
+ MOVQ dig+0(FP), BP
+ MOVQ p+8(FP), SI
+ MOVQ n+16(FP), DX
+ SHRQ $6, DX
+ SHLQ $6, DX
+
+ LEAQ (SI)(DX*1), DI
+ MOVL (0*4)(BP), AX
+ MOVL (1*4)(BP), BX
+ MOVL (2*4)(BP), CX
+ MOVL (3*4)(BP), DX
+
+ CMPQ SI, DI
+ JEQ end
+
+loop:
+ MOVL AX, R12
+ MOVL BX, R13
+ MOVL CX, R14
+ MOVL DX, R15
+
+ MOVL (0*4)(SI), R8
+ MOVL DX, R9
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+ XORL c, R9; \
+ LEAL const(a)(R8*1), a; \
+ ANDL b, R9; \
+ XORL d, R9; \
+ MOVL (index*4)(SI), R8; \
+ ADDL R9, a; \
+ ROLL $shift, a; \
+ MOVL c, R9; \
+ ADDL b, a
+
+ ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+ ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+ ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+ ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+ ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+ ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+ ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+ ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+ ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+ ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+ ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+ ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+ ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+ ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+ ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+ ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+ MOVL (1*4)(SI), R8
+ MOVL DX, R9
+ MOVL DX, R10
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+ NOTL R9; \
+ LEAL const(a)(R8*1),a; \
+ ANDL b, R10; \
+ ANDL c, R9; \
+ MOVL (index*4)(SI),R8; \
+ ORL R9, R10; \
+ MOVL c, R9; \
+ ADDL R10, a; \
+ MOVL c, R10; \
+ ROLL $shift, a; \
+ ADDL b, a
+
+ ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+ ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+ ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+ ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+ ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+ ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+ ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+ ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+ ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+ ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+ ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+ ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+ ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+ ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+ ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+ ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+
+ MOVL (5*4)(SI), R8
+ MOVL CX, R9
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+ LEAL const(a)(R8*1),a; \
+ MOVL (index*4)(SI),R8; \
+ XORL d, R9; \
+ XORL b, R9; \
+ ADDL R9, a; \
+ ROLL $shift, a; \
+ MOVL b, R9; \
+ ADDL b, a
+
+ ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+ ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+ ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+ ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+ ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+ ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+ ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+ ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+ ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+ ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+ ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+ ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+ ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+ ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+ ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+ ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+ MOVL (0*4)(SI), R8
+ MOVL $0xffffffff, R9
+ XORL DX, R9
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+ LEAL const(a)(R8*1),a; \
+ ORL b, R9; \
+ XORL c, R9; \
+ ADDL R9, a; \
+ MOVL (index*4)(SI),R8; \
+ MOVL $0xffffffff, R9; \
+ ROLL $shift, a; \
+ XORL c, R9; \
+ ADDL b, a
+
+ ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+ ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+ ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+ ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+ ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+ ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+ ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+ ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+ ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+ ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+ ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+ ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+ ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+ ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+ ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+ ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+ ADDL R12, AX
+ ADDL R13, BX
+ ADDL R14, CX
+ ADDL R15, DX
+
+ ADDQ $64, SI
+ CMPQ SI, DI
+ JB loop
+
+end:
+ MOVL AX, (0*4)(BP)
+ MOVL BX, (1*4)(BP)
+ MOVL CX, (2*4)(BP)
+ MOVL DX, (3*4)(BP)
+ RET
--- /dev/null
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 386
+
+package md5
+
+func block(dig *digest, p []byte)
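
(The declaration above is the stub through which the Go code links
against the assembly block. For completeness, a caller exercising the
package via its exported API, an illustrative example rather than part
of the CL:

	package main

	import (
		"crypto/md5"
		"fmt"
		"io"
	)

	func main() {
		h := md5.New()
		io.WriteString(h, "The quick brown fox jumps over the lazy dog")
		fmt.Printf("%x\n", h.Sum(nil)) // 9e107d9d372bb6826bd81d3542a419d6
	}
)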