Cypherpunks repositories - gostls13.git/commitdiff
crypto/md5: faster amd64, 386 implementations
author Russ Cox <rsc@golang.org>
Thu, 21 Mar 2013 15:26:00 +0000 (11:26 -0400)
committer Russ Cox <rsc@golang.org>
Thu, 21 Mar 2013 15:26:00 +0000 (11:26 -0400)
-- amd64 --

On a MacBookPro10,2 (Core i5):

benchmark                       old ns/op    new ns/op    delta
BenchmarkHash8Bytes                   471          524  +11.25%
BenchmarkHash1K                      3018         2220  -26.44%
BenchmarkHash8K                     20634        14604  -29.22%
BenchmarkHash8BytesUnaligned          468          523  +11.75%
BenchmarkHash1KUnaligned             3006         2212  -26.41%
BenchmarkHash8KUnaligned            20820        14652  -29.63%

benchmark                        old MB/s     new MB/s  speedup
BenchmarkHash8Bytes                 16.98        15.26    0.90x
BenchmarkHash1K                    339.26       461.19    1.36x
BenchmarkHash8K                    397.00       560.92    1.41x
BenchmarkHash8BytesUnaligned        17.08        15.27    0.89x
BenchmarkHash1KUnaligned           340.65       462.75    1.36x
BenchmarkHash8KUnaligned           393.45       559.08    1.42x

For comparison, on the same machine, openssl 0.9.8r reports
its md5 speed as 350 MB/s for 1K and 410 MB/s for 8K.

On an Intel Xeon E5520:

benchmark                       old ns/op    new ns/op    delta
BenchmarkHash8Bytes                   565          607   +7.43%
BenchmarkHash1K                      3753         2475  -34.05%
BenchmarkHash8K                     25945        16250  -37.37%
BenchmarkHash8BytesUnaligned          559          594   +6.26%
BenchmarkHash1KUnaligned             3754         2474  -34.10%
BenchmarkHash8KUnaligned            26011        16359  -37.11%

benchmark                        old MB/s     new MB/s  speedup
BenchmarkHash8Bytes                 14.15        13.17    0.93x
BenchmarkHash1K                    272.83       413.58    1.52x
BenchmarkHash8K                    315.74       504.11    1.60x
BenchmarkHash8BytesUnaligned        14.31        13.46    0.94x
BenchmarkHash1KUnaligned           272.73       413.78    1.52x
BenchmarkHash8KUnaligned           314.93       500.73    1.59x

For comparison, on the same machine, openssl 1.0.1 reports
its md5 speed as 443 MB/s for 1K and 513 MB/s for 8K.

-- 386 --

On a MacBookPro10,2 (Core i5):

benchmark                       old ns/op    new ns/op    delta
BenchmarkHash8Bytes                   602          670  +11.30%
BenchmarkHash1K                      4038         2549  -36.87%
BenchmarkHash8K                     27879        16690  -40.13%
BenchmarkHash8BytesUnaligned          602          670  +11.30%
BenchmarkHash1KUnaligned             4025         2546  -36.75%
BenchmarkHash8KUnaligned            27844        16692  -40.05%

benchmark                        old MB/s     new MB/s  speedup
BenchmarkHash8Bytes                 13.28        11.93    0.90x
BenchmarkHash1K                    253.58       401.69    1.58x
BenchmarkHash8K                    293.83       490.81    1.67x
BenchmarkHash8BytesUnaligned        13.27        11.94    0.90x
BenchmarkHash1KUnaligned           254.40       402.05    1.58x
BenchmarkHash8KUnaligned           294.21       490.77    1.67x

On an Intel Xeon E5520:

benchmark                       old ns/op    new ns/op    delta
BenchmarkHash8Bytes                   752          716   -4.79%
BenchmarkHash1K                      5307         2799  -47.26%
BenchmarkHash8K                     36993        18042  -51.23%
BenchmarkHash8BytesUnaligned          748          730   -2.41%
BenchmarkHash1KUnaligned             5301         2795  -47.27%
BenchmarkHash8KUnaligned            36983        18085  -51.10%

benchmark                        old MB/s     new MB/s  speedup
BenchmarkHash8Bytes                 10.64        11.16    1.05x
BenchmarkHash1K                    192.93       365.80    1.90x
BenchmarkHash8K                    221.44       454.03    2.05x
BenchmarkHash8BytesUnaligned        10.69        10.95    1.02x
BenchmarkHash1KUnaligned           193.15       366.36    1.90x
BenchmarkHash8KUnaligned           221.51       452.96    2.04x

R=agl
CC=golang-dev
https://golang.org/cl/7621049

src/pkg/crypto/md5/gen.go
src/pkg/crypto/md5/md5block.go
src/pkg/crypto/md5/md5block_386.s [new file with mode: 0644]
src/pkg/crypto/md5/md5block_amd64.s [new file with mode: 0644]
src/pkg/crypto/md5/md5block_decl.go [new file with mode: 0644]

index 966bdae267b3a09b3b67c88c85fca77c498ccb8e..275b4aeea396b89c3719a0469d7d4f6049cb7239 100644 (file)
@@ -161,6 +161,11 @@ var data = Data{
 }
 
 var program = `
+// DO NOT EDIT.
+// Generate with: go run gen.go{{if .Full}} -full{{end}} | gofmt >md5block.go
+
+// +build !amd64,!386
+
 package md5
 
 import (
@@ -186,6 +191,16 @@ import (
        }
 {{end}}
 
+const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386"
+
+var littleEndian bool
+
+func init() {
+       x := uint32(0x04030201)
+       y := [4]byte{0x1, 0x2, 0x3, 0x4}
+       littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y
+}
+
 func block(dig *digest, p []byte) {
        a := dig.s[0]
        b := dig.s[1]
@@ -197,13 +212,13 @@ func block(dig *digest, p []byte) {
                aa, bb, cc, dd := a, b, c, d
 
                // This is a constant condition - it is not evaluated on each iteration.
-               if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" {
+               if x86 {
                        // MD5 was designed so that x86 processors can just iterate
                        // over the block data directly as uint32s, and we generate
                        // less code and run 1.3x faster if we take advantage of that.
                        // My apologies.
                        X = (*[16]uint32)(unsafe.Pointer(&p[0]))
-               } else if uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
+               } else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
                        X = (*[16]uint32)(unsafe.Pointer(&p[0]))
                } else {
                        X = &xbuf
index 0ca42177403c30565caab6b9ed900faacf3bc29b..a376fbee99b9e66d7e4d4caa43a28d81dc4011aa 100644 (file)
@@ -1,3 +1,8 @@
+// DO NOT EDIT.
+// Generate with: go run gen.go -full | gofmt >md5block.go
+
+// +build !amd64,!386
+
 package md5
 
 import (
diff --git a/src/pkg/crypto/md5/md5block_386.s b/src/pkg/crypto/md5/md5block_386.s
new file mode 100644 (file)
index 0000000..1083d83
--- /dev/null
@@ -0,0 +1,180 @@
+// Original source:
+//     http://www.zorinaq.com/papers/md5-amd64.html
+//     http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 8a assembly, and adjusted for 386,
+// by the Go Authors.
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+       XORL    c, BP; \
+       LEAL    const(a)(DI*1), a; \
+       ANDL    b, BP; \
+       XORL d, BP; \
+       MOVL (index*4)(SI), DI; \
+       ADDL BP, a; \
+       ROLL $shift, a; \
+       MOVL c, BP; \
+       ADDL b, a
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+       LEAL    const(a)(DI*1),a; \
+       MOVL    d,              DI; \
+       ANDL    b,              DI; \
+       MOVL    d,              BP; \
+       NOTL    BP; \
+       ANDL    c,              BP; \
+       ORL     DI,             BP; \
+       MOVL    (index*4)(SI),DI; \
+       ADDL    BP,             a; \
+       ROLL    $shift, a; \
+       ADDL    b,              a
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+       LEAL    const(a)(DI*1),a; \
+       MOVL    (index*4)(SI),DI; \
+       XORL    d,              BP; \
+       XORL    b,              BP; \
+       ADDL    BP,             a; \
+       ROLL    $shift,         a; \
+       MOVL    b,              BP; \
+       ADDL    b,              a
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+       LEAL    const(a)(DI*1),a; \
+       ORL     b,              BP; \
+       XORL    c,              BP; \
+       ADDL    BP,             a; \
+       MOVL    (index*4)(SI),DI; \
+       MOVL    $0xffffffff,    BP; \
+       ROLL    $shift,         a; \
+       XORL    c,              BP; \
+       ADDL    b,              a
+
+TEXT   ·block(SB),7,$24-16
+       MOVL    dig+0(FP),      BP
+       MOVL    p+4(FP),        SI
+       MOVL    n+8(FP), DX
+       SHRL    $6,             DX
+       SHLL    $6,             DX
+
+       LEAL    (SI)(DX*1),     DI
+       MOVL    (0*4)(BP),      AX
+       MOVL    (1*4)(BP),      BX
+       MOVL    (2*4)(BP),      CX
+       MOVL    (3*4)(BP),      DX
+
+       CMPL    SI,             DI
+       JEQ     end
+
+       MOVL    DI,             16(SP)
+
+loop:
+       MOVL    AX,             0(SP)
+       MOVL    BX,             4(SP)
+       MOVL    CX,             8(SP)
+       MOVL    DX,             12(SP)
+
+       MOVL    (0*4)(SI),      DI
+       MOVL    DX,             BP
+
+       ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+       ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+       ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+       ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+       ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+       ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+       ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+       ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+       ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+       ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+       ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+       ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+       ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+       ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+       ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+       ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+       MOVL    (1*4)(SI),      DI
+       MOVL    DX,             BP
+
+       ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+       ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+       ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+       ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+       ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+       ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+       ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+       ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+       ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+       ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+       ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+       ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+       ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+       ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+       ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+       ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+       MOVL    (5*4)(SI),      DI
+       MOVL    CX,             BP
+
+       ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+       ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+       ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+       ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+       ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+       ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+       ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+       ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+       ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+       ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+       ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+       ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+       ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+       ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+       ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+       ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+       MOVL    (0*4)(SI),      DI
+       MOVL    $0xffffffff,    BP
+       XORL    DX,             BP
+
+       ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+       ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+       ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+       ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+       ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+       ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+       ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+       ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+       ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+       ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+       ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+       ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+       ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+       ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+       ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+       ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+       ADDL    0(SP),  AX
+       ADDL    4(SP),  BX
+       ADDL    8(SP),  CX
+       ADDL    12(SP), DX
+
+       ADDL    $64,            SI
+       CMPL    SI,             16(SP)
+       JB      loop
+
+end:
+       MOVL    dig+0(FP),      BP
+       MOVL    AX,             (0*4)(BP)
+       MOVL    BX,             (1*4)(BP)
+       MOVL    CX,             (2*4)(BP)
+       MOVL    DX,             (3*4)(BP)
+       RET
diff --git a/src/pkg/crypto/md5/md5block_amd64.s b/src/pkg/crypto/md5/md5block_amd64.s
new file mode 100644 (file)
index 0000000..74a361e
--- /dev/null
@@ -0,0 +1,177 @@
+// Original source:
+//     http://www.zorinaq.com/papers/md5-amd64.html
+//     http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 6a assembly by the Go Authors.
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
+TEXT   ·block(SB),7,$0-32
+       MOVQ    dig+0(FP),      BP
+       MOVQ    p+8(FP),        SI
+       MOVQ    n+16(FP), DX
+       SHRQ    $6,             DX
+       SHLQ    $6,             DX
+
+       LEAQ    (SI)(DX*1),     DI
+       MOVL    (0*4)(BP),      AX
+       MOVL    (1*4)(BP),      BX
+       MOVL    (2*4)(BP),      CX
+       MOVL    (3*4)(BP),      DX
+
+       CMPQ    SI,             DI
+       JEQ     end
+
+loop:
+       MOVL    AX,             R12
+       MOVL    BX,             R13
+       MOVL    CX,             R14
+       MOVL    DX,             R15
+
+       MOVL    (0*4)(SI),      R8
+       MOVL    DX,             R9
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+       XORL    c, R9; \
+       LEAL    const(a)(R8*1), a; \
+       ANDL    b, R9; \
+       XORL d, R9; \
+       MOVL (index*4)(SI), R8; \
+       ADDL R9, a; \
+       ROLL $shift, a; \
+       MOVL c, R9; \
+       ADDL b, a
+
+       ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+       ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+       ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+       ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+       ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+       ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+       ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+       ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+       ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+       ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+       ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+       ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+       ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+       ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+       ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+       ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+       MOVL    (1*4)(SI),      R8
+       MOVL    DX,             R9
+       MOVL    DX,             R10
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+       NOTL    R9; \
+       LEAL    const(a)(R8*1),a; \
+       ANDL    b,              R10; \
+       ANDL    c,              R9; \
+       MOVL    (index*4)(SI),R8; \
+       ORL     R9,             R10; \
+       MOVL    c,              R9; \
+       ADDL    R10,            a; \
+       MOVL    c,              R10; \
+       ROLL    $shift, a; \
+       ADDL    b,              a
+
+       ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+       ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+       ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+       ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+       ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+       ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+       ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+       ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+       ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+       ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+       ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+       ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+       ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+       ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+       ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+       ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+       MOVL    (5*4)(SI),      R8
+       MOVL    CX,             R9
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+       LEAL    const(a)(R8*1),a; \
+       MOVL    (index*4)(SI),R8; \
+       XORL    d,              R9; \
+       XORL    b,              R9; \
+       ADDL    R9,             a; \
+       ROLL    $shift,         a; \
+       MOVL    b,              R9; \
+       ADDL    b,              a
+
+       ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+       ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+       ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+       ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+       ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+       ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+       ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+       ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+       ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+       ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+       ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+       ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+       ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+       ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+       ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+       ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+       MOVL    (0*4)(SI),      R8
+       MOVL    $0xffffffff,    R9
+       XORL    DX,             R9
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+       LEAL    const(a)(R8*1),a; \
+       ORL     b,              R9; \
+       XORL    c,              R9; \
+       ADDL    R9,             a; \
+       MOVL    (index*4)(SI),R8; \
+       MOVL    $0xffffffff,    R9; \
+       ROLL    $shift,         a; \
+       XORL    c,              R9; \
+       ADDL    b,              a
+       
+       ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+       ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+       ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+       ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+       ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+       ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+       ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+       ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+       ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+       ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+       ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+       ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+       ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+       ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+       ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+       ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+       ADDL    R12,    AX
+       ADDL    R13,    BX
+       ADDL    R14,    CX
+       ADDL    R15,    DX
+
+       ADDQ    $64,            SI
+       CMPQ    SI,             DI
+       JB      loop
+
+end:
+       MOVL    AX,             (0*4)(BP)
+       MOVL    BX,             (1*4)(BP)
+       MOVL    CX,             (2*4)(BP)
+       MOVL    DX,             (3*4)(BP)
+       RET
diff --git a/src/pkg/crypto/md5/md5block_decl.go b/src/pkg/crypto/md5/md5block_decl.go
new file mode 100644 (file)
index 0000000..14190c6
--- /dev/null
@@ -0,0 +1,9 @@
+// Copyright 2013 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 386
+
+package md5
+
+func block(dig *digest, p []byte)