crypto/md5: native arm assembler version

author Nick Craig-Wood <nick@craig-wood.com>

Thu, 25 Jul 2013 03:28:27 +0000 (13:28 +1000)

committer Dave Cheney <dave@cheney.net>

Thu, 25 Jul 2013 03:28:27 +0000 (13:28 +1000)
author Nick Craig-Wood <nick@craig-wood.com>
Thu, 25 Jul 2013 03:28:27 +0000 (13:28 +1000)
committer Dave Cheney <dave@cheney.net>
Thu, 25 Jul 2013 03:28:27 +0000 (13:28 +1000)
diff --git a/src/pkg/crypto/md5/gen.go b/src/pkg/crypto/md5/gen.go

index 275b4aeea396b89c3719a0469d7d4f6049cb7239..ccaa7c13d38d56d9d0af077bad1d4f35de862d9c 100644 (file)
--- a/src/pkg/crypto/md5/gen.go
+++ b/src/pkg/crypto/md5/gen.go
@@ -164,7 +164,7 @@ var program = `
  // DO NOT EDIT.
  // Generate with: go run gen.go{{if .Full}} -full{{end}} | gofmt >md5block.go
  
-// +build !amd64
+// +build !amd64,!386,!arm
  
  package md5
  
diff --git a/src/pkg/crypto/md5/md5_test.go b/src/pkg/crypto/md5/md5_test.go

index 4901655e778f861ce61af0dc880a892e43ae0020..b51e3044179a0968b59d32ce4db5331c1ff9f7e4 100644 (file)
--- a/src/pkg/crypto/md5/md5_test.go
+++ b/src/pkg/crypto/md5/md5_test.go
@@ -81,6 +81,30 @@ func TestGolden(t *testing.T) {
         }
  }
  
+func TestLarge(t *testing.T) {
+       const N = 10000
+       ok := "2bb571599a4180e1d542f76904adc3df" // md5sum of "0123456789" * 1000
+       block := make([]byte, 10004)
+       c := New()
+       for offset := 0; offset < 4; offset++ {
+               for i := 0; i < N; i++ {
+                       block[offset+i] = '0' + byte(i%10)
+               }
+               for blockSize := 10; blockSize <= N; blockSize *= 10 {
+                       blocks := N / blockSize
+                       b := block[offset : offset+blockSize]
+                       c.Reset()
+                       for i := 0; i < blocks; i++ {
+                               c.Write(b)
+                       }
+                       s := fmt.Sprintf("%x", c.Sum(nil))
+                       if s != ok {
+                               t.Fatalf("md5 TestLarge offset=%d, blockSize=%d = %s want %s", offset, blockSize, s, ok)
+                       }
+               }
+       }
+}
+
  func ExampleNew() {
         h := New()
         io.WriteString(h, "The fog is getting thicker!")
diff --git a/src/pkg/crypto/md5/md5block.go b/src/pkg/crypto/md5/md5block.go

index a376fbee99b9e66d7e4d4caa43a28d81dc4011aa..3e739e36ffd764790a20d6b15821f29ab396dc04 100644 (file)
--- a/src/pkg/crypto/md5/md5block.go
+++ b/src/pkg/crypto/md5/md5block.go
@@ -1,7 +1,7 @@
  // DO NOT EDIT.
  // Generate with: go run gen.go -full | gofmt >md5block.go
  
-// +build !amd64,!386
+// +build !amd64,!386,!arm
  
  package md5
  
diff --git a/src/pkg/crypto/md5/md5block_arm.s b/src/pkg/crypto/md5/md5block_arm.s

new file mode 100644 (file)

index 0000000..9a068c3
--- /dev/null
+++ b/src/pkg/crypto/md5/md5block_arm.s
@@ -0,0 +1,297 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// ARM version of md5block.go
+
+// Register definitions
+table = 0      // Pointer to MD5 constants table
+data = 1       // Pointer to data to hash
+a = 2          // MD5 accumulator
+b = 3          // MD5 accumulator
+c = 4          // MD5 accumulator
+d = 5          // MD5 accumulator
+c0 = 6         // MD5 constant
+c1 = 7         // MD5 constant
+c2 = 8         // MD5 constant
+// r9, r10 are forbidden
+// r11 is OK provided you check the assembled output to confirm that no synthetic instructions use it
+c3 = 11                // MD5 constant
+t0 = 12                // temporary
+t1 = 14                // temporary
+
+// func block(dig *digest, p []byte)
+// 0(FP) is *digest
+// 4(FP) is p.array (struct Slice)
+// 8(FP) is p.len
+//12(FP) is p.cap
+//
+// Stack frame
+p_end = -4     // -4(SP) pointer to the end of data
+p_data = -8    // -8(SP) current data pointer
+buf = -8-4*16  //-72(SP) 16 words temporary buffer
+               // 3 words at 4..12(R13) for called routine parameters
+
+TEXT   ·block(SB), 7, $84-16
+       MOVW    p+4(FP), R(data)        // pointer to the data
+       MOVW    p_len+8(FP), R(t0)      // number of bytes
+       ADD     R(data), R(t0)
+       MOVW    R(t0), p_end(SP)        // pointer to end of data
+
+loop:
+       MOVW    R(data), p_data(SP)     // Save R(data)
+       AND.S   $3, R(data), R(t0)      // TST $3, R(data) not working see issue 5921
+       BEQ     aligned                 // aligned detected - skip copy
+
+       // Copy the unaligned source data into the aligned temporary buffer
+       // memmove(to=4(R13), from=8(R13), n=12(R13)) - Corrupts all registers
+       MOVW    $buf(SP), R(table)      // to
+       MOVW    $64, R(c0)              // n
+       MOVM.IB [R(table),R(data),R(c0)], (R13)
+       BL      runtime·memmove(SB)
+
+       // Point to the local aligned copy of the data
+       MOVW    $buf(SP), R(data)
+
+aligned:
+       // Point to the table of constants
+       // A PC-relative add would be cheaper than this
+       MOVW    $·table(SB), R(table)
+
+       // Load up initial MD5 accumulator
+       MOVW    dig+0(FP), R(c0)
+       MOVM.IA (R(c0)), [R(a),R(b),R(c),R(d)]
+
+// a += (((c^d)&b)^d) + X[index] + const
+// a = (a<<shift | a>>(32-shift)) + b
+#define ROUND1(a, b, c, d, index, shift, const) \
+       EOR     R(c), R(d), R(t0)               ; \
+       AND     R(b), R(t0)                     ; \
+       EOR     R(d), R(t0)                     ; \
+       MOVW    (index<<2)(R(data)), R(t1)      ; \
+       ADD     R(t1), R(t0)                    ; \
+       ADD     R(const), R(t0)                 ; \
+       ADD     R(t0), R(a)                     ; \
+       ADD     R(a)@>(32-shift), R(b), R(a)    ;
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND1(a, b, c, d,  0,  7, c0)
+       ROUND1(d, a, b, c,  1, 12, c1)
+       ROUND1(c, d, a, b,  2, 17, c2)
+       ROUND1(b, c, d, a,  3, 22, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND1(a, b, c, d,  4,  7, c0)
+       ROUND1(d, a, b, c,  5, 12, c1)
+       ROUND1(c, d, a, b,  6, 17, c2)
+       ROUND1(b, c, d, a,  7, 22, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND1(a, b, c, d,  8,  7, c0)
+       ROUND1(d, a, b, c,  9, 12, c1)
+       ROUND1(c, d, a, b, 10, 17, c2)
+       ROUND1(b, c, d, a, 11, 22, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND1(a, b, c, d, 12,  7, c0)
+       ROUND1(d, a, b, c, 13, 12, c1)
+       ROUND1(c, d, a, b, 14, 17, c2)
+       ROUND1(b, c, d, a, 15, 22, c3)
+
+// a += (((b^c)&d)^c) + X[index] + const
+// a = (a<<shift | a>>(32-shift)) + b
+#define ROUND2(a, b, c, d, index, shift, const) \
+       EOR     R(b), R(c), R(t0)               ; \
+       AND     R(d), R(t0)                     ; \
+       EOR     R(c), R(t0)                     ; \
+       MOVW    (index<<2)(R(data)), R(t1)      ; \
+       ADD     R(t1), R(t0)                    ; \
+       ADD     R(const), R(t0)                 ; \
+       ADD     R(t0), R(a)                     ; \
+       ADD     R(a)@>(32-shift), R(b), R(a)    ;
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND2(a, b, c, d,  1,  5, c0)
+       ROUND2(d, a, b, c,  6,  9, c1)
+       ROUND2(c, d, a, b, 11, 14, c2)
+       ROUND2(b, c, d, a,  0, 20, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND2(a, b, c, d,  5,  5, c0)
+       ROUND2(d, a, b, c, 10,  9, c1)
+       ROUND2(c, d, a, b, 15, 14, c2)
+       ROUND2(b, c, d, a,  4, 20, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND2(a, b, c, d,  9,  5, c0)
+       ROUND2(d, a, b, c, 14,  9, c1)
+       ROUND2(c, d, a, b,  3, 14, c2)
+       ROUND2(b, c, d, a,  8, 20, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND2(a, b, c, d, 13,  5, c0)
+       ROUND2(d, a, b, c,  2,  9, c1)
+       ROUND2(c, d, a, b,  7, 14, c2)
+       ROUND2(b, c, d, a, 12, 20, c3)
+
+// a += (b^c^d) + X[index] + const
+// a = (a<<shift | a>>(32-shift)) + b
+#define ROUND3(a, b, c, d, index, shift, const) \
+       EOR     R(b), R(c), R(t0)               ; \
+       EOR     R(d), R(t0)                     ; \
+       MOVW    (index<<2)(R(data)), R(t1)      ; \
+       ADD     R(t1), R(t0)                    ; \
+       ADD     R(const), R(t0)                 ; \
+       ADD     R(t0), R(a)                     ; \
+       ADD     R(a)@>(32-shift), R(b), R(a)    ;
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND3(a, b, c, d,  5,  4, c0)
+       ROUND3(d, a, b, c,  8, 11, c1)
+       ROUND3(c, d, a, b, 11, 16, c2)
+       ROUND3(b, c, d, a, 14, 23, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND3(a, b, c, d,  1,  4, c0)
+       ROUND3(d, a, b, c,  4, 11, c1)
+       ROUND3(c, d, a, b,  7, 16, c2)
+       ROUND3(b, c, d, a, 10, 23, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND3(a, b, c, d, 13,  4, c0)
+       ROUND3(d, a, b, c,  0, 11, c1)
+       ROUND3(c, d, a, b,  3, 16, c2)
+       ROUND3(b, c, d, a,  6, 23, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND3(a, b, c, d,  9,  4, c0)
+       ROUND3(d, a, b, c, 12, 11, c1)
+       ROUND3(c, d, a, b, 15, 16, c2)
+       ROUND3(b, c, d, a,  2, 23, c3)
+
+// a += (c^(b|^d)) + X[index] + const
+// a = (a<<shift | a>>(32-shift)) + b
+#define ROUND4(a, b, c, d, index, shift, const) \
+       MVN     R(d), R(t0)                     ; \
+       ORR     R(b), R(t0)                     ; \
+       EOR     R(c), R(t0)                     ; \
+       MOVW    (index<<2)(R(data)), R(t1)      ; \
+       ADD     R(t1), R(t0)                    ; \
+       ADD     R(const), R(t0)                 ; \
+       ADD     R(t0), R(a)                     ; \
+       ADD     R(a)@>(32-shift), R(b), R(a)    ;
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND4(a, b, c, d,  0,  6, c0)
+       ROUND4(d, a, b, c,  7, 10, c1)
+       ROUND4(c, d, a, b, 14, 15, c2)
+       ROUND4(b, c, d, a,  5, 21, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND4(a, b, c, d, 12,  6, c0)
+       ROUND4(d, a, b, c,  3, 10, c1)
+       ROUND4(c, d, a, b, 10, 15, c2)
+       ROUND4(b, c, d, a,  1, 21, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND4(a, b, c, d,  8,  6, c0)
+       ROUND4(d, a, b, c, 15, 10, c1)
+       ROUND4(c, d, a, b,  6, 15, c2)
+       ROUND4(b, c, d, a, 13, 21, c3)
+
+       MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+       ROUND4(a, b, c, d,  4,  6, c0)
+       ROUND4(d, a, b, c, 11, 10, c1)
+       ROUND4(c, d, a, b,  2, 15, c2)
+       ROUND4(b, c, d, a,  9, 21, c3)
+
+       MOVW    dig+0(FP), R(t0)
+       MOVM.IA (R(t0)), [R(c0),R(c1),R(c2),R(c3)]
+
+       ADD     R(c0), R(a)
+       ADD     R(c1), R(b)
+       ADD     R(c2), R(c)
+       ADD     R(c3), R(d)
+
+       MOVM.IA [R(a),R(b),R(c),R(d)], (R(t0))
+
+       MOVW    p_data(SP), R(data)
+       MOVW    p_end(SP), R(t0)
+       ADD     $64, R(data)
+       CMP     R(t0), R(data)
+       BLO     loop
+
+       RET
+
+// MD5 constants table
+
+       // Round 1
+       DATA    ·table+0x00(SB)/4, $0xd76aa478
+       DATA    ·table+0x04(SB)/4, $0xe8c7b756
+       DATA    ·table+0x08(SB)/4, $0x242070db
+       DATA    ·table+0x0c(SB)/4, $0xc1bdceee
+       DATA    ·table+0x10(SB)/4, $0xf57c0faf
+       DATA    ·table+0x14(SB)/4, $0x4787c62a
+       DATA    ·table+0x18(SB)/4, $0xa8304613
+       DATA    ·table+0x1c(SB)/4, $0xfd469501
+       DATA    ·table+0x20(SB)/4, $0x698098d8
+       DATA    ·table+0x24(SB)/4, $0x8b44f7af
+       DATA    ·table+0x28(SB)/4, $0xffff5bb1
+       DATA    ·table+0x2c(SB)/4, $0x895cd7be
+       DATA    ·table+0x30(SB)/4, $0x6b901122
+       DATA    ·table+0x34(SB)/4, $0xfd987193
+       DATA    ·table+0x38(SB)/4, $0xa679438e
+       DATA    ·table+0x3c(SB)/4, $0x49b40821
+       // Round 2
+       DATA    ·table+0x40(SB)/4, $0xf61e2562
+       DATA    ·table+0x44(SB)/4, $0xc040b340
+       DATA    ·table+0x48(SB)/4, $0x265e5a51
+       DATA    ·table+0x4c(SB)/4, $0xe9b6c7aa
+       DATA    ·table+0x50(SB)/4, $0xd62f105d
+       DATA    ·table+0x54(SB)/4, $0x02441453
+       DATA    ·table+0x58(SB)/4, $0xd8a1e681
+       DATA    ·table+0x5c(SB)/4, $0xe7d3fbc8
+       DATA    ·table+0x60(SB)/4, $0x21e1cde6
+       DATA    ·table+0x64(SB)/4, $0xc33707d6
+       DATA    ·table+0x68(SB)/4, $0xf4d50d87
+       DATA    ·table+0x6c(SB)/4, $0x455a14ed
+       DATA    ·table+0x70(SB)/4, $0xa9e3e905
+       DATA    ·table+0x74(SB)/4, $0xfcefa3f8
+       DATA    ·table+0x78(SB)/4, $0x676f02d9
+       DATA    ·table+0x7c(SB)/4, $0x8d2a4c8a
+       // Round 3
+       DATA    ·table+0x80(SB)/4, $0xfffa3942
+       DATA    ·table+0x84(SB)/4, $0x8771f681
+       DATA    ·table+0x88(SB)/4, $0x6d9d6122
+       DATA    ·table+0x8c(SB)/4, $0xfde5380c
+       DATA    ·table+0x90(SB)/4, $0xa4beea44
+       DATA    ·table+0x94(SB)/4, $0x4bdecfa9
+       DATA    ·table+0x98(SB)/4, $0xf6bb4b60
+       DATA    ·table+0x9c(SB)/4, $0xbebfbc70
+       DATA    ·table+0xa0(SB)/4, $0x289b7ec6
+       DATA    ·table+0xa4(SB)/4, $0xeaa127fa
+       DATA    ·table+0xa8(SB)/4, $0xd4ef3085
+       DATA    ·table+0xac(SB)/4, $0x04881d05
+       DATA    ·table+0xb0(SB)/4, $0xd9d4d039
+       DATA    ·table+0xb4(SB)/4, $0xe6db99e5
+       DATA    ·table+0xb8(SB)/4, $0x1fa27cf8
+       DATA    ·table+0xbc(SB)/4, $0xc4ac5665
+       // Round 4
+       DATA    ·table+0xc0(SB)/4, $0xf4292244
+       DATA    ·table+0xc4(SB)/4, $0x432aff97
+       DATA    ·table+0xc8(SB)/4, $0xab9423a7
+       DATA    ·table+0xcc(SB)/4, $0xfc93a039
+       DATA    ·table+0xd0(SB)/4, $0x655b59c3
+       DATA    ·table+0xd4(SB)/4, $0x8f0ccc92
+       DATA    ·table+0xd8(SB)/4, $0xffeff47d
+       DATA    ·table+0xdc(SB)/4, $0x85845dd1
+       DATA    ·table+0xe0(SB)/4, $0x6fa87e4f
+       DATA    ·table+0xe4(SB)/4, $0xfe2ce6e0
+       DATA    ·table+0xe8(SB)/4, $0xa3014314
+       DATA    ·table+0xec(SB)/4, $0x4e0811a1
+       DATA    ·table+0xf0(SB)/4, $0xf7537e82
+       DATA    ·table+0xf4(SB)/4, $0xbd3af235
+       DATA    ·table+0xf8(SB)/4, $0x2ad7d2bb
+       DATA    ·table+0xfc(SB)/4, $0xeb86d391
+       // Global definition
+       GLOBL   ·table(SB),8,$256
diff --git a/src/pkg/crypto/md5/md5block_decl.go b/src/pkg/crypto/md5/md5block_decl.go

index 3365200860a131310d9dc6b58dbf41a4482e5c20..c4d6aaaf03a1e33b46feac4bfda30c3639ae69f1 100644 (file)
--- a/src/pkg/crypto/md5/md5block_decl.go
+++ b/src/pkg/crypto/md5/md5block_decl.go
@@ -2,7 +2,7 @@
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
-// +build amd64 386
+// +build amd64 386 arm
  
  package md5
author	Nick Craig-Wood <nick@craig-wood.com>
	Thu, 25 Jul 2013 03:28:27 +0000 (13:28 +1000)
committer	Dave Cheney <dave@cheney.net>
	Thu, 25 Jul 2013 03:28:27 +0000 (13:28 +1000)
src/pkg/crypto/md5/gen.go		patch \| blob \| history
src/pkg/crypto/md5/md5_test.go		patch \| blob \| history
src/pkg/crypto/md5/md5block.go		patch \| blob \| history
src/pkg/crypto/md5/md5block_arm.s	[new file with mode: 0644]	patch \| blob
src/pkg/crypto/md5/md5block_decl.go		patch \| blob \| history