--- /dev/null
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// This program generates md5block.go
+// Invoke as
+//
+// go run gen.go [-full] |gofmt >md5block.go
+//
+// The -full flag causes the generated code to do a full
+// (16x) unrolling instead of a 4x unrolling.
+
+package main
+
+import (
+ "flag"
+ "log"
+ "os"
+ "strings"
+ "text/template"
+)
+
+func main() {
+ flag.Parse()
+
+ t := template.Must(template.New("main").Funcs(funcs).Parse(program))
+ if err := t.Execute(os.Stdout, data); err != nil {
+ log.Fatal(err)
+ }
+}
+
+type Data struct {
+ a, b, c, d string
+ Shift1 []int
+ Shift2 []int
+ Shift3 []int
+ Shift4 []int
+ Table1 []uint32
+ Table2 []uint32
+ Table3 []uint32
+ Table4 []uint32
+ Full bool
+}
+
+var funcs = template.FuncMap{
+ "dup": dup,
+ "relabel": relabel,
+ "rotate": rotate,
+}
+
+func dup(count int, x []int) []int {
+ var out []int
+ for i := 0; i < count; i++ {
+ out = append(out, x...)
+ }
+ return out
+}
+
+func relabel(s string) string {
+ return strings.NewReplacer("a", data.a, "b", data.b, "c", data.c, "d", data.d).Replace(s)
+}
+
+func rotate() string {
+ data.a, data.b, data.c, data.d = data.d, data.a, data.b, data.c
+ return "" // no output
+}
+
+func init() {
+ flag.BoolVar(&data.Full, "full", false, "complete unrolling")
+}
+
+var data = Data{
+ a: "a",
+ b: "b",
+ c: "c",
+ d: "d",
+ Shift1: []int{7, 12, 17, 22},
+ Shift2: []int{5, 9, 14, 20},
+ Shift3: []int{4, 11, 16, 23},
+ Shift4: []int{6, 10, 15, 21},
+
+ // table[i] = int((1<<32) * abs(sin(i+1 radians))).
+ Table1: []uint32{
+ // round 1
+ 0xd76aa478,
+ 0xe8c7b756,
+ 0x242070db,
+ 0xc1bdceee,
+ 0xf57c0faf,
+ 0x4787c62a,
+ 0xa8304613,
+ 0xfd469501,
+ 0x698098d8,
+ 0x8b44f7af,
+ 0xffff5bb1,
+ 0x895cd7be,
+ 0x6b901122,
+ 0xfd987193,
+ 0xa679438e,
+ 0x49b40821,
+ },
+ Table2: []uint32{
+ // round 2
+ 0xf61e2562,
+ 0xc040b340,
+ 0x265e5a51,
+ 0xe9b6c7aa,
+ 0xd62f105d,
+ 0x2441453,
+ 0xd8a1e681,
+ 0xe7d3fbc8,
+ 0x21e1cde6,
+ 0xc33707d6,
+ 0xf4d50d87,
+ 0x455a14ed,
+ 0xa9e3e905,
+ 0xfcefa3f8,
+ 0x676f02d9,
+ 0x8d2a4c8a,
+ },
+ Table3: []uint32{
+ // round3
+ 0xfffa3942,
+ 0x8771f681,
+ 0x6d9d6122,
+ 0xfde5380c,
+ 0xa4beea44,
+ 0x4bdecfa9,
+ 0xf6bb4b60,
+ 0xbebfbc70,
+ 0x289b7ec6,
+ 0xeaa127fa,
+ 0xd4ef3085,
+ 0x4881d05,
+ 0xd9d4d039,
+ 0xe6db99e5,
+ 0x1fa27cf8,
+ 0xc4ac5665,
+ },
+ Table4: []uint32{
+ // round 4
+ 0xf4292244,
+ 0x432aff97,
+ 0xab9423a7,
+ 0xfc93a039,
+ 0x655b59c3,
+ 0x8f0ccc92,
+ 0xffeff47d,
+ 0x85845dd1,
+ 0x6fa87e4f,
+ 0xfe2ce6e0,
+ 0xa3014314,
+ 0x4e0811a1,
+ 0xf7537e82,
+ 0xbd3af235,
+ 0x2ad7d2bb,
+ 0xeb86d391,
+ },
+}
+
+var program = `
+package md5
+
+import (
+ "unsafe"
+ "runtime"
+)
+
+{{if not .Full}}
+ var t1 = [...]uint32{
+ {{range .Table1}}{{printf "\t%#x,\n" .}}{{end}}
+ }
+
+ var t2 = [...]uint32{
+ {{range .Table2}}{{printf "\t%#x,\n" .}}{{end}}
+ }
+
+ var t3 = [...]uint32{
+ {{range .Table3}}{{printf "\t%#x,\n" .}}{{end}}
+ }
+
+ var t4 = [...]uint32{
+ {{range .Table4}}{{printf "\t%#x,\n" .}}{{end}}
+ }
+{{end}}
+
+func _Block(dig *digest, p []byte) int {
+ a := dig.s[0]
+ b := dig.s[1]
+ c := dig.s[2]
+ d := dig.s[3]
+ n := 0
+ var X *[16]uint32
+ var xbuf [16]uint32
+ for len(p) >= _Chunk {
+ aa, bb, cc, dd := a, b, c, d
+
+ // This is a constant condition - it is not evaluated on each iteration.
+ if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" {
+ // MD5 was designed so that x86 processors can just iterate
+ // over the block data directly as uint32s, and we generate
+ // less code and run 1.3x faster if we take advantage of that.
+ // My apologies.
+ X = (*[16]uint32)(unsafe.Pointer(&p[0]))
+ } else {
+ X = &xbuf
+ j := 0
+ for i := 0; i < 16; i++ {
+ X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
+ j += 4
+ }
+ }
+
+ {{if .Full}}
+ // Round 1.
+ {{range $i, $s := dup 4 .Shift1}}
+ {{index $.Table1 $i | printf "a += (((c^d)&b)^d) + X[%d] + %d" $i | relabel}}
+ {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+ {{rotate}}
+ {{end}}
+
+ // Round 2.
+ {{range $i, $s := dup 4 .Shift2}}
+ {{index $.Table2 $i | printf "a += (((b^c)&d)^c) + X[(1+5*%d)&15] + %d" $i | relabel}}
+ {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+ {{rotate}}
+ {{end}}
+
+ // Round 3.
+ {{range $i, $s := dup 4 .Shift3}}
+ {{index $.Table3 $i | printf "a += (b^c^d) + X[(5+3*%d)&15] + %d" $i | relabel}}
+ {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+ {{rotate}}
+ {{end}}
+
+ // Round 4.
+ {{range $i, $s := dup 4 .Shift4}}
+ {{index $.Table4 $i | printf "a += (c^(b|^d)) + X[(7*%d)&15] + %d" $i | relabel}}
+ {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+ {{rotate}}
+ {{end}}
+ {{else}}
+ // Round 1.
+ for i := uint(0); i < 16; {
+ {{range $s := .Shift1}}
+ {{printf "a += (((c^d)&b)^d) + X[i&15] + t1[i&15]" | relabel}}
+ {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+ i++
+ {{rotate}}
+ {{end}}
+ }
+
+ // Round 2.
+ for i := uint(0); i < 16; {
+ {{range $s := .Shift2}}
+ {{printf "a += (((b^c)&d)^c) + X[(1+5*i)&15] + t2[i&15]" | relabel}}
+ {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+ i++
+ {{rotate}}
+ {{end}}
+ }
+
+ // Round 3.
+ for i := uint(0); i < 16; {
+ {{range $s := .Shift3}}
+ {{printf "a += (b^c^d) + X[(5+3*i)&15] + t3[i&15]" | relabel}}
+ {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+ i++
+ {{rotate}}
+ {{end}}
+ }
+
+ // Round 4.
+ for i := uint(0); i < 16; {
+ {{range $s := .Shift4}}
+ {{printf "a += (c^(b|^d)) + X[(7*i)&15] + t4[i&15]" | relabel}}
+ {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+ i++
+ {{rotate}}
+ {{end}}
+ }
+ {{end}}
+
+ a += aa
+ b += bb
+ c += cc
+ d += dd
+
+ p = p[_Chunk:]
+ n += _Chunk
+ }
+
+ dig.s[0] = a
+ dig.s[1] = b
+ dig.s[2] = c
+ dig.s[3] = d
+ return n
+}
+`
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// MD5 block step.
-// In its own file so that a faster assembly or C version
-// can be substituted easily.
-
package md5
-// table[i] = int((1<<32) * abs(sin(i+1 radians))).
-var table = []uint32{
- // round 1
- 0xd76aa478,
- 0xe8c7b756,
- 0x242070db,
- 0xc1bdceee,
- 0xf57c0faf,
- 0x4787c62a,
- 0xa8304613,
- 0xfd469501,
- 0x698098d8,
- 0x8b44f7af,
- 0xffff5bb1,
- 0x895cd7be,
- 0x6b901122,
- 0xfd987193,
- 0xa679438e,
- 0x49b40821,
-
- // round 2
- 0xf61e2562,
- 0xc040b340,
- 0x265e5a51,
- 0xe9b6c7aa,
- 0xd62f105d,
- 0x2441453,
- 0xd8a1e681,
- 0xe7d3fbc8,
- 0x21e1cde6,
- 0xc33707d6,
- 0xf4d50d87,
- 0x455a14ed,
- 0xa9e3e905,
- 0xfcefa3f8,
- 0x676f02d9,
- 0x8d2a4c8a,
-
- // round3
- 0xfffa3942,
- 0x8771f681,
- 0x6d9d6122,
- 0xfde5380c,
- 0xa4beea44,
- 0x4bdecfa9,
- 0xf6bb4b60,
- 0xbebfbc70,
- 0x289b7ec6,
- 0xeaa127fa,
- 0xd4ef3085,
- 0x4881d05,
- 0xd9d4d039,
- 0xe6db99e5,
- 0x1fa27cf8,
- 0xc4ac5665,
-
- // round 4
- 0xf4292244,
- 0x432aff97,
- 0xab9423a7,
- 0xfc93a039,
- 0x655b59c3,
- 0x8f0ccc92,
- 0xffeff47d,
- 0x85845dd1,
- 0x6fa87e4f,
- 0xfe2ce6e0,
- 0xa3014314,
- 0x4e0811a1,
- 0xf7537e82,
- 0xbd3af235,
- 0x2ad7d2bb,
- 0xeb86d391,
-}
-
-var shift1 = []uint{7, 12, 17, 22}
-var shift2 = []uint{5, 9, 14, 20}
-var shift3 = []uint{4, 11, 16, 23}
-var shift4 = []uint{6, 10, 15, 21}
+import (
+ "runtime"
+ "unsafe"
+)
func _Block(dig *digest, p []byte) int {
a := dig.s[0]
c := dig.s[2]
d := dig.s[3]
n := 0
- var X [16]uint32
+ var X *[16]uint32
+ var xbuf [16]uint32
for len(p) >= _Chunk {
aa, bb, cc, dd := a, b, c, d
- j := 0
- for i := 0; i < 16; i++ {
- X[i] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
- j += 4
+ // This is a constant condition - it is not evaluated on each iteration.
+ if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" {
+ // MD5 was designed so that x86 processors can just iterate
+ // over the block data directly as uint32s, and we generate
+ // less code and run 1.3x faster if we take advantage of that.
+ // My apologies.
+ X = (*[16]uint32)(unsafe.Pointer(&p[0]))
+ } else {
+ X = &xbuf
+ j := 0
+ for i := 0; i < 16; i++ {
+ X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
+ j += 4
+ }
}
- // If this needs to be made faster in the future,
- // the usual trick is to unroll each of these
- // loops by a factor of 4; that lets you replace
- // the shift[] lookups with constants and,
- // with suitable variable renaming in each
- // unrolled body, delete the a, b, c, d = d, a, b, c
- // (or you can let the optimizer do the renaming).
- //
- // The index variables are uint so that % by a power
- // of two can be optimized easily by a compiler.
-
// Round 1.
- for i := uint(0); i < 16; i++ {
- x := i
- s := shift1[i%4]
- f := ((c ^ d) & b) ^ d
- a += f + X[x] + table[i]
- a = a<<s | a>>(32-s) + b
- a, b, c, d = d, a, b, c
- }
+
+ a += (((c ^ d) & b) ^ d) + X[0] + 3614090360
+ a = a<<7 | a>>(32-7) + b
+
+ d += (((b ^ c) & a) ^ c) + X[1] + 3905402710
+ d = d<<12 | d>>(32-12) + a
+
+ c += (((a ^ b) & d) ^ b) + X[2] + 606105819
+ c = c<<17 | c>>(32-17) + d
+
+ b += (((d ^ a) & c) ^ a) + X[3] + 3250441966
+ b = b<<22 | b>>(32-22) + c
+
+ a += (((c ^ d) & b) ^ d) + X[4] + 4118548399
+ a = a<<7 | a>>(32-7) + b
+
+ d += (((b ^ c) & a) ^ c) + X[5] + 1200080426
+ d = d<<12 | d>>(32-12) + a
+
+ c += (((a ^ b) & d) ^ b) + X[6] + 2821735955
+ c = c<<17 | c>>(32-17) + d
+
+ b += (((d ^ a) & c) ^ a) + X[7] + 4249261313
+ b = b<<22 | b>>(32-22) + c
+
+ a += (((c ^ d) & b) ^ d) + X[8] + 1770035416
+ a = a<<7 | a>>(32-7) + b
+
+ d += (((b ^ c) & a) ^ c) + X[9] + 2336552879
+ d = d<<12 | d>>(32-12) + a
+
+ c += (((a ^ b) & d) ^ b) + X[10] + 4294925233
+ c = c<<17 | c>>(32-17) + d
+
+ b += (((d ^ a) & c) ^ a) + X[11] + 2304563134
+ b = b<<22 | b>>(32-22) + c
+
+ a += (((c ^ d) & b) ^ d) + X[12] + 1804603682
+ a = a<<7 | a>>(32-7) + b
+
+ d += (((b ^ c) & a) ^ c) + X[13] + 4254626195
+ d = d<<12 | d>>(32-12) + a
+
+ c += (((a ^ b) & d) ^ b) + X[14] + 2792965006
+ c = c<<17 | c>>(32-17) + d
+
+ b += (((d ^ a) & c) ^ a) + X[15] + 1236535329
+ b = b<<22 | b>>(32-22) + c
// Round 2.
- for i := uint(0); i < 16; i++ {
- x := (1 + 5*i) % 16
- s := shift2[i%4]
- g := ((b ^ c) & d) ^ c
- a += g + X[x] + table[i+16]
- a = a<<s | a>>(32-s) + b
- a, b, c, d = d, a, b, c
- }
+
+ a += (((b ^ c) & d) ^ c) + X[(1+5*0)&15] + 4129170786
+ a = a<<5 | a>>(32-5) + b
+
+ d += (((a ^ b) & c) ^ b) + X[(1+5*1)&15] + 3225465664
+ d = d<<9 | d>>(32-9) + a
+
+ c += (((d ^ a) & b) ^ a) + X[(1+5*2)&15] + 643717713
+ c = c<<14 | c>>(32-14) + d
+
+ b += (((c ^ d) & a) ^ d) + X[(1+5*3)&15] + 3921069994
+ b = b<<20 | b>>(32-20) + c
+
+ a += (((b ^ c) & d) ^ c) + X[(1+5*4)&15] + 3593408605
+ a = a<<5 | a>>(32-5) + b
+
+ d += (((a ^ b) & c) ^ b) + X[(1+5*5)&15] + 38016083
+ d = d<<9 | d>>(32-9) + a
+
+ c += (((d ^ a) & b) ^ a) + X[(1+5*6)&15] + 3634488961
+ c = c<<14 | c>>(32-14) + d
+
+ b += (((c ^ d) & a) ^ d) + X[(1+5*7)&15] + 3889429448
+ b = b<<20 | b>>(32-20) + c
+
+ a += (((b ^ c) & d) ^ c) + X[(1+5*8)&15] + 568446438
+ a = a<<5 | a>>(32-5) + b
+
+ d += (((a ^ b) & c) ^ b) + X[(1+5*9)&15] + 3275163606
+ d = d<<9 | d>>(32-9) + a
+
+ c += (((d ^ a) & b) ^ a) + X[(1+5*10)&15] + 4107603335
+ c = c<<14 | c>>(32-14) + d
+
+ b += (((c ^ d) & a) ^ d) + X[(1+5*11)&15] + 1163531501
+ b = b<<20 | b>>(32-20) + c
+
+ a += (((b ^ c) & d) ^ c) + X[(1+5*12)&15] + 2850285829
+ a = a<<5 | a>>(32-5) + b
+
+ d += (((a ^ b) & c) ^ b) + X[(1+5*13)&15] + 4243563512
+ d = d<<9 | d>>(32-9) + a
+
+ c += (((d ^ a) & b) ^ a) + X[(1+5*14)&15] + 1735328473
+ c = c<<14 | c>>(32-14) + d
+
+ b += (((c ^ d) & a) ^ d) + X[(1+5*15)&15] + 2368359562
+ b = b<<20 | b>>(32-20) + c
// Round 3.
- for i := uint(0); i < 16; i++ {
- x := (5 + 3*i) % 16
- s := shift3[i%4]
- h := b ^ c ^ d
- a += h + X[x] + table[i+32]
- a = a<<s | a>>(32-s) + b
- a, b, c, d = d, a, b, c
- }
+
+ a += (b ^ c ^ d) + X[(5+3*0)&15] + 4294588738
+ a = a<<4 | a>>(32-4) + b
+
+ d += (a ^ b ^ c) + X[(5+3*1)&15] + 2272392833
+ d = d<<11 | d>>(32-11) + a
+
+ c += (d ^ a ^ b) + X[(5+3*2)&15] + 1839030562
+ c = c<<16 | c>>(32-16) + d
+
+ b += (c ^ d ^ a) + X[(5+3*3)&15] + 4259657740
+ b = b<<23 | b>>(32-23) + c
+
+ a += (b ^ c ^ d) + X[(5+3*4)&15] + 2763975236
+ a = a<<4 | a>>(32-4) + b
+
+ d += (a ^ b ^ c) + X[(5+3*5)&15] + 1272893353
+ d = d<<11 | d>>(32-11) + a
+
+ c += (d ^ a ^ b) + X[(5+3*6)&15] + 4139469664
+ c = c<<16 | c>>(32-16) + d
+
+ b += (c ^ d ^ a) + X[(5+3*7)&15] + 3200236656
+ b = b<<23 | b>>(32-23) + c
+
+ a += (b ^ c ^ d) + X[(5+3*8)&15] + 681279174
+ a = a<<4 | a>>(32-4) + b
+
+ d += (a ^ b ^ c) + X[(5+3*9)&15] + 3936430074
+ d = d<<11 | d>>(32-11) + a
+
+ c += (d ^ a ^ b) + X[(5+3*10)&15] + 3572445317
+ c = c<<16 | c>>(32-16) + d
+
+ b += (c ^ d ^ a) + X[(5+3*11)&15] + 76029189
+ b = b<<23 | b>>(32-23) + c
+
+ a += (b ^ c ^ d) + X[(5+3*12)&15] + 3654602809
+ a = a<<4 | a>>(32-4) + b
+
+ d += (a ^ b ^ c) + X[(5+3*13)&15] + 3873151461
+ d = d<<11 | d>>(32-11) + a
+
+ c += (d ^ a ^ b) + X[(5+3*14)&15] + 530742520
+ c = c<<16 | c>>(32-16) + d
+
+ b += (c ^ d ^ a) + X[(5+3*15)&15] + 3299628645
+ b = b<<23 | b>>(32-23) + c
// Round 4.
- for i := uint(0); i < 16; i++ {
- x := (7 * i) % 16
- s := shift4[i%4]
- j := c ^ (b | ^d)
- a += j + X[x] + table[i+48]
- a = a<<s | a>>(32-s) + b
- a, b, c, d = d, a, b, c
- }
+
+ a += (c ^ (b | ^d)) + X[(7*0)&15] + 4096336452
+ a = a<<6 | a>>(32-6) + b
+
+ d += (b ^ (a | ^c)) + X[(7*1)&15] + 1126891415
+ d = d<<10 | d>>(32-10) + a
+
+ c += (a ^ (d | ^b)) + X[(7*2)&15] + 2878612391
+ c = c<<15 | c>>(32-15) + d
+
+ b += (d ^ (c | ^a)) + X[(7*3)&15] + 4237533241
+ b = b<<21 | b>>(32-21) + c
+
+ a += (c ^ (b | ^d)) + X[(7*4)&15] + 1700485571
+ a = a<<6 | a>>(32-6) + b
+
+ d += (b ^ (a | ^c)) + X[(7*5)&15] + 2399980690
+ d = d<<10 | d>>(32-10) + a
+
+ c += (a ^ (d | ^b)) + X[(7*6)&15] + 4293915773
+ c = c<<15 | c>>(32-15) + d
+
+ b += (d ^ (c | ^a)) + X[(7*7)&15] + 2240044497
+ b = b<<21 | b>>(32-21) + c
+
+ a += (c ^ (b | ^d)) + X[(7*8)&15] + 1873313359
+ a = a<<6 | a>>(32-6) + b
+
+ d += (b ^ (a | ^c)) + X[(7*9)&15] + 4264355552
+ d = d<<10 | d>>(32-10) + a
+
+ c += (a ^ (d | ^b)) + X[(7*10)&15] + 2734768916
+ c = c<<15 | c>>(32-15) + d
+
+ b += (d ^ (c | ^a)) + X[(7*11)&15] + 1309151649
+ b = b<<21 | b>>(32-21) + c
+
+ a += (c ^ (b | ^d)) + X[(7*12)&15] + 4149444226
+ a = a<<6 | a>>(32-6) + b
+
+ d += (b ^ (a | ^c)) + X[(7*13)&15] + 3174756917
+ d = d<<10 | d>>(32-10) + a
+
+ c += (a ^ (d | ^b)) + X[(7*14)&15] + 718787259
+ c = c<<15 | c>>(32-15) + d
+
+ b += (d ^ (c | ^a)) + X[(7*15)&15] + 3951481745
+ b = b<<21 | b>>(32-21) + c
a += aa
b += bb