]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/md5: faster inner loop, 3x faster overall
authorRuss Cox <rsc@golang.org>
Tue, 22 May 2012 17:53:27 +0000 (13:53 -0400)
committerRuss Cox <rsc@golang.org>
Tue, 22 May 2012 17:53:27 +0000 (13:53 -0400)
The speedup is a combination of unrolling/specializing
the actual code and also making the compiler generate better code.

Go 1.0.1 (size: 1239 code + 320 data = 1559 total)
md5.BenchmarkHash1K   1000000    7178 ns/op  142.64 MB/s
md5.BenchmarkHash8K    200000   56834 ns/op  144.14 MB/s

Partial unroll  (size: 1115 code + 256 data = 1371 total)
md5.BenchmarkHash1K   5000000    2513 ns/op  407.37 MB/s
md5.BenchmarkHash8K    500000   19406 ns/op  422.13 MB/s

Complete unroll  (size: 1900 code + 0 data = 1900 code)
md5.BenchmarkHash1K   5000000    2442 ns/op  419.18 MB/s
md5.BenchmarkHash8K    500000   18957 ns/op  432.13 MB/s

Comparing Go 1.0.1 and the complete unroll (this CL):

benchmark               old MB/s     new MB/s  speedup
md5.BenchmarkHash1K       142.64       419.18    2.94x
md5.BenchmarkHash8K       144.14       432.13    3.00x

On the same machine, 'openssl speed md5' reports 441 MB/s
and 531 MB/s for our two cases, so this CL is at 90% and 80% of
those speeds, which is at least in the right ballpark.
OpenSSL is using carefully engineered assembly, so we are
unlikely to catch up completely.

Measurements on a Mid-2010 MacPro5,1.

R=golang-dev, bradfitz, agl
CC=golang-dev
https://golang.org/cl/6220046

src/pkg/crypto/md5/gen.go [new file with mode: 0644]
src/pkg/crypto/md5/md5_test.go
src/pkg/crypto/md5/md5block.go

diff --git a/src/pkg/crypto/md5/gen.go b/src/pkg/crypto/md5/gen.go
new file mode 100644 (file)
index 0000000..ffa43a3
--- /dev/null
@@ -0,0 +1,301 @@
+// Copyright 2012 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// This program generates md5block.go
+// Invoke as
+//
+//     go run gen.go [-full] |gofmt >md5block.go
+//
+// The -full flag causes the generated code to do a full
+// (16x) unrolling instead of a 4x unrolling.
+
+package main
+
+import (
+       "flag"
+       "log"
+       "os"
+       "strings"
+       "text/template"
+)
+
+func main() {
+       flag.Parse()
+
+       t := template.Must(template.New("main").Funcs(funcs).Parse(program))
+       if err := t.Execute(os.Stdout, data); err != nil {
+               log.Fatal(err)
+       }
+}
+
+type Data struct {
+       a, b, c, d string
+       Shift1     []int
+       Shift2     []int
+       Shift3     []int
+       Shift4     []int
+       Table1     []uint32
+       Table2     []uint32
+       Table3     []uint32
+       Table4     []uint32
+       Full       bool
+}
+
+var funcs = template.FuncMap{
+       "dup":     dup,
+       "relabel": relabel,
+       "rotate":  rotate,
+}
+
+func dup(count int, x []int) []int {
+       var out []int
+       for i := 0; i < count; i++ {
+               out = append(out, x...)
+       }
+       return out
+}
+
+func relabel(s string) string {
+       return strings.NewReplacer("a", data.a, "b", data.b, "c", data.c, "d", data.d).Replace(s)
+}
+
+func rotate() string {
+       data.a, data.b, data.c, data.d = data.d, data.a, data.b, data.c
+       return "" // no output
+}
+
+func init() {
+       flag.BoolVar(&data.Full, "full", false, "complete unrolling")
+}
+
+var data = Data{
+       a:      "a",
+       b:      "b",
+       c:      "c",
+       d:      "d",
+       Shift1: []int{7, 12, 17, 22},
+       Shift2: []int{5, 9, 14, 20},
+       Shift3: []int{4, 11, 16, 23},
+       Shift4: []int{6, 10, 15, 21},
+
+       // table[i] = int((1<<32) * abs(sin(i+1 radians))).
+       Table1: []uint32{
+               // round 1
+               0xd76aa478,
+               0xe8c7b756,
+               0x242070db,
+               0xc1bdceee,
+               0xf57c0faf,
+               0x4787c62a,
+               0xa8304613,
+               0xfd469501,
+               0x698098d8,
+               0x8b44f7af,
+               0xffff5bb1,
+               0x895cd7be,
+               0x6b901122,
+               0xfd987193,
+               0xa679438e,
+               0x49b40821,
+       },
+       Table2: []uint32{
+               // round 2
+               0xf61e2562,
+               0xc040b340,
+               0x265e5a51,
+               0xe9b6c7aa,
+               0xd62f105d,
+               0x2441453,
+               0xd8a1e681,
+               0xe7d3fbc8,
+               0x21e1cde6,
+               0xc33707d6,
+               0xf4d50d87,
+               0x455a14ed,
+               0xa9e3e905,
+               0xfcefa3f8,
+               0x676f02d9,
+               0x8d2a4c8a,
+       },
+       Table3: []uint32{
+               // round3
+               0xfffa3942,
+               0x8771f681,
+               0x6d9d6122,
+               0xfde5380c,
+               0xa4beea44,
+               0x4bdecfa9,
+               0xf6bb4b60,
+               0xbebfbc70,
+               0x289b7ec6,
+               0xeaa127fa,
+               0xd4ef3085,
+               0x4881d05,
+               0xd9d4d039,
+               0xe6db99e5,
+               0x1fa27cf8,
+               0xc4ac5665,
+       },
+       Table4: []uint32{
+               // round 4
+               0xf4292244,
+               0x432aff97,
+               0xab9423a7,
+               0xfc93a039,
+               0x655b59c3,
+               0x8f0ccc92,
+               0xffeff47d,
+               0x85845dd1,
+               0x6fa87e4f,
+               0xfe2ce6e0,
+               0xa3014314,
+               0x4e0811a1,
+               0xf7537e82,
+               0xbd3af235,
+               0x2ad7d2bb,
+               0xeb86d391,
+       },
+}
+
+var program = `
+package md5
+
+import (
+       "unsafe"
+       "runtime"
+)
+
+{{if not .Full}}
+       var t1 = [...]uint32{
+       {{range .Table1}}{{printf "\t%#x,\n" .}}{{end}}
+       }
+       
+       var t2 = [...]uint32{
+       {{range .Table2}}{{printf "\t%#x,\n" .}}{{end}}
+       }
+       
+       var t3 = [...]uint32{
+       {{range .Table3}}{{printf "\t%#x,\n" .}}{{end}}
+       }
+       
+       var t4 = [...]uint32{
+       {{range .Table4}}{{printf "\t%#x,\n" .}}{{end}}
+       }
+{{end}}
+
+func _Block(dig *digest, p []byte) int {
+       a := dig.s[0]
+       b := dig.s[1]
+       c := dig.s[2]
+       d := dig.s[3]
+       n := 0
+       var X *[16]uint32
+       var xbuf [16]uint32
+       for len(p) >= _Chunk {
+               aa, bb, cc, dd := a, b, c, d
+
+               // This is a constant condition - it is not evaluated on each iteration.
+               if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" {
+                       // MD5 was designed so that x86 processors can just iterate
+                       // over the block data directly as uint32s, and we generate
+                       // less code and run 1.3x faster if we take advantage of that.
+                       // My apologies.
+                       X = (*[16]uint32)(unsafe.Pointer(&p[0]))
+               } else {
+                       X = &xbuf
+                       j := 0
+                       for i := 0; i < 16; i++ {
+                               X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
+                               j += 4
+                       }
+               }
+
+               {{if .Full}}
+                       // Round 1.
+                       {{range $i, $s := dup 4 .Shift1}}
+                               {{index $.Table1 $i | printf "a += (((c^d)&b)^d) + X[%d] + %d" $i | relabel}}
+                               {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+                               {{rotate}}
+                       {{end}}
+       
+                       // Round 2.
+                       {{range $i, $s := dup 4 .Shift2}}
+                               {{index $.Table2 $i | printf "a += (((b^c)&d)^c) + X[(1+5*%d)&15] + %d" $i | relabel}}
+                               {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+                               {{rotate}}
+                       {{end}}
+       
+                       // Round 3.
+                       {{range $i, $s := dup 4 .Shift3}}
+                               {{index $.Table3 $i | printf "a += (b^c^d) + X[(5+3*%d)&15] + %d" $i | relabel}}
+                               {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+                               {{rotate}}
+                       {{end}}
+       
+                       // Round 4.
+                       {{range $i, $s := dup 4 .Shift4}}
+                               {{index $.Table4 $i | printf "a += (c^(b|^d)) + X[(7*%d)&15] + %d" $i | relabel}}
+                               {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+                               {{rotate}}
+                       {{end}}
+               {{else}}
+                       // Round 1.
+                       for i := uint(0); i < 16; {
+                               {{range $s := .Shift1}}
+                                       {{printf "a += (((c^d)&b)^d) + X[i&15] + t1[i&15]" | relabel}}
+                                       {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+                                       i++
+                                       {{rotate}}
+                               {{end}}
+                       }
+       
+                       // Round 2.
+                       for i := uint(0); i < 16; {
+                               {{range $s := .Shift2}}
+                                       {{printf "a += (((b^c)&d)^c) + X[(1+5*i)&15] + t2[i&15]" | relabel}}
+                                       {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+                                       i++
+                                       {{rotate}}
+                               {{end}}
+                       }
+       
+                       // Round 3.
+                       for i := uint(0); i < 16; {
+                               {{range $s := .Shift3}}
+                                       {{printf "a += (b^c^d) + X[(5+3*i)&15] + t3[i&15]" | relabel}}
+                                       {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+                                       i++
+                                       {{rotate}}
+                               {{end}}
+                       }
+       
+                       // Round 4.
+                       for i := uint(0); i < 16; {
+                               {{range $s := .Shift4}}
+                                       {{printf "a += (c^(b|^d)) + X[(7*i)&15] + t4[i&15]" | relabel}}
+                                       {{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+                                       i++
+                                       {{rotate}}
+                               {{end}}
+                       }
+               {{end}}
+
+               a += aa
+               b += bb
+               c += cc
+               d += dd
+
+               p = p[_Chunk:]
+               n += _Chunk
+       }
+
+       dig.s[0] = a
+       dig.s[1] = b
+       dig.s[2] = c
+       dig.s[3] = d
+       return n
+}
+`
index aae875464f92b486f81ca9463f7cffc537933a2a..b474a90d5a3c4a341c17b1de1d8101f623950860 100644 (file)
@@ -78,3 +78,28 @@ func ExampleNew() {
        fmt.Printf("%x", h.Sum(nil))
        // Output: e2c569be17396eca2a2e3c11578123ed
 }
+
+var bench = md5.New()
+var buf = makeBuf()
+
+func makeBuf() []byte {
+       b := make([]byte, 8<<10)
+       for i := range b {
+               b[i] = byte(i)
+       }
+       return b
+}
+
+func BenchmarkHash1K(b *testing.B) {
+       b.SetBytes(1024)
+       for i := 0; i < b.N; i++ {
+               bench.Write(buf[:1024])
+       }
+}
+
+func BenchmarkHash8K(b *testing.B) {
+       b.SetBytes(int64(len(buf)))
+       for i := 0; i < b.N; i++ {
+               bench.Write(buf)
+       }
+}
index a887e2e05ed5402377658f45af46c6c8a5504e31..51758272c5138c393c7bc918e942bbef6976f7e0 100644 (file)
@@ -1,92 +1,9 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// MD5 block step.
-// In its own file so that a faster assembly or C version
-// can be substituted easily.
-
 package md5
 
-// table[i] = int((1<<32) * abs(sin(i+1 radians))).
-var table = []uint32{
-       // round 1
-       0xd76aa478,
-       0xe8c7b756,
-       0x242070db,
-       0xc1bdceee,
-       0xf57c0faf,
-       0x4787c62a,
-       0xa8304613,
-       0xfd469501,
-       0x698098d8,
-       0x8b44f7af,
-       0xffff5bb1,
-       0x895cd7be,
-       0x6b901122,
-       0xfd987193,
-       0xa679438e,
-       0x49b40821,
-
-       // round 2
-       0xf61e2562,
-       0xc040b340,
-       0x265e5a51,
-       0xe9b6c7aa,
-       0xd62f105d,
-       0x2441453,
-       0xd8a1e681,
-       0xe7d3fbc8,
-       0x21e1cde6,
-       0xc33707d6,
-       0xf4d50d87,
-       0x455a14ed,
-       0xa9e3e905,
-       0xfcefa3f8,
-       0x676f02d9,
-       0x8d2a4c8a,
-
-       // round3
-       0xfffa3942,
-       0x8771f681,
-       0x6d9d6122,
-       0xfde5380c,
-       0xa4beea44,
-       0x4bdecfa9,
-       0xf6bb4b60,
-       0xbebfbc70,
-       0x289b7ec6,
-       0xeaa127fa,
-       0xd4ef3085,
-       0x4881d05,
-       0xd9d4d039,
-       0xe6db99e5,
-       0x1fa27cf8,
-       0xc4ac5665,
-
-       // round 4
-       0xf4292244,
-       0x432aff97,
-       0xab9423a7,
-       0xfc93a039,
-       0x655b59c3,
-       0x8f0ccc92,
-       0xffeff47d,
-       0x85845dd1,
-       0x6fa87e4f,
-       0xfe2ce6e0,
-       0xa3014314,
-       0x4e0811a1,
-       0xf7537e82,
-       0xbd3af235,
-       0x2ad7d2bb,
-       0xeb86d391,
-}
-
-var shift1 = []uint{7, 12, 17, 22}
-var shift2 = []uint{5, 9, 14, 20}
-var shift3 = []uint{4, 11, 16, 23}
-var shift4 = []uint{6, 10, 15, 21}
+import (
+       "runtime"
+       "unsafe"
+)
 
 func _Block(dig *digest, p []byte) int {
        a := dig.s[0]
@@ -94,66 +11,226 @@ func _Block(dig *digest, p []byte) int {
        c := dig.s[2]
        d := dig.s[3]
        n := 0
-       var X [16]uint32
+       var X *[16]uint32
+       var xbuf [16]uint32
        for len(p) >= _Chunk {
                aa, bb, cc, dd := a, b, c, d
 
-               j := 0
-               for i := 0; i < 16; i++ {
-                       X[i] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
-                       j += 4
+               // This is a constant condition - it is not evaluated on each iteration.
+               if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" {
+                       // MD5 was designed so that x86 processors can just iterate
+                       // over the block data directly as uint32s, and we generate
+                       // less code and run 1.3x faster if we take advantage of that.
+                       // My apologies.
+                       X = (*[16]uint32)(unsafe.Pointer(&p[0]))
+               } else {
+                       X = &xbuf
+                       j := 0
+                       for i := 0; i < 16; i++ {
+                               X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
+                               j += 4
+                       }
                }
 
-               // If this needs to be made faster in the future,
-               // the usual trick is to unroll each of these
-               // loops by a factor of 4; that lets you replace
-               // the shift[] lookups with constants and,
-               // with suitable variable renaming in each
-               // unrolled body, delete the a, b, c, d = d, a, b, c
-               // (or you can let the optimizer do the renaming).
-               //
-               // The index variables are uint so that % by a power
-               // of two can be optimized easily by a compiler.
-
                // Round 1.
-               for i := uint(0); i < 16; i++ {
-                       x := i
-                       s := shift1[i%4]
-                       f := ((c ^ d) & b) ^ d
-                       a += f + X[x] + table[i]
-                       a = a<<s | a>>(32-s) + b
-                       a, b, c, d = d, a, b, c
-               }
+
+               a += (((c ^ d) & b) ^ d) + X[0] + 3614090360
+               a = a<<7 | a>>(32-7) + b
+
+               d += (((b ^ c) & a) ^ c) + X[1] + 3905402710
+               d = d<<12 | d>>(32-12) + a
+
+               c += (((a ^ b) & d) ^ b) + X[2] + 606105819
+               c = c<<17 | c>>(32-17) + d
+
+               b += (((d ^ a) & c) ^ a) + X[3] + 3250441966
+               b = b<<22 | b>>(32-22) + c
+
+               a += (((c ^ d) & b) ^ d) + X[4] + 4118548399
+               a = a<<7 | a>>(32-7) + b
+
+               d += (((b ^ c) & a) ^ c) + X[5] + 1200080426
+               d = d<<12 | d>>(32-12) + a
+
+               c += (((a ^ b) & d) ^ b) + X[6] + 2821735955
+               c = c<<17 | c>>(32-17) + d
+
+               b += (((d ^ a) & c) ^ a) + X[7] + 4249261313
+               b = b<<22 | b>>(32-22) + c
+
+               a += (((c ^ d) & b) ^ d) + X[8] + 1770035416
+               a = a<<7 | a>>(32-7) + b
+
+               d += (((b ^ c) & a) ^ c) + X[9] + 2336552879
+               d = d<<12 | d>>(32-12) + a
+
+               c += (((a ^ b) & d) ^ b) + X[10] + 4294925233
+               c = c<<17 | c>>(32-17) + d
+
+               b += (((d ^ a) & c) ^ a) + X[11] + 2304563134
+               b = b<<22 | b>>(32-22) + c
+
+               a += (((c ^ d) & b) ^ d) + X[12] + 1804603682
+               a = a<<7 | a>>(32-7) + b
+
+               d += (((b ^ c) & a) ^ c) + X[13] + 4254626195
+               d = d<<12 | d>>(32-12) + a
+
+               c += (((a ^ b) & d) ^ b) + X[14] + 2792965006
+               c = c<<17 | c>>(32-17) + d
+
+               b += (((d ^ a) & c) ^ a) + X[15] + 1236535329
+               b = b<<22 | b>>(32-22) + c
 
                // Round 2.
-               for i := uint(0); i < 16; i++ {
-                       x := (1 + 5*i) % 16
-                       s := shift2[i%4]
-                       g := ((b ^ c) & d) ^ c
-                       a += g + X[x] + table[i+16]
-                       a = a<<s | a>>(32-s) + b
-                       a, b, c, d = d, a, b, c
-               }
+
+               a += (((b ^ c) & d) ^ c) + X[(1+5*0)&15] + 4129170786
+               a = a<<5 | a>>(32-5) + b
+
+               d += (((a ^ b) & c) ^ b) + X[(1+5*1)&15] + 3225465664
+               d = d<<9 | d>>(32-9) + a
+
+               c += (((d ^ a) & b) ^ a) + X[(1+5*2)&15] + 643717713
+               c = c<<14 | c>>(32-14) + d
+
+               b += (((c ^ d) & a) ^ d) + X[(1+5*3)&15] + 3921069994
+               b = b<<20 | b>>(32-20) + c
+
+               a += (((b ^ c) & d) ^ c) + X[(1+5*4)&15] + 3593408605
+               a = a<<5 | a>>(32-5) + b
+
+               d += (((a ^ b) & c) ^ b) + X[(1+5*5)&15] + 38016083
+               d = d<<9 | d>>(32-9) + a
+
+               c += (((d ^ a) & b) ^ a) + X[(1+5*6)&15] + 3634488961
+               c = c<<14 | c>>(32-14) + d
+
+               b += (((c ^ d) & a) ^ d) + X[(1+5*7)&15] + 3889429448
+               b = b<<20 | b>>(32-20) + c
+
+               a += (((b ^ c) & d) ^ c) + X[(1+5*8)&15] + 568446438
+               a = a<<5 | a>>(32-5) + b
+
+               d += (((a ^ b) & c) ^ b) + X[(1+5*9)&15] + 3275163606
+               d = d<<9 | d>>(32-9) + a
+
+               c += (((d ^ a) & b) ^ a) + X[(1+5*10)&15] + 4107603335
+               c = c<<14 | c>>(32-14) + d
+
+               b += (((c ^ d) & a) ^ d) + X[(1+5*11)&15] + 1163531501
+               b = b<<20 | b>>(32-20) + c
+
+               a += (((b ^ c) & d) ^ c) + X[(1+5*12)&15] + 2850285829
+               a = a<<5 | a>>(32-5) + b
+
+               d += (((a ^ b) & c) ^ b) + X[(1+5*13)&15] + 4243563512
+               d = d<<9 | d>>(32-9) + a
+
+               c += (((d ^ a) & b) ^ a) + X[(1+5*14)&15] + 1735328473
+               c = c<<14 | c>>(32-14) + d
+
+               b += (((c ^ d) & a) ^ d) + X[(1+5*15)&15] + 2368359562
+               b = b<<20 | b>>(32-20) + c
 
                // Round 3.
-               for i := uint(0); i < 16; i++ {
-                       x := (5 + 3*i) % 16
-                       s := shift3[i%4]
-                       h := b ^ c ^ d
-                       a += h + X[x] + table[i+32]
-                       a = a<<s | a>>(32-s) + b
-                       a, b, c, d = d, a, b, c
-               }
+
+               a += (b ^ c ^ d) + X[(5+3*0)&15] + 4294588738
+               a = a<<4 | a>>(32-4) + b
+
+               d += (a ^ b ^ c) + X[(5+3*1)&15] + 2272392833
+               d = d<<11 | d>>(32-11) + a
+
+               c += (d ^ a ^ b) + X[(5+3*2)&15] + 1839030562
+               c = c<<16 | c>>(32-16) + d
+
+               b += (c ^ d ^ a) + X[(5+3*3)&15] + 4259657740
+               b = b<<23 | b>>(32-23) + c
+
+               a += (b ^ c ^ d) + X[(5+3*4)&15] + 2763975236
+               a = a<<4 | a>>(32-4) + b
+
+               d += (a ^ b ^ c) + X[(5+3*5)&15] + 1272893353
+               d = d<<11 | d>>(32-11) + a
+
+               c += (d ^ a ^ b) + X[(5+3*6)&15] + 4139469664
+               c = c<<16 | c>>(32-16) + d
+
+               b += (c ^ d ^ a) + X[(5+3*7)&15] + 3200236656
+               b = b<<23 | b>>(32-23) + c
+
+               a += (b ^ c ^ d) + X[(5+3*8)&15] + 681279174
+               a = a<<4 | a>>(32-4) + b
+
+               d += (a ^ b ^ c) + X[(5+3*9)&15] + 3936430074
+               d = d<<11 | d>>(32-11) + a
+
+               c += (d ^ a ^ b) + X[(5+3*10)&15] + 3572445317
+               c = c<<16 | c>>(32-16) + d
+
+               b += (c ^ d ^ a) + X[(5+3*11)&15] + 76029189
+               b = b<<23 | b>>(32-23) + c
+
+               a += (b ^ c ^ d) + X[(5+3*12)&15] + 3654602809
+               a = a<<4 | a>>(32-4) + b
+
+               d += (a ^ b ^ c) + X[(5+3*13)&15] + 3873151461
+               d = d<<11 | d>>(32-11) + a
+
+               c += (d ^ a ^ b) + X[(5+3*14)&15] + 530742520
+               c = c<<16 | c>>(32-16) + d
+
+               b += (c ^ d ^ a) + X[(5+3*15)&15] + 3299628645
+               b = b<<23 | b>>(32-23) + c
 
                // Round 4.
-               for i := uint(0); i < 16; i++ {
-                       x := (7 * i) % 16
-                       s := shift4[i%4]
-                       j := c ^ (b | ^d)
-                       a += j + X[x] + table[i+48]
-                       a = a<<s | a>>(32-s) + b
-                       a, b, c, d = d, a, b, c
-               }
+
+               a += (c ^ (b | ^d)) + X[(7*0)&15] + 4096336452
+               a = a<<6 | a>>(32-6) + b
+
+               d += (b ^ (a | ^c)) + X[(7*1)&15] + 1126891415
+               d = d<<10 | d>>(32-10) + a
+
+               c += (a ^ (d | ^b)) + X[(7*2)&15] + 2878612391
+               c = c<<15 | c>>(32-15) + d
+
+               b += (d ^ (c | ^a)) + X[(7*3)&15] + 4237533241
+               b = b<<21 | b>>(32-21) + c
+
+               a += (c ^ (b | ^d)) + X[(7*4)&15] + 1700485571
+               a = a<<6 | a>>(32-6) + b
+
+               d += (b ^ (a | ^c)) + X[(7*5)&15] + 2399980690
+               d = d<<10 | d>>(32-10) + a
+
+               c += (a ^ (d | ^b)) + X[(7*6)&15] + 4293915773
+               c = c<<15 | c>>(32-15) + d
+
+               b += (d ^ (c | ^a)) + X[(7*7)&15] + 2240044497
+               b = b<<21 | b>>(32-21) + c
+
+               a += (c ^ (b | ^d)) + X[(7*8)&15] + 1873313359
+               a = a<<6 | a>>(32-6) + b
+
+               d += (b ^ (a | ^c)) + X[(7*9)&15] + 4264355552
+               d = d<<10 | d>>(32-10) + a
+
+               c += (a ^ (d | ^b)) + X[(7*10)&15] + 2734768916
+               c = c<<15 | c>>(32-15) + d
+
+               b += (d ^ (c | ^a)) + X[(7*11)&15] + 1309151649
+               b = b<<21 | b>>(32-21) + c
+
+               a += (c ^ (b | ^d)) + X[(7*12)&15] + 4149444226
+               a = a<<6 | a>>(32-6) + b
+
+               d += (b ^ (a | ^c)) + X[(7*13)&15] + 3174756917
+               d = d<<10 | d>>(32-10) + a
+
+               c += (a ^ (d | ^b)) + X[(7*14)&15] + 718787259
+               c = c<<15 | c>>(32-15) + d
+
+               b += (d ^ (c | ^a)) + X[(7*15)&15] + 3951481745
+               b = b<<21 | b>>(32-21) + c
 
                a += aa
                b += bb