From 98b6febcef8f6d7411a77e9e828df681871a28ad Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Thu, 14 Apr 2016 10:28:35 -0700 Subject: [PATCH] runtime/internal/sys: better fallback algorithms for intrinsics Use deBruijn sequences to count low-order zeros. Reorg bswap to not use &^, it takes another instruction on x86. Change-Id: I4a5ed9fd16ee6a279d88c067e8a2ba11de821156 Reviewed-on: https://go-review.googlesource.com/22084 Reviewed-by: David Chase --- src/runtime/internal/sys/intrinsics.go | 135 +++++++++++--------- src/runtime/internal/sys/intrinsics_test.go | 54 ++++++++ 2 files changed, 126 insertions(+), 63 deletions(-) create mode 100644 src/runtime/internal/sys/intrinsics_test.go diff --git a/src/runtime/internal/sys/intrinsics.go b/src/runtime/internal/sys/intrinsics.go index 8feb754dbd..1054c6948f 100644 --- a/src/runtime/internal/sys/intrinsics.go +++ b/src/runtime/internal/sys/intrinsics.go @@ -4,88 +4,97 @@ package sys +// Using techniques from http://supertech.csail.mit.edu/papers/debruijn.pdf + +const deBruijn64 = 0x0218a392cd3d5dbf + +var deBruijnIdx64 = [64]byte{ + 0, 1, 2, 7, 3, 13, 8, 19, + 4, 25, 14, 28, 9, 34, 20, 40, + 5, 17, 26, 38, 15, 46, 29, 48, + 10, 31, 35, 54, 21, 50, 41, 57, + 63, 6, 12, 18, 24, 27, 33, 39, + 16, 37, 45, 47, 30, 53, 49, 56, + 62, 11, 23, 32, 36, 44, 52, 55, + 61, 22, 43, 51, 60, 42, 59, 58, +} + +const deBruijn32 = 0x04653adf + +var deBruijnIdx32 = [32]byte{ + 0, 1, 2, 6, 3, 11, 7, 16, + 4, 14, 12, 21, 8, 23, 17, 26, + 31, 5, 10, 15, 13, 20, 22, 25, + 30, 9, 19, 24, 29, 18, 28, 27, +} + +const deBruijn16 = 0x09af + +var deBruijnIdx16 = [16]byte{ + 0, 1, 2, 5, 3, 9, 6, 11, + 15, 4, 8, 10, 14, 7, 13, 12, +} + +const deBruijn8 = 0x17 + +var deBruijnIdx8 = [8]byte{ + 0, 1, 2, 4, 7, 3, 6, 5, +} + // Ctz64 counts trailing (low-order) zeroes, // and if all are zero, then 64. func Ctz64(x uint64) uint64 { - if x&0xffffffff == 0 { - return 32 + uint64(Ctz32(uint32(x>>32))) - } - return uint64(Ctz32(uint32(x))) - + x &= -x // isolate low-order bit + y := x * deBruijn64 >> 58 // extract part of deBruijn sequence + y = uint64(deBruijnIdx64[y]) // convert to bit index + z := (x - 1) >> 57 & 64 // adjustment if zero + return y + z } // Ctz32 counts trailing (low-order) zeroes, // and if all are zero, then 32. func Ctz32(x uint32) uint32 { - if x&0xffff == 0 { - return 16 + uint32(Ctz16(uint16(x>>16))) - } - return uint32(Ctz16(uint16(x))) + x &= -x // isolate low-order bit + y := x * deBruijn32 >> 27 // extract part of deBruijn sequence + y = uint32(deBruijnIdx32[y]) // convert to bit index + z := (x - 1) >> 26 & 32 // adjustment if zero + return y + z } // Ctz16 counts trailing (low-order) zeroes, // and if all are zero, then 16. func Ctz16(x uint16) uint16 { - if x&0xff == 0 { - return 8 + uint16(Ctz8(uint8(x>>8))) - } - return uint16(Ctz8(uint8(x))) + x &= -x // isolate low-order bit + y := x * deBruijn16 >> 12 // extract part of deBruijn sequence + y = uint16(deBruijnIdx16[y]) // convert to bit index + z := (x - 1) >> 11 & 16 // adjustment if zero + return y + z } // Ctz8 counts trailing (low-order) zeroes, // and if all are zero, then 8. func Ctz8(x uint8) uint8 { - return ctzVals[x] + x &= -x // isolate low-order bit + y := x * deBruijn8 >> 5 // extract part of deBruijn sequence + y = uint8(deBruijnIdx8[y]) // convert to bit index + z := (x - 1) >> 4 & 8 // adjustment if zero + return y + z } -var ctzVals = [256]uint8{ - 8, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 7, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0} - // Bswap64 returns its input with byte order reversed // 0x0102030405060708 -> 0x0807060504030201 func Bswap64(x uint64) uint64 { - c8 := uint64(0xff00ff00ff00ff00) - a := (x & c8) >> 8 - b := (x &^ c8) << 8 + c8 := uint64(0x00ff00ff00ff00ff) + a := x >> 8 & c8 + b := (x & c8) << 8 x = a | b - c16 := uint64(0xffff0000ffff0000) - a = (x & c16) >> 16 - b = (x &^ c16) << 16 + c16 := uint64(0x0000ffff0000ffff) + a = x >> 16 & c16 + b = (x & c16) << 16 x = a | b - c32 := uint64(0xffffffff00000000) - a = (x & c32) >> 32 - b = (x &^ c32) << 32 + c32 := uint64(0x00000000ffffffff) + a = x >> 32 & c32 + b = (x & c32) << 32 x = a | b return x } @@ -93,13 +102,13 @@ func Bswap64(x uint64) uint64 { // Bswap32 returns its input with byte order reversed // 0x01020304 -> 0x04030201 func Bswap32(x uint32) uint32 { - c8 := uint32(0xff00ff00) - a := (x & c8) >> 8 - b := (x &^ c8) << 8 + c8 := uint32(0x00ff00ff) + a := x >> 8 & c8 + b := (x & c8) << 8 x = a | b - c16 := uint32(0xffff0000) - a = (x & c16) >> 16 - b = (x &^ c16) << 16 + c16 := uint32(0x0000ffff) + a = x >> 16 & c16 + b = (x & c16) << 16 x = a | b return x } diff --git a/src/runtime/internal/sys/intrinsics_test.go b/src/runtime/internal/sys/intrinsics_test.go new file mode 100644 index 0000000000..097631bc1e --- /dev/null +++ b/src/runtime/internal/sys/intrinsics_test.go @@ -0,0 +1,54 @@ +package sys_test + +import ( + "runtime/internal/sys" + "testing" +) + +func TestCtz64(t *testing.T) { + for i := uint(0); i <= 64; i++ { + x := uint64(5) << i + if got := sys.Ctz64(x); got != uint64(i) { + t.Errorf("Ctz64(%d)=%d, want %d", x, got, i) + } + } +} +func TestCtz32(t *testing.T) { + for i := uint(0); i <= 32; i++ { + x := uint32(5) << i + if got := sys.Ctz32(x); got != uint32(i) { + t.Errorf("Ctz32(%d)=%d, want %d", x, got, i) + } + } +} +func TestCtz16(t *testing.T) { + for i := uint(0); i <= 16; i++ { + x := uint16(5) << i + if got := sys.Ctz16(x); got != uint16(i) { + t.Errorf("Ctz16(%d)=%d, want %d", x, got, i) + } + } +} +func TestCtz8(t *testing.T) { + for i := uint(0); i <= 8; i++ { + x := uint8(5) << i + if got := sys.Ctz8(x); got != uint8(i) { + t.Errorf("Ctz8(%d)=%d, want %d", x, got, i) + } + } +} + +func TestBswap64(t *testing.T) { + x := uint64(0x1122334455667788) + y := sys.Bswap64(x) + if y != 0x8877665544332211 { + t.Errorf("Bswap(%x)=%x, want 0x8877665544332211", x, y) + } +} +func TestBswap32(t *testing.T) { + x := uint32(0x11223344) + y := sys.Bswap32(x) + if y != 0x44332211 { + t.Errorf("Bswap(%x)=%x, want 0x44332211", x, y) + } +} -- 2.48.1