Roughly 2x speedup for the internal bitLen function in arith.go. Added a TestWordBitLen test.

Performance differences against the new generic version of bitLen:
x86-64 MacBook Pro (current tip):
benchmark                 old ns/op    new ns/op    delta
big.BenchmarkBitLen0      6            4            -37.40%
big.BenchmarkBitLen1      6            2            -51.79%
big.BenchmarkBitLen2      6            2            -65.04%
big.BenchmarkBitLen3      6            2            -66.10%
big.BenchmarkBitLen4      6            2            -60.96%
big.BenchmarkBitLen5      6            2            -55.80%
big.BenchmarkBitLen8      6            2            -56.19%
big.BenchmarkBitLen9      6            2            -64.73%
big.BenchmarkBitLen16     7            2            -68.84%
big.BenchmarkBitLen17     6            2            -67.11%
big.BenchmarkBitLen31     7            2            -61.57%
386 Intel Atom (current tip):
benchmark                 old ns/op    new ns/op    delta
big.BenchmarkBitLen0      23           20           -13.04%
big.BenchmarkBitLen1      23           20           -14.77%
big.BenchmarkBitLen2      24           20           -19.28%
big.BenchmarkBitLen3      25           20           -21.57%
big.BenchmarkBitLen4      24           20           -16.94%
big.BenchmarkBitLen5      25           20           -20.78%
big.BenchmarkBitLen8      24           20           -19.28%
big.BenchmarkBitLen9      25           20           -20.47%
big.BenchmarkBitLen16     26           20           -23.37%
big.BenchmarkBitLen17     26           20           -25.09%
big.BenchmarkBitLen31     32           20           -35.51%
ARM v5 SheevaPlug, previous weekly patched with bitLen:
benchmark                 old ns/op    new ns/op    delta
big.BenchmarkBitLen0      50           29           -41.73%
big.BenchmarkBitLen1      51           29           -42.75%
big.BenchmarkBitLen2      59           29           -50.08%
big.BenchmarkBitLen3      60           29           -50.75%
big.BenchmarkBitLen4      59           29           -50.08%
big.BenchmarkBitLen5      60           29           -50.75%
big.BenchmarkBitLen8      59           29           -50.08%
big.BenchmarkBitLen9      60           29           -50.75%
big.BenchmarkBitLen16     69           29           -57.35%
big.BenchmarkBitLen17     70           29           -57.89%
big.BenchmarkBitLen31     95           29           -69.07%
R=golang-dev, minux.ma, gri
CC=golang-dev
https://golang.org/cl/5574054
}
// Length of x in bits.
-func bitLen(x Word) (n int) {
+func bitLen_g(x Word) (n int) {
	for ; x >= 0x8000; x >>= 16 {
		n += 16
	}
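
bitLen reports the number of bits needed to represent x (bitLen(0) == 0, bitLen(1) == 1, bitLen(0xFF) == 8). The hunk above shows only the first step of the renamed generic fallback; the rest of the cascade looks roughly like the following sketch of the 16/8/4/2/1 halving pattern, not necessarily the exact file contents:

func bitLen_g(x Word) (n int) {
	// Narrow in on the highest set bit by testing progressively
	// smaller shift widths: 16, 8, 4, 2, then 1 bit.
	for ; x >= 0x8000; x >>= 16 {
		n += 16
	}
	if x >= 0x80 {
		x >>= 8
		n += 8
	}
	if x >= 0x8 {
		x >>= 4
		n += 4
	}
	if x >= 0x2 {
		x >>= 2
		n += 2
	}
	if x >= 0x1 {
		n++
	}
	return
}
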
	RET
-// divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
+// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),7,$0
	MOVL z+0(FP), DI
	MOVL xn+12(FP), DX // r = xn
	MOVL DX, r+32(FP)
	RET
+
+// func bitLen(x Word) (n int)
+TEXT ·bitLen(SB),7,$0
+ BSRL x+0(FP), AX
+ JZ Z1
+ INCL AX
+ MOVL AX, n+4(FP)
+ RET
+
+Z1: MOVL $0, n+4(FP)
+ RET
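
The 386 stub above (and the amd64 stub below) leans on BSR, which yields the index of the highest set bit and sets the zero flag when the input is zero; the bit length is that index plus one, with the JZ branch covering the zero case. A rough Go equivalent of what the stubs compute, for illustration only:

func bitLenBSR(x Word) int {
	if x == 0 {
		return 0 // BSR sets ZF on a zero input; the stub jumps to Z1
	}
	i := 0 // index of the highest set bit, as BSR would report it
	for x > 1 {
		x >>= 1
		i++
	}
	return i + 1 // bit length = highest-bit index + 1
}
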
	RET
-// divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
+// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),7,$0
	MOVQ z+0(FP), R10
	MOVQ xn+16(FP), DX // r = xn
	MOVQ DX, r+48(FP)
	RET
+
+// func bitLen(x Word) (n int)
+TEXT ·bitLen(SB),7,$0
+ BSRQ x+0(FP), AX
+ JZ Z1
+ INCQ AX
+ MOVQ AX, n+8(FP)
+ RET
+
+Z1: MOVQ $0, n+8(FP)
+ RET
	RET
-// divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
+// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),7,$0
	// ARM has no multiword division, so use portable code.
	B ·divWVW_g(SB)
	MOVW R4, z1+8(FP)
	MOVW R3, z0+12(FP)
	RET
+
+// func bitLen(x Word) (n int)
+TEXT ·bitLen(SB),7,$0
+ MOVW x+0(FP), R0
+ WORD $0xe16f0f10 // CLZ R0, R0 (count leading zeros)
+ MOVW $32, R1
+ SUB.S R0, R1
+ MOVW R1, n+4(FP)
+ RET
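
The ARM stub instead computes n = 32 - CLZ(x); since CLZ of zero is 32, the zero case falls out of the arithmetic and no branch is needed. A rough Go equivalent, assuming the 32-bit words used on ARM (illustration only):

func bitLenCLZ(x uint32) int {
	clz := 0 // count of leading zero bits, as CLZ would report it
	for mask := uint32(1) << 31; mask != 0 && x&mask == 0; mask >>= 1 {
		clz++
	}
	return 32 - clz // CLZ(0) == 32, so bitLenCLZ(0) == 0
}
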
func mulAddVWW(z, x []Word, y, r Word) (c Word)
func addMulVVW(z, x []Word, y Word) (c Word)
func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
+func bitLen(x Word) (n int)
}
}
+func TestWordBitLen(t *testing.T) {
+ // Test every possible output of bitLen with the high bit set
+ // and then with all bits below max set
+ z := bitLen(0)
+ if z != 0 {
+ t.Errorf("0 got %d want 0", z)
+ }
+ x := Word(1) // Will be ...00010000...
+ y := Word(1) // Will be ...00011111...
+ for i := 1; i <= _W; i++ {
+ z = bitLen(x)
+ if z != i {
+ t.Errorf("%x got %d want %d", x, z, i)
+ }
+ z = bitLen(y)
+ if z != i {
+ t.Errorf("%x got %d want %d", y, z, i)
+ }
+ x <<= 1
+ y = (y << 1) | 0x1
+ }
+}
+
// runs b.N iterations of bitLen called on a Word containing (1 << nbits)-1.
func benchmarkBitLenN(b *testing.B, nbits uint) {
testword := Word((uint64(1) << nbits) - 1)