import (
"errors"
"io"
+ "math"
"math/rand"
+ "sync"
)
// An unsigned integer x of the form
// special cases
switch {
- case b < 2 || b > 256:
+ case b < 2 || MaxBase < b:
panic("illegal base")
case len(x) == 0:
return string(charset[0])
}
// allocate buffer for conversion
- i := x.bitLen()/log2(b) + 1 // +1: round up
+ i := int(float64(x.bitLen())/math.Log2(float64(b))) + 1 // off by one at most
s := make([]byte, i)
- // special case: power of two bases can avoid divisions completely
+ // convert power of two and non power of two bases separately
if b == b&-b {
// shift is base-b digit size in bits
shift := uint(trailingZeroBits(b)) // shift > 0 because b >= 2
w >>= shift
nbits -= shift
}
+ } else {
+ // determine "big base" as in 10^19 for 19 decimal digits in a 64 bit Word
+ bb := Word(1) // big base is b**ndigits
+ ndigits := 0 // number of base b digits
+ for max := Word(_M / b); bb <= max; bb *= b {
+ ndigits++ // maximize ndigits where bb = b**ndigits, bb <= _M
+ }
- return string(s[i:])
- }
+ // construct table of successive squares of bb*leafSize to use in subdivisions
+ table := divisors(len(x), b, ndigits, bb)
- // general case: extract groups of digits by multiprecision division
+ // preserve x, create local copy for use in divisions
+ q := nat(nil).set(x)
- // maximize ndigits where b**ndigits < 2^_W; bb (big base) is b**ndigits
- bb := Word(1)
- ndigits := 0
- for max := Word(_M / b); bb <= max; bb *= b {
- ndigits++
+ // convert q to string s in base b with index of MSD indicated by return value
+ i = q.convertWords(0, i, s, charset, b, ndigits, bb, table)
}
- // preserve x, create local copy for use in repeated divisions
- q := nat(nil).set(x)
- var r Word
+ return string(s[i:])
+}
+
+// Convert words of q to base b digits in s directly using iterated nat/Word divison to extract
+// low-order Words and indirectly by recursive subdivision and nat/nat division by tabulated
+// divisors.
+//
+// The direct method processes n Words by n divW() calls, each of which visits every Word in the
+// incrementally shortened q for a total of n + (n-1) + (n-2) ... + 2 + 1, or n(n+1)/2 divW()'s.
+// Indirect conversion divides q by its approximate square root, yielding two parts, each half
+// the size of q. Using the direct method on both halves means 2 * (n/2)(n/2 + 1)/2 divW()'s plus
+// the expensive long div(). Asymptotically, the ratio is favorable at 1/2 the divW()'s, and is
+// made better by splitting the subblocks recursively. Best is to split blocks until one more
+// split would take longer (because of the nat/nat div()) than the twice as many divW()'s of the
+// direct approach. This threshold is represented by leafSize. Benchmarking of leafSize in the
+// range 2..64 shows that values of 8 and 16 work well, with a 4x speedup at medium lengths and
+// ~30x for 20000 digits. Use nat_test.go's BenchmarkLeafSize tests to optimize leafSize for
+// specfic hardware.
+//
+// lo and hi index character array s. conversion starts with the LSD at hi and moves down toward
+// the MSD, which will be at s[0] or s[1]. lo == 0 signals span includes the most significant word.
+//
+func (q nat) convertWords(lo, hi int, s []byte, charset string, b Word, ndigits int, bb Word, table []divisor) int {
+ // indirect conversion: split larger blocks to reduce quadratic expense of iterated nat/W division
+ if leafSize > 0 && len(q) > leafSize && table != nil {
+ var r nat
+ index := len(table) - 1
+ for len(q) > leafSize {
+ // find divisor close to sqrt(q) if possible, but in any case < q
+ maxLength := q.bitLen() // ~= log2 q, or at of least largest possible q of this bit length
+ minLength := maxLength >> 1 // ~= log2 sqrt(q)
+ for index > 0 && table[index-1].nbits > minLength {
+ index-- // desired
+ }
+ if table[index].nbits >= maxLength && table[index].bbb.cmp(q) >= 0 {
+ index--
+ if index < 0 {
+ panic("internal inconsistency")
+ }
+ }
- // convert
- if b == 10 { // hard-coding for 10 here speeds this up by 1.25x
+ // split q into the two digit number (q'*bbb + r) to form independent subblocks
+ q, r = q.div(r, q, table[index].bbb)
+
+ // convert subblocks and collect results in s[lo:partition] and s[partition:hi]
+ partition := hi - table[index].ndigits
+ r.convertWords(partition, hi, s, charset, b, ndigits, bb, table[0:index])
+ hi = partition // i.e., q.convertWords(lo, partition, s, charset, b, ndigits, bb, table[0:index+1])
+ }
+ } // having split any large blocks now process the remaining small block
+
+ // direct conversion: process smaller blocks monolithically to avoid overhead of nat/nat division
+ var r Word
+ if b == 10 { // hard-coding for 10 here speeds this up by 1.25x (allows mod as mul vs div)
for len(q) > 0 {
// extract least significant, base bb "digit"
- q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
- if len(q) == 0 {
+ q, r = q.divW(q, bb)
+ if lo == 0 && len(q) == 0 {
// skip leading zeros in most-significant group of digits
for j := 0; j < ndigits && r != 0; j++ {
- i--
- s[i] = charset[r%10]
- r /= 10
+ hi--
+ t := r / 10
+ s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
+ r = t
}
} else {
- for j := 0; j < ndigits; j++ {
- i--
- s[i] = charset[r%10]
- r /= 10
+ for j := 0; j < ndigits && hi > lo; j++ {
+ hi--
+ t := r / 10
+ s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
+ r = t
}
}
}
} else {
for len(q) > 0 {
// extract least significant group of digits
- q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
- if len(q) == 0 {
+ q, r = q.divW(q, bb)
+ if lo == 0 && len(q) == 0 {
// skip leading zeros in most-significant group of digits
for j := 0; j < ndigits && r != 0; j++ {
- i--
- s[i] = charset[r%b]
- r /= b
+ hi--
+ s[hi] = charset[r%b]
+ r = r / b
}
} else {
- for j := 0; j < ndigits; j++ {
- i--
- s[i] = charset[r%b]
- r /= b
+ for j := 0; j < ndigits && hi > lo; j++ {
+ hi--
+ s[hi] = charset[r%b]
+ r = r / b
}
}
}
}
- return string(s[i:])
+ // prepend high-order zeroes when q has been normalized to a short number of Words.
+ // however, do not prepend zeroes when converting the most dignificant digits.
+ if lo != 0 { // if not MSD
+ zero := charset[0]
+ for hi > lo { // while need more leading zeroes
+ hi--
+ s[hi] = zero
+ }
+ }
+
+ // return index of most significant output digit in s[] (stored in lowest index)
+ return hi
+}
+
+// Split blocks greater than leafSize Words (or set to 0 to disable indirect conversion)
+// Benchmark and configure leafSize using: gotest -test.bench="Leaf"
+// 8 and 16 effective on 3.0 GHz Xeon "Clovertown" CPU (128 byte cache lines)
+// 8 and 16 effective on 2.66 GHz Core 2 Duo "Penryn" CPU
+var leafSize int = 8 // number of Word-size binary values treat as a monolithic block
+
+type divisor struct {
+ bbb nat // divisor
+ nbits int // bit length of divisor (discounting leading zeroes) ~= log2(bbb)
+ ndigits int // digit length of divisor in terms of output base digits
+}
+
+const maxCache = 64 // maximum number of divisors in a single table
+var cacheBase10 [maxCache]divisor // cached divisors for base 10
+var cacheLock sync.Mutex // defense against concurrent table extensions
+
+// construct table of powers of bb*leafSize to use in subdivisions
+func divisors(m int, b Word, ndigits int, bb Word) []divisor {
+ // only build table when indirect conversion is enabled and x is large
+ if leafSize == 0 || m <= leafSize {
+ return nil
+ }
+
+ // determine k where (bb**leafSize)**(2**k) >= sqrt(x)
+ k := 1
+ for words := leafSize; words < m>>1 && k < maxCache; words <<= 1 {
+ k++
+ }
+
+ // create new table of divisors or extend and reuse existing table as appropriate
+ var cached bool
+ var table []divisor
+ switch b {
+ case 10:
+ table = cacheBase10[0:k] // reuse old table for this conversion
+ cached = true
+ default:
+ table = make([]divisor, k) // new table for this conversion
+ }
+
+ // extend table
+ if table[k-1].ndigits == 0 {
+ if cached {
+ cacheLock.Lock() // begin critical section
+ }
+
+ var i int
+ var larger nat
+ for i < k && table[i].ndigits != 0 { // skip existing entries
+ i++
+ }
+ for ; i < k; i++ { // add new entries
+ if i == 0 {
+ table[i].bbb = nat(nil).expWW(bb, Word(leafSize))
+ table[i].ndigits = ndigits * leafSize
+ } else {
+ table[i].bbb = nat(nil).mul(table[i-1].bbb, table[i-1].bbb)
+ table[i].ndigits = 2 * table[i-1].ndigits
+ }
+
+ // optimization: exploit aggregated extra bits in macro blocks
+ larger = nat(nil).set(table[i].bbb)
+ for mulAddVWW(larger, larger, b, 0) == 0 {
+ table[i].bbb = table[i].bbb.set(larger)
+ table[i].ndigits++
+ }
+
+ table[i].nbits = table[i].bbb.bitLen()
+ }
+
+ if cached {
+ cacheLock.Unlock() // end critical section
+ }
+ }
+
+ return table
}
const deBruijn32 = 0x077CB531
}
}
- return z
+ return z.norm()
+}
+
+// calculate x**y for Word arguments y and y
+func (z nat) expWW(x, y Word) nat {
+ return z.expNN(nat(nil).setWord(x), nat(nil).setWord(y), nil)
}
// probablyPrime performs reps Miller-Rabin tests to check whether n is prime.
}
}
-const (
- // 314**271
- // base 2: 2249 digits
- // base 8: 751 digits
- // base 10: 678 digits
- // base 16: 563 digits
- shortBase = 314
- shortExponent = 271
-
- // 3141**2178
- // base 2: 31577 digits
- // base 8: 10527 digits
- // base 10: 9507 digits
- // base 16: 7895 digits
- mediumBase = 3141
- mediumExponent = 2718
-
- // 3141**2178
- // base 2: 406078 digits
- // base 8: 135360 digits
- // base 10: 122243 digits
- // base 16: 101521 digits
- longBase = 31415
- longExponent = 27182
-)
-
-func BenchmarkScanShort2(b *testing.B) {
- ScanHelper(b, 2, shortBase, shortExponent)
-}
-
-func BenchmarkScanShort8(b *testing.B) {
- ScanHelper(b, 8, shortBase, shortExponent)
-}
-
-func BenchmarkScanSort10(b *testing.B) {
- ScanHelper(b, 10, shortBase, shortExponent)
-}
-
-func BenchmarkScanShort16(b *testing.B) {
- ScanHelper(b, 16, shortBase, shortExponent)
-}
-
-func BenchmarkScanMedium2(b *testing.B) {
- ScanHelper(b, 2, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanMedium8(b *testing.B) {
- ScanHelper(b, 8, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanMedium10(b *testing.B) {
- ScanHelper(b, 10, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanMedium16(b *testing.B) {
- ScanHelper(b, 16, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanLong2(b *testing.B) {
- ScanHelper(b, 2, longBase, longExponent)
-}
-
-func BenchmarkScanLong8(b *testing.B) {
- ScanHelper(b, 8, longBase, longExponent)
-}
-
-func BenchmarkScanLong10(b *testing.B) {
- ScanHelper(b, 10, longBase, longExponent)
-}
-
-func BenchmarkScanLong16(b *testing.B) {
- ScanHelper(b, 16, longBase, longExponent)
-}
-
-func ScanHelper(b *testing.B, base int, xv, yv Word) {
+func BenchmarkScan10Base2(b *testing.B) { ScanHelper(b, 2, 10, 10) }
+func BenchmarkScan100Base2(b *testing.B) { ScanHelper(b, 2, 10, 100) }
+func BenchmarkScan1000Base2(b *testing.B) { ScanHelper(b, 2, 10, 1000) }
+func BenchmarkScan10000Base2(b *testing.B) { ScanHelper(b, 2, 10, 10000) }
+func BenchmarkScan100000Base2(b *testing.B) { ScanHelper(b, 2, 10, 100000) }
+
+func BenchmarkScan10Base8(b *testing.B) { ScanHelper(b, 8, 10, 10) }
+func BenchmarkScan100Base8(b *testing.B) { ScanHelper(b, 8, 10, 100) }
+func BenchmarkScan1000Base8(b *testing.B) { ScanHelper(b, 8, 10, 1000) }
+func BenchmarkScan10000Base8(b *testing.B) { ScanHelper(b, 8, 10, 10000) }
+func BenchmarkScan100000Base8(b *testing.B) { ScanHelper(b, 8, 10, 100000) }
+
+func BenchmarkScan10Base10(b *testing.B) { ScanHelper(b, 10, 10, 10) }
+func BenchmarkScan100Base10(b *testing.B) { ScanHelper(b, 10, 10, 100) }
+func BenchmarkScan1000Base10(b *testing.B) { ScanHelper(b, 10, 10, 1000) }
+func BenchmarkScan10000Base10(b *testing.B) { ScanHelper(b, 10, 10, 10000) }
+func BenchmarkScan100000Base10(b *testing.B) { ScanHelper(b, 10, 10, 100000) }
+
+func BenchmarkScan10Base16(b *testing.B) { ScanHelper(b, 16, 10, 10) }
+func BenchmarkScan100Base16(b *testing.B) { ScanHelper(b, 16, 10, 100) }
+func BenchmarkScan1000Base16(b *testing.B) { ScanHelper(b, 16, 10, 1000) }
+func BenchmarkScan10000Base16(b *testing.B) { ScanHelper(b, 16, 10, 10000) }
+func BenchmarkScan100000Base16(b *testing.B) { ScanHelper(b, 16, 10, 100000) }
+
+func ScanHelper(b *testing.B, base int, x, y Word) {
b.StopTimer()
- var x, y, z nat
- x = x.setWord(xv)
- y = y.setWord(yv)
- z = z.expNN(x, y, nil)
+ var z nat
+ z = z.expWW(x, y)
var s string
s = z.string(lowercaseDigits[0:base])
b.StartTimer()
for i := 0; i < b.N; i++ {
- x.scan(strings.NewReader(s), base)
+ z.scan(strings.NewReader(s), base)
}
}
-func BenchmarkStringShort2(b *testing.B) {
- StringHelper(b, 2, shortBase, shortExponent)
-}
+func BenchmarkString10Base2(b *testing.B) { StringHelper(b, 2, 10, 10) }
+func BenchmarkString100Base2(b *testing.B) { StringHelper(b, 2, 10, 100) }
+func BenchmarkString1000Base2(b *testing.B) { StringHelper(b, 2, 10, 1000) }
+func BenchmarkString10000Base2(b *testing.B) { StringHelper(b, 2, 10, 10000) }
+func BenchmarkString100000Base2(b *testing.B) { StringHelper(b, 2, 10, 100000) }
-func BenchmarkStringShort8(b *testing.B) {
- StringHelper(b, 8, shortBase, shortExponent)
-}
+func BenchmarkString10Base8(b *testing.B) { StringHelper(b, 8, 10, 10) }
+func BenchmarkString100Base8(b *testing.B) { StringHelper(b, 8, 10, 100) }
+func BenchmarkString1000Base8(b *testing.B) { StringHelper(b, 8, 10, 1000) }
+func BenchmarkString10000Base8(b *testing.B) { StringHelper(b, 8, 10, 10000) }
+func BenchmarkString100000Base8(b *testing.B) { StringHelper(b, 8, 10, 100000) }
-func BenchmarkStringShort10(b *testing.B) {
- StringHelper(b, 10, shortBase, shortExponent)
-}
-
-func BenchmarkStringShort16(b *testing.B) {
- StringHelper(b, 16, shortBase, shortExponent)
-}
+func BenchmarkString10Base10(b *testing.B) { StringHelper(b, 10, 10, 10) }
+func BenchmarkString100Base10(b *testing.B) { StringHelper(b, 10, 10, 100) }
+func BenchmarkString1000Base10(b *testing.B) { StringHelper(b, 10, 10, 1000) }
+func BenchmarkString10000Base10(b *testing.B) { StringHelper(b, 10, 10, 10000) }
+func BenchmarkString100000Base10(b *testing.B) { StringHelper(b, 10, 10, 100000) }
-func BenchmarkStringMedium2(b *testing.B) {
- StringHelper(b, 2, mediumBase, mediumExponent)
-}
+func BenchmarkString10Base16(b *testing.B) { StringHelper(b, 16, 10, 10) }
+func BenchmarkString100Base16(b *testing.B) { StringHelper(b, 16, 10, 100) }
+func BenchmarkString1000Base16(b *testing.B) { StringHelper(b, 16, 10, 1000) }
+func BenchmarkString10000Base16(b *testing.B) { StringHelper(b, 16, 10, 10000) }
+func BenchmarkString100000Base16(b *testing.B) { StringHelper(b, 16, 10, 100000) }
-func BenchmarkStringMedium8(b *testing.B) {
- StringHelper(b, 8, mediumBase, mediumExponent)
-}
+func StringHelper(b *testing.B, base int, x, y Word) {
+ b.StopTimer()
+ var z nat
+ z = z.expWW(x, y)
+ z.string(lowercaseDigits[0:base]) // warm divisor cache
+ b.StartTimer()
-func BenchmarkStringMedium10(b *testing.B) {
- StringHelper(b, 10, mediumBase, mediumExponent)
+ for i := 0; i < b.N; i++ {
+ _ = z.string(lowercaseDigits[0:base])
+ }
}
-func BenchmarkStringMedium16(b *testing.B) {
- StringHelper(b, 16, mediumBase, mediumExponent)
-}
+func BenchmarkLeafSize0(b *testing.B) { LeafSizeHelper(b, 10, 0) } // test without splitting
+func BenchmarkLeafSize1(b *testing.B) { LeafSizeHelper(b, 10, 1) }
+func BenchmarkLeafSize2(b *testing.B) { LeafSizeHelper(b, 10, 2) }
+func BenchmarkLeafSize3(b *testing.B) { LeafSizeHelper(b, 10, 3) }
+func BenchmarkLeafSize4(b *testing.B) { LeafSizeHelper(b, 10, 4) }
+func BenchmarkLeafSize5(b *testing.B) { LeafSizeHelper(b, 10, 5) }
+func BenchmarkLeafSize6(b *testing.B) { LeafSizeHelper(b, 10, 6) }
+func BenchmarkLeafSize7(b *testing.B) { LeafSizeHelper(b, 10, 7) }
+func BenchmarkLeafSize8(b *testing.B) { LeafSizeHelper(b, 10, 8) }
+func BenchmarkLeafSize9(b *testing.B) { LeafSizeHelper(b, 10, 9) }
+func BenchmarkLeafSize10(b *testing.B) { LeafSizeHelper(b, 10, 10) }
+func BenchmarkLeafSize11(b *testing.B) { LeafSizeHelper(b, 10, 11) }
+func BenchmarkLeafSize12(b *testing.B) { LeafSizeHelper(b, 10, 12) }
+func BenchmarkLeafSize13(b *testing.B) { LeafSizeHelper(b, 10, 13) }
+func BenchmarkLeafSize14(b *testing.B) { LeafSizeHelper(b, 10, 14) }
+func BenchmarkLeafSize15(b *testing.B) { LeafSizeHelper(b, 10, 15) }
+func BenchmarkLeafSize16(b *testing.B) { LeafSizeHelper(b, 10, 16) }
+func BenchmarkLeafSize32(b *testing.B) { LeafSizeHelper(b, 10, 32) } // try some large lengths
+func BenchmarkLeafSize64(b *testing.B) { LeafSizeHelper(b, 10, 64) }
+
+func LeafSizeHelper(b *testing.B, base Word, size int) {
+ b.StopTimer()
+ originalLeafSize := leafSize
+ resetTable(cacheBase10[:])
+ leafSize = size
+ b.StartTimer()
-func BenchmarkStringLong2(b *testing.B) {
- StringHelper(b, 2, longBase, longExponent)
-}
+ for d := 1; d <= 10000; d *= 10 {
+ b.StopTimer()
+ var z nat
+ z = z.expWW(base, Word(d)) // build target number
+ _ = z.string(lowercaseDigits[0:base]) // warm divisor cache
+ b.StartTimer()
-func BenchmarkStringLong8(b *testing.B) {
- StringHelper(b, 8, longBase, longExponent)
-}
+ for i := 0; i < b.N; i++ {
+ _ = z.string(lowercaseDigits[0:base])
+ }
+ }
-func BenchmarkStringLong10(b *testing.B) {
- StringHelper(b, 10, longBase, longExponent)
+ b.StopTimer()
+ resetTable(cacheBase10[:])
+ leafSize = originalLeafSize
+ b.StartTimer()
}
-func BenchmarkStringLong16(b *testing.B) {
- StringHelper(b, 16, longBase, longExponent)
+func resetTable(table []divisor) {
+ if table != nil && table[0].bbb != nil {
+ for i := 0; i < len(table); i++ {
+ table[i].bbb = nil
+ table[i].nbits = 0
+ table[i].ndigits = 0
+ }
+ }
}
-func StringHelper(b *testing.B, base int, xv, yv Word) {
- b.StopTimer()
- var x, y, z nat
- x = x.setWord(xv)
- y = y.setWord(yv)
- z = z.expNN(x, y, nil)
- b.StartTimer()
-
- for i := 0; i < b.N; i++ {
- z.string(lowercaseDigits[0:base])
+func TestStringPowers(t *testing.T) {
+ var b, p Word
+ for b = 2; b <= 16; b++ {
+ for p = 0; p <= 512; p++ {
+ x := nat(nil).expWW(b, p)
+ xs := x.string(lowercaseDigits[0:b])
+ xs2 := toString(x, lowercaseDigits[0:b])
+ if xs != xs2 {
+ t.Errorf("failed at %d ** %d in base %d: %s != %s", b, p, b, xs, xs2)
+ }
+ }
}
}