From: Russ Cox Date: Sat, 1 Nov 2025 13:41:40 +0000 (-0400) Subject: internal/strconv: extract fixed-precision ftoa from ftoaryu.go X-Git-Tag: go1.26rc1~384 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=34fec512ce;p=gostls13.git internal/strconv: extract fixed-precision ftoa from ftoaryu.go The fixed-precision ftoa algorithm is not actually documented in the Ryū paper, and it is fairly straightforward: multiply by a power of 10 to get an integer that contains the digits we need. There is also no need for separate float32 and float64 implementations. This CL implements a new fixedFtoa, separate from Ryū. The overall algorithm is the same, but the new code is simpler, faster, and better documented. Now ftoaryu.go is only about shortest-output formatting, so if and when yet another algorithm comes along, it will be clearer what should be replaced (all of ftoaryu.go) and what should not (all of ftoafixed.go). benchmark \ host linux-arm64 local linux-amd64 s7 linux-386 s7:GOARCH=386 vs base vs base vs base vs base vs base vs base AppendFloat/Decimal -0.18% ~ ~ -0.68% +0.49% -0.79% AppendFloat/Float +0.09% ~ +1.50% +0.84% -0.37% -0.69% AppendFloat/Exp -0.51% ~ ~ +1.20% -1.27% -1.01% AppendFloat/NegExp -1.01% ~ +3.43% +1.35% -2.33% ~ AppendFloat/LongExp -1.22% +0.77% ~ ~ -1.48% ~ AppendFloat/Big -2.07% ~ -2.07% -1.97% -2.89% -2.93% AppendFloat/BinaryExp -0.28% +1.06% ~ +1.35% -0.64% -1.64% AppendFloat/32Integer ~ ~ ~ -0.79% ~ -0.66% AppendFloat/32ExactFraction -0.50% ~ +5.69% ~ -1.24% +0.69% AppendFloat/32Point ~ -1.19% +2.59% +1.03% -1.37% +0.80% AppendFloat/32Exp -3.39% -2.79% -8.36% -0.94% -5.72% -5.92% AppendFloat/32NegExp -0.63% ~ ~ +0.98% -1.34% -0.73% AppendFloat/32Shortest -1.00% +1.36% +2.94% ~ ~ ~ AppendFloat/32Fixed8Hard -5.91% -12.45% -6.62% ~ +18.46% +11.61% AppendFloat/32Fixed9Hard -6.53% -11.35% -6.01% -0.97% -18.31% -9.16% AppendFloat/64Fixed1 -13.84% -16.90% -13.13% -10.71% -24.52% -18.94% AppendFloat/64Fixed2 -11.12% -16.97% -12.13% 
-9.88% -22.73% -15.48% AppendFloat/64Fixed2.5 -21.98% -20.75% -19.08% -14.74% -28.11% -24.92% AppendFloat/64Fixed3 -11.53% -16.21% -10.75% -7.53% -23.11% -15.78% AppendFloat/64Fixed4 -12.89% -12.36% -11.07% -9.79% -14.51% -13.44% AppendFloat/64Fixed5Hard -47.62% -38.59% -40.83% -37.06% -60.51% -55.29% AppendFloat/64Fixed12 -7.40% ~ -8.56% -4.31% -13.82% -8.61% AppendFloat/64Fixed16 -9.10% -8.95% -6.92% -3.92% -12.99% -9.03% AppendFloat/64Fixed12Hard -9.14% -5.24% -6.23% -4.82% -13.58% -8.99% AppendFloat/64Fixed17Hard -6.80% ~ -4.03% -2.84% -19.81% -10.27% AppendFloat/64Fixed18Hard -0.12% ~ ~ ~ ~ ~ AppendFloat/64FixedF1 ~ ~ ~ ~ -0.40% +2.72% AppendFloat/64FixedF2 -0.18% ~ -1.98% -0.95% ~ +1.25% AppendFloat/64FixedF3 -0.29% ~ ~ ~ ~ +1.22% AppendFloat/Slowpath64 -1.16% ~ ~ ~ ~ -2.16% AppendFloat/SlowpathDenormal64 -1.09% ~ ~ -0.88% -0.83% ~ host: linux-arm64 goos: linux goarch: arm64 pkg: internal/strconv cpu: unknown │ 14b7e09f493 │ f9bf7fcb8e2 │ │ sec/op │ sec/op vs base │ AppendFloat/Decimal-8 60.35n ± 0% 60.24n ± 0% -0.18% (p=0.000 n=20) AppendFloat/Float-8 88.83n ± 0% 88.91n ± 0% +0.09% (p=0.000 n=20) AppendFloat/Exp-8 93.55n ± 0% 93.06n ± 0% -0.51% (p=0.000 n=20) AppendFloat/NegExp-8 94.01n ± 0% 93.06n ± 0% -1.01% (p=0.000 n=20) AppendFloat/LongExp-8 101.00n ± 0% 99.77n ± 0% -1.22% (p=0.000 n=20) AppendFloat/Big-8 106.1n ± 0% 103.9n ± 0% -2.07% (p=0.000 n=20) AppendFloat/BinaryExp-8 47.48n ± 0% 47.35n ± 0% -0.28% (p=0.000 n=20) AppendFloat/32Integer-8 60.45n ± 0% 60.43n ± 0% ~ (p=0.150 n=20) AppendFloat/32ExactFraction-8 86.65n ± 0% 86.22n ± 0% -0.50% (p=0.000 n=20) AppendFloat/32Point-8 83.26n ± 0% 83.21n ± 0% ~ (p=0.046 n=20) AppendFloat/32Exp-8 92.55n ± 0% 89.42n ± 0% -3.39% (p=0.000 n=20) AppendFloat/32NegExp-8 87.89n ± 0% 87.34n ± 0% -0.63% (p=0.000 n=20) AppendFloat/32Shortest-8 77.05n ± 0% 76.28n ± 0% -1.00% (p=0.000 n=20) AppendFloat/32Fixed8Hard-8 55.73n ± 0% 52.44n ± 0% -5.91% (p=0.000 n=20) AppendFloat/32Fixed9Hard-8 64.80n ± 0% 60.57n ± 0% -6.53% 
(p=0.000 n=20) AppendFloat/64Fixed1-8 53.72n ± 0% 46.29n ± 0% -13.84% (p=0.000 n=20) AppendFloat/64Fixed2-8 52.64n ± 0% 46.79n ± 0% -11.12% (p=0.000 n=20) AppendFloat/64Fixed2.5-8 56.01n ± 0% 43.70n ± 0% -21.98% (p=0.000 n=20) AppendFloat/64Fixed3-8 53.38n ± 0% 47.23n ± 0% -11.53% (p=0.000 n=20) AppendFloat/64Fixed4-8 50.62n ± 0% 44.10n ± 0% -12.89% (p=0.000 n=20) AppendFloat/64Fixed5Hard-8 98.94n ± 0% 51.82n ± 0% -47.62% (p=0.000 n=20) AppendFloat/64Fixed12-8 84.70n ± 0% 78.44n ± 0% -7.40% (p=0.000 n=20) AppendFloat/64Fixed16-8 71.68n ± 0% 65.16n ± 0% -9.10% (p=0.000 n=20) AppendFloat/64Fixed12Hard-8 68.41n ± 0% 62.16n ± 0% -9.14% (p=0.000 n=20) AppendFloat/64Fixed17Hard-8 79.31n ± 0% 73.92n ± 0% -6.80% (p=0.000 n=20) AppendFloat/64Fixed18Hard-8 4.290µ ± 0% 4.285µ ± 0% -0.12% (p=0.000 n=20) AppendFloat/64FixedF1-8 216.0n ± 0% 216.1n ± 0% ~ (p=0.090 n=20) AppendFloat/64FixedF2-8 228.2n ± 0% 227.8n ± 0% -0.18% (p=0.000 n=20) AppendFloat/64FixedF3-8 208.8n ± 0% 208.2n ± 0% -0.29% (p=0.000 n=20) AppendFloat/Slowpath64-8 98.56n ± 0% 97.42n ± 0% -1.16% (p=0.000 n=20) AppendFloat/SlowpathDenormal64-8 95.81n ± 0% 94.77n ± 0% -1.09% (p=0.000 n=20) geomean 93.81n 87.87n -6.33% host: local goos: darwin cpu: Apple M3 Pro │ 14b7e09f493 │ f9bf7fcb8e2 │ │ sec/op │ sec/op vs base │ AppendFloat/Decimal-12 21.14n ± 0% 21.15n ± 0% ~ (p=0.963 n=20) AppendFloat/Float-12 32.48n ± 1% 32.43n ± 0% ~ (p=0.358 n=20) AppendFloat/Exp-12 31.85n ± 0% 31.94n ± 1% ~ (p=0.634 n=20) AppendFloat/NegExp-12 31.75n ± 0% 32.04n ± 0% ~ (p=0.004 n=20) AppendFloat/LongExp-12 33.55n ± 0% 33.81n ± 0% +0.77% (p=0.000 n=20) AppendFloat/Big-12 35.62n ± 1% 35.73n ± 1% ~ (p=0.888 n=20) AppendFloat/BinaryExp-12 19.26n ± 0% 19.46n ± 1% +1.06% (p=0.000 n=20) AppendFloat/32Integer-12 21.41n ± 0% 21.46n ± 1% ~ (p=0.733 n=20) AppendFloat/32ExactFraction-12 31.23n ± 1% 31.30n ± 1% ~ (p=0.857 n=20) AppendFloat/32Point-12 31.39n ± 1% 31.02n ± 0% -1.19% (p=0.000 n=20) AppendFloat/32Exp-12 32.42n ± 1% 31.52n ± 1% -2.79% 
(p=0.000 n=20) AppendFloat/32NegExp-12 30.66n ± 1% 30.66n ± 1% ~ (p=0.380 n=20) AppendFloat/32Shortest-12 26.88n ± 1% 27.25n ± 1% +1.36% (p=0.000 n=20) AppendFloat/32Fixed8Hard-12 19.52n ± 0% 17.09n ± 1% -12.45% (p=0.000 n=20) AppendFloat/32Fixed9Hard-12 21.55n ± 2% 19.11n ± 1% -11.35% (p=0.000 n=20) AppendFloat/64Fixed1-12 18.64n ± 0% 15.49n ± 0% -16.90% (p=0.000 n=20) AppendFloat/64Fixed2-12 18.65n ± 0% 15.49n ± 0% -16.97% (p=0.000 n=20) AppendFloat/64Fixed2.5-12 19.23n ± 1% 15.24n ± 0% -20.75% (p=0.000 n=20) AppendFloat/64Fixed3-12 18.61n ± 0% 15.59n ± 1% -16.21% (p=0.000 n=20) AppendFloat/64Fixed4-12 17.55n ± 1% 15.38n ± 0% -12.36% (p=0.000 n=20) AppendFloat/64Fixed5Hard-12 29.27n ± 1% 17.97n ± 0% -38.59% (p=0.000 n=20) AppendFloat/64Fixed12-12 28.26n ± 1% 28.17n ± 10% ~ (p=0.941 n=20) AppendFloat/64Fixed16-12 23.56n ± 0% 21.46n ± 0% -8.95% (p=0.000 n=20) AppendFloat/64Fixed12Hard-12 21.85n ± 2% 20.70n ± 1% -5.24% (p=0.000 n=20) AppendFloat/64Fixed17Hard-12 26.91n ± 1% 27.10n ± 0% ~ (p=0.059 n=20) AppendFloat/64Fixed18Hard-12 2.197µ ± 1% 2.169µ ± 1% ~ (p=0.013 n=20) AppendFloat/64FixedF1-12 103.7n ± 1% 103.3n ± 0% ~ (p=0.035 n=20) AppendFloat/64FixedF2-12 114.8n ± 1% 114.1n ± 1% ~ (p=0.234 n=20) AppendFloat/64FixedF3-12 107.8n ± 1% 107.1n ± 1% ~ (p=0.180 n=20) AppendFloat/Slowpath64-12 32.05n ± 1% 32.00n ± 0% ~ (p=0.952 n=20) AppendFloat/SlowpathDenormal64-12 29.98n ± 1% 30.20n ± 0% ~ (p=0.004 n=20) geomean 33.83n 31.91n -5.68% host: linux-amd64 goos: linux goarch: amd64 cpu: Intel(R) Xeon(R) CPU @ 2.30GHz │ 14b7e09f493 │ f9bf7fcb8e2 │ │ sec/op │ sec/op vs base │ AppendFloat/Decimal-16 64.00n ± 1% 63.67n ± 1% ~ (p=0.784 n=20) AppendFloat/Float-16 95.99n ± 1% 97.42n ± 1% +1.50% (p=0.000 n=20) AppendFloat/Exp-16 97.59n ± 1% 97.72n ± 1% ~ (p=0.984 n=20) AppendFloat/NegExp-16 97.80n ± 1% 101.15n ± 1% +3.43% (p=0.000 n=20) AppendFloat/LongExp-16 103.1n ± 1% 104.5n ± 1% ~ (p=0.006 n=20) AppendFloat/Big-16 110.8n ± 1% 108.5n ± 1% -2.07% (p=0.000 n=20) 
AppendFloat/BinaryExp-16 47.82n ± 1% 47.33n ± 1% ~ (p=0.007 n=20) AppendFloat/32Integer-16 63.65n ± 1% 63.51n ± 0% ~ (p=0.560 n=20) AppendFloat/32ExactFraction-16 91.81n ± 1% 97.03n ± 1% +5.69% (p=0.000 n=20) AppendFloat/32Point-16 89.84n ± 1% 92.16n ± 1% +2.59% (p=0.000 n=20) AppendFloat/32Exp-16 103.80n ± 1% 95.12n ± 1% -8.36% (p=0.000 n=20) AppendFloat/32NegExp-16 93.70n ± 1% 94.87n ± 1% ~ (p=0.003 n=20) AppendFloat/32Shortest-16 83.98n ± 1% 86.45n ± 1% +2.94% (p=0.000 n=20) AppendFloat/32Fixed8Hard-16 61.91n ± 1% 57.81n ± 1% -6.62% (p=0.000 n=20) AppendFloat/32Fixed9Hard-16 71.08n ± 0% 66.81n ± 1% -6.01% (p=0.000 n=20) AppendFloat/64Fixed1-16 59.27n ± 2% 51.49n ± 1% -13.13% (p=0.000 n=20) AppendFloat/64Fixed2-16 57.89n ± 1% 50.87n ± 1% -12.13% (p=0.000 n=20) AppendFloat/64Fixed2.5-16 61.04n ± 1% 49.40n ± 1% -19.08% (p=0.000 n=20) AppendFloat/64Fixed3-16 58.42n ± 1% 52.14n ± 1% -10.75% (p=0.000 n=20) AppendFloat/64Fixed4-16 56.52n ± 1% 50.27n ± 1% -11.07% (p=0.000 n=20) AppendFloat/64Fixed5Hard-16 97.79n ± 1% 57.86n ± 1% -40.83% (p=0.000 n=20) AppendFloat/64Fixed12-16 90.78n ± 1% 83.01n ± 1% -8.56% (p=0.000 n=20) AppendFloat/64Fixed16-16 76.11n ± 1% 70.84n ± 0% -6.92% (p=0.000 n=20) AppendFloat/64Fixed12Hard-16 73.56n ± 1% 68.98n ± 2% -6.23% (p=0.000 n=20) AppendFloat/64Fixed17Hard-16 83.20n ± 1% 79.85n ± 1% -4.03% (p=0.000 n=20) AppendFloat/64Fixed18Hard-16 4.947µ ± 1% 4.915µ ± 1% ~ (p=0.229 n=20) AppendFloat/64FixedF1-16 242.4n ± 1% 239.4n ± 1% ~ (p=0.038 n=20) AppendFloat/64FixedF2-16 257.7n ± 2% 252.6n ± 1% -1.98% (p=0.000 n=20) AppendFloat/64FixedF3-16 237.5n ± 0% 237.5n ± 1% ~ (p=0.440 n=20) AppendFloat/Slowpath64-16 99.75n ± 1% 99.78n ± 1% ~ (p=0.995 n=20) AppendFloat/SlowpathDenormal64-16 97.41n ± 1% 98.20n ± 1% ~ (p=0.006 n=20) geomean 100.7n 95.60n -5.05% host: s7 cpu: AMD Ryzen 9 7950X 16-Core Processor │ 14b7e09f493 │ f9bf7fcb8e2 │ │ sec/op │ sec/op vs base │ AppendFloat/Decimal-32 22.19n ± 0% 22.04n ± 0% -0.68% (p=0.000 n=20) AppendFloat/Float-32 
34.59n ± 0% 34.88n ± 0% +0.84% (p=0.000 n=20) AppendFloat/Exp-32 34.47n ± 0% 34.88n ± 0% +1.20% (p=0.000 n=20) AppendFloat/NegExp-32 34.85n ± 0% 35.32n ± 0% +1.35% (p=0.000 n=20) AppendFloat/LongExp-32 37.23n ± 0% 37.09n ± 0% ~ (p=0.003 n=20) AppendFloat/Big-32 39.27n ± 0% 38.50n ± 0% -1.97% (p=0.000 n=20) AppendFloat/BinaryExp-32 17.38n ± 0% 17.61n ± 0% +1.35% (p=0.000 n=20) AppendFloat/32Integer-32 22.26n ± 0% 22.08n ± 0% -0.79% (p=0.000 n=20) AppendFloat/32ExactFraction-32 32.82n ± 0% 32.91n ± 0% ~ (p=0.018 n=20) AppendFloat/32Point-32 32.88n ± 0% 33.22n ± 0% +1.03% (p=0.000 n=20) AppendFloat/32Exp-32 34.95n ± 0% 34.62n ± 0% -0.94% (p=0.000 n=20) AppendFloat/32NegExp-32 33.23n ± 0% 33.55n ± 0% +0.98% (p=0.000 n=20) AppendFloat/32Shortest-32 30.19n ± 0% 30.12n ± 0% ~ (p=0.122 n=20) AppendFloat/32Fixed8Hard-32 22.94n ± 0% 22.88n ± 0% ~ (p=0.124 n=20) AppendFloat/32Fixed9Hard-32 26.20n ± 0% 25.94n ± 1% -0.97% (p=0.000 n=20) AppendFloat/64Fixed1-32 21.10n ± 0% 18.84n ± 0% -10.71% (p=0.000 n=20) AppendFloat/64Fixed2-32 20.75n ± 0% 18.70n ± 0% -9.88% (p=0.000 n=20) AppendFloat/64Fixed2.5-32 21.07n ± 0% 17.96n ± 0% -14.74% (p=0.000 n=20) AppendFloat/64Fixed3-32 21.24n ± 0% 19.64n ± 0% -7.53% (p=0.000 n=20) AppendFloat/64Fixed4-32 20.63n ± 0% 18.61n ± 0% -9.79% (p=0.000 n=20) AppendFloat/64Fixed5Hard-32 34.48n ± 0% 21.70n ± 0% -37.06% (p=0.000 n=20) AppendFloat/64Fixed12-32 32.26n ± 0% 30.87n ± 1% -4.31% (p=0.000 n=20) AppendFloat/64Fixed16-32 27.95n ± 0% 26.86n ± 0% -3.92% (p=0.000 n=20) AppendFloat/64Fixed12Hard-32 27.30n ± 0% 25.98n ± 1% -4.82% (p=0.000 n=20) AppendFloat/64Fixed17Hard-32 30.80n ± 0% 29.93n ± 0% -2.84% (p=0.000 n=20) AppendFloat/64Fixed18Hard-32 1.833µ ± 0% 1.831µ ± 0% ~ (p=0.663 n=20) AppendFloat/64FixedF1-32 83.42n ± 1% 84.00n ± 1% ~ (p=0.003 n=20) AppendFloat/64FixedF2-32 90.10n ± 0% 89.23n ± 1% -0.95% (p=0.001 n=20) AppendFloat/64FixedF3-32 84.42n ± 1% 84.39n ± 0% ~ (p=0.878 n=20) AppendFloat/Slowpath64-32 35.72n ± 0% 35.59n ± 0% ~ (p=0.007 n=20) 
AppendFloat/SlowpathDenormal64-32 35.36n ± 0% 35.05n ± 0% -0.88% (p=0.000 n=20) geomean 36.05n 34.69n -3.77% host: linux-386 goarch: 386 cpu: Intel(R) Xeon(R) CPU @ 2.30GHz │ 14b7e09f493 │ f9bf7fcb8e2 │ │ sec/op │ sec/op vs base │ AppendFloat/Decimal-16 132.8n ± 0% 133.5n ± 0% +0.49% (p=0.001 n=20) AppendFloat/Float-16 242.6n ± 0% 241.7n ± 0% -0.37% (p=0.000 n=20) AppendFloat/Exp-16 252.2n ± 0% 249.1n ± 0% -1.27% (p=0.000 n=20) AppendFloat/NegExp-16 253.6n ± 0% 247.7n ± 0% -2.33% (p=0.000 n=20) AppendFloat/LongExp-16 260.9n ± 0% 257.1n ± 0% -1.48% (p=0.000 n=20) AppendFloat/Big-16 293.7n ± 0% 285.2n ± 0% -2.89% (p=0.000 n=20) AppendFloat/BinaryExp-16 89.63n ± 1% 89.06n ± 0% -0.64% (p=0.000 n=20) AppendFloat/32Integer-16 132.6n ± 0% 133.2n ± 0% ~ (p=0.016 n=20) AppendFloat/32ExactFraction-16 216.9n ± 0% 214.2n ± 0% -1.24% (p=0.000 n=20) AppendFloat/32Point-16 205.0n ± 0% 202.2n ± 0% -1.37% (p=0.000 n=20) AppendFloat/32Exp-16 250.2n ± 0% 235.9n ± 0% -5.72% (p=0.000 n=20) AppendFloat/32NegExp-16 213.5n ± 0% 210.6n ± 0% -1.34% (p=0.000 n=20) AppendFloat/32Shortest-16 198.3n ± 0% 197.8n ± 0% ~ (p=0.147 n=20) AppendFloat/32Fixed8Hard-16 114.9n ± 1% 136.0n ± 1% +18.46% (p=0.000 n=20) AppendFloat/32Fixed9Hard-16 189.8n ± 0% 155.0n ± 1% -18.31% (p=0.000 n=20) AppendFloat/64Fixed1-16 175.8n ± 0% 132.7n ± 0% -24.52% (p=0.000 n=20) AppendFloat/64Fixed2-16 166.6n ± 0% 128.7n ± 0% -22.73% (p=0.000 n=20) AppendFloat/64Fixed2.5-16 176.5n ± 0% 126.8n ± 0% -28.11% (p=0.000 n=20) AppendFloat/64Fixed3-16 165.3n ± 0% 127.1n ± 0% -23.11% (p=0.000 n=20) AppendFloat/64Fixed4-16 141.3n ± 0% 120.8n ± 1% -14.51% (p=0.000 n=20) AppendFloat/64Fixed5Hard-16 344.6n ± 0% 136.0n ± 0% -60.51% (p=0.000 n=20) AppendFloat/64Fixed12-16 184.2n ± 0% 158.7n ± 0% -13.82% (p=0.000 n=20) AppendFloat/64Fixed16-16 174.0n ± 0% 151.3n ± 0% -12.99% (p=0.000 n=20) AppendFloat/64Fixed12Hard-16 169.7n ± 0% 146.7n ± 0% -13.58% (p=0.000 n=20) AppendFloat/64Fixed17Hard-16 207.7n ± 0% 166.6n ± 0% -19.81% (p=0.000 n=20) 
AppendFloat/64Fixed18Hard-16 10.66µ ± 0% 10.63µ ± 0% ~ (p=0.030 n=20) AppendFloat/64FixedF1-16 615.9n ± 0% 613.5n ± 0% -0.40% (p=0.000 n=20) AppendFloat/64FixedF2-16 846.6n ± 0% 847.4n ± 0% ~ (p=0.551 n=20) AppendFloat/64FixedF3-16 609.9n ± 0% 609.5n ± 0% ~ (p=0.213 n=20) AppendFloat/Slowpath64-16 254.1n ± 0% 252.6n ± 1% ~ (p=0.048 n=20) AppendFloat/SlowpathDenormal64-16 251.5n ± 0% 249.4n ± 0% -0.83% (p=0.000 n=20) geomean 249.2n 225.4n -9.54% host: s7:GOARCH=386 cpu: AMD Ryzen 9 7950X 16-Core Processor │ 14b7e09f493 │ f9bf7fcb8e2 │ │ sec/op │ sec/op vs base │ AppendFloat/Decimal-32 42.65n ± 0% 42.31n ± 0% -0.79% (p=0.000 n=20) AppendFloat/Float-32 71.56n ± 0% 71.06n ± 0% -0.69% (p=0.000 n=20) AppendFloat/Exp-32 75.61n ± 1% 74.85n ± 1% -1.01% (p=0.000 n=20) AppendFloat/NegExp-32 74.36n ± 0% 74.30n ± 0% ~ (p=0.482 n=20) AppendFloat/LongExp-32 75.82n ± 0% 75.73n ± 0% ~ (p=0.490 n=20) AppendFloat/Big-32 85.10n ± 0% 82.61n ± 0% -2.93% (p=0.000 n=20) AppendFloat/BinaryExp-32 33.02n ± 0% 32.48n ± 1% -1.64% (p=0.000 n=20) AppendFloat/32Integer-32 41.54n ± 1% 41.27n ± 1% -0.66% (p=0.000 n=20) AppendFloat/32ExactFraction-32 62.48n ± 0% 62.91n ± 0% +0.69% (p=0.000 n=20) AppendFloat/32Point-32 60.17n ± 0% 60.65n ± 0% +0.80% (p=0.000 n=20) AppendFloat/32Exp-32 73.34n ± 0% 68.99n ± 0% -5.92% (p=0.000 n=20) AppendFloat/32NegExp-32 63.29n ± 0% 62.83n ± 0% -0.73% (p=0.000 n=20) AppendFloat/32Shortest-32 58.97n ± 0% 59.07n ± 0% ~ (p=0.029 n=20) AppendFloat/32Fixed8Hard-32 37.42n ± 0% 41.76n ± 1% +11.61% (p=0.000 n=20) AppendFloat/32Fixed9Hard-32 55.18n ± 0% 50.13n ± 1% -9.16% (p=0.000 n=20) AppendFloat/64Fixed1-32 50.89n ± 1% 41.25n ± 0% -18.94% (p=0.000 n=20) AppendFloat/64Fixed2-32 48.33n ± 1% 40.85n ± 1% -15.48% (p=0.000 n=20) AppendFloat/64Fixed2.5-32 52.46n ± 0% 39.39n ± 0% -24.92% (p=0.000 n=20) AppendFloat/64Fixed3-32 48.28n ± 1% 40.66n ± 0% -15.78% (p=0.000 n=20) AppendFloat/64Fixed4-32 44.57n ± 0% 38.58n ± 0% -13.44% (p=0.000 n=20) AppendFloat/64Fixed5Hard-32 96.16n ± 0% 
42.99n ± 1% -55.29% (p=0.000 n=20) AppendFloat/64Fixed12-32 56.84n ± 0% 51.95n ± 1% -8.61% (p=0.000 n=20) AppendFloat/64Fixed16-32 54.23n ± 0% 49.33n ± 0% -9.03% (p=0.000 n=20) AppendFloat/64Fixed12Hard-32 53.47n ± 0% 48.67n ± 0% -8.99% (p=0.000 n=20) AppendFloat/64Fixed17Hard-32 61.76n ± 0% 55.42n ± 1% -10.27% (p=0.000 n=20) AppendFloat/64Fixed18Hard-32 3.998µ ± 1% 4.001µ ± 0% ~ (p=0.449 n=20) AppendFloat/64FixedF1-32 161.8n ± 0% 166.2n ± 1% +2.72% (p=0.000 n=20) AppendFloat/64FixedF2-32 223.4n ± 2% 226.2n ± 1% +1.25% (p=0.000 n=20) AppendFloat/64FixedF3-32 159.6n ± 0% 161.6n ± 1% +1.22% (p=0.000 n=20) AppendFloat/Slowpath64-32 76.69n ± 0% 75.03n ± 0% -2.16% (p=0.000 n=20) AppendFloat/SlowpathDenormal64-32 75.02n ± 0% 74.36n ± 1% ~ (p=0.003 n=20) geomean 74.66n 69.39n -7.06% Change-Id: I9db46471a93bd2aab3c2796e563d154cb531d4cb Reviewed-on: https://go-review.googlesource.com/c/go/+/717182 Reviewed-by: Alan Donovan LUCI-TryBot-Result: Go LUCI Auto-Submit: Russ Cox --- diff --git a/src/internal/strconv/atoi.go b/src/internal/strconv/atoi.go index 5bc259e7e5..4bbcb4f5da 100644 --- a/src/internal/strconv/atoi.go +++ b/src/internal/strconv/atoi.go @@ -41,8 +41,6 @@ const intSize = 32 << (^uint(0) >> 63) // IntSize is the size in bits of an int or uint value. const IntSize = intSize -const maxUint64 = 1<<64 - 1 - // ParseUint is like [ParseInt] but for unsigned numbers. // // A sign prefix is not permitted. 
diff --git a/src/internal/strconv/export_test.go b/src/internal/strconv/export_test.go index 86435f66cf..c879f24480 100644 --- a/src/internal/strconv/export_test.go +++ b/src/internal/strconv/export_test.go @@ -18,6 +18,9 @@ var ( Pow10 = pow10 Umul128 = umul128 Umul192 = umul192 + Div5Tab = div5Tab + DivisiblePow5 = divisiblePow5 + TrimZeros = trimZeros ) func NewDecimal(i uint64) *decimal { diff --git a/src/internal/strconv/ftoa.go b/src/internal/strconv/ftoa.go index 1aec5447ec..fd30f28289 100644 --- a/src/internal/strconv/ftoa.go +++ b/src/internal/strconv/ftoa.go @@ -123,16 +123,17 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte { return bigFtoa(dst, prec, fmt, neg, mant, exp, flt) } - var digs decimalSlice - ok := false // Negative precision means "only as much as needed to be exact." shortest := prec < 0 + var digs decimalSlice + if mant == 0 { + return formatDigits(dst, shortest, neg, digs, prec, fmt) + } if shortest { // Use Ryu algorithm. var buf [32]byte digs.d = buf[:] ryuFtoaShortest(&digs, mant, exp-int(flt.mantbits), flt) - ok = true // Precision for shortest representation mode. switch fmt { case 'e', 'E': @@ -142,7 +143,11 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte { case 'g', 'G': prec = digs.nd } - } else if fmt != 'f' { + return formatDigits(dst, shortest, neg, digs, prec, fmt) + } + + // TODO figure out when we can use fast code for f + if fmt != 'f' { // Fixed number of digits. digits := prec switch fmt { @@ -157,21 +162,15 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte { // Invalid mode. 
digits = 1 } - var buf [24]byte - if bitSize == 32 && digits <= 9 { + if digits <= 18 { + var buf [24]byte digs.d = buf[:] - ryuFtoaFixed32(&digs, uint32(mant), exp-int(flt.mantbits), digits) - ok = true - } else if digits <= 18 { - digs.d = buf[:] - ryuFtoaFixed64(&digs, mant, exp-int(flt.mantbits), digits) - ok = true + fixedFtoa(&digs, mant, exp-int(flt.mantbits), digits) + return formatDigits(dst, false, neg, digs, prec, fmt) } } - if !ok { - return bigFtoa(dst, prec, fmt, neg, mant, exp, flt) - } - return formatDigits(dst, shortest, neg, digs, prec, fmt) + + return bigFtoa(dst, prec, fmt, neg, mant, exp, flt) } // bigFtoa uses multiprecision computations to format a float. diff --git a/src/internal/strconv/ftoa_test.go b/src/internal/strconv/ftoa_test.go index d510629537..4e6f462928 100644 --- a/src/internal/strconv/ftoa_test.go +++ b/src/internal/strconv/ftoa_test.go @@ -177,6 +177,16 @@ var ftoatests = []ftoaTest{ {1.801439850948199e+16, 'g', -1, "1.801439850948199e+16"}, {5.960464477539063e-08, 'g', -1, "5.960464477539063e-08"}, {1.012e-320, 'g', -1, "1.012e-320"}, + + // Cases from TestFtoaRandom that caught bugs in fixedFtoa. + {8177880169308380. * (1 << 1), 'e', 14, "1.63557603386168e+16"}, + {8393378656576888. * (1 << 1), 'e', 15, "1.678675731315378e+16"}, + {8738676561280626. * (1 << 4), 'e', 16, "1.3981882498049002e+17"}, + {8291032395191335. / (1 << 30), 'e', 5, "7.72163e+06"}, + + // Exercise divisiblePow5 case in fixedFtoa + {2384185791015625. * (1 << 12), 'e', 5, "9.76562e+18"}, + {2384185791015625. 
* (1 << 13), 'e', 5, "1.95312e+19"}, } func TestFtoa(t *testing.T) { @@ -253,7 +263,7 @@ func TestFtoaRandom(t *testing.T) { shortSlow = FormatFloat(x, 'e', prec, 64) SetOptimize(true) if shortSlow != shortFast { - t.Errorf("%b printed as %s, want %s", x, shortFast, shortSlow) + t.Errorf("%b printed with %%.%de as %s, want %s", x, prec, shortFast, shortSlow) } } } diff --git a/src/internal/strconv/ftoafixed.go b/src/internal/strconv/ftoafixed.go new file mode 100644 index 0000000000..f3542d1cf5 --- /dev/null +++ b/src/internal/strconv/ftoafixed.go @@ -0,0 +1,156 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strconv + +import "math/bits" + +var uint64pow10 = [...]uint64{ + 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, + 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, +} + +// fixedFtoa formats a number of decimal digits of mant*(2^exp) into d, +// where mant > 0 and 1 ≤ digits ≤ 18. +func fixedFtoa(d *decimalSlice, mant uint64, exp, digits int) { + // The strategy here is to multiply (mant * 2^exp) by a power of 10 + // to make the resulting integer be the number of digits we want. + // + // Adams proved in the Ryu paper that 128-bit precision in the + // power-of-10 constant is sufficient to produce correctly + // rounded output for all float64s, up to 18 digits. + // https://dl.acm.org/doi/10.1145/3192366.3192369 + // + // TODO(rsc): The paper is not focused on, nor terribly clear about, + // this fact in this context, and the proof seems too complicated. + // Post a shorter, more direct proof and link to it here. + + if digits > 18 { + panic("fixedFtoa called with digits > 18") + } + + // Shift mantissa to have 64 bits, + // so that the 192-bit product below will + // have at least 63 bits in its top word. 
+ b := 64 - bits.Len64(mant) + mant <<= b + exp -= b + + // We have f = mant * 2^exp ≥ 2^(63+exp) + // and we want to multiply it by some 10^p + // to make it have the number of digits plus one rounding bit: + // + // 2 * 10^(digits-1) ≤ f * 10^p < ~2 * 10^digits + // + // The lower bound is required, but the upper bound is approximate: + // we must not have too few digits, but we can round away extra ones. + // + // f * 10^p ≥ 2 * 10^(digits-1) + // 10^p ≥ 2 * 10^(digits-1) / f [dividing by f] + // p ≥ (log₁₀ 2) + (digits-1) - log₁₀ f [taking log₁₀] + // p ≥ (log₁₀ 2) + (digits-1) - log₁₀ (mant * 2^exp) [expanding f] + // p ≥ (log₁₀ 2) + (digits-1) - (log₁₀ 2) * (64 + exp) [mant < 2⁶⁴] + // p ≥ (digits - 1) - (log₁₀ 2) * (63 + exp) [refactoring] + // + // Once we have p, we can compute the scaled value: + // + // dm * 2^de = mant * 2^exp * 10^p + // = mant * 2^exp * pow/2^128 * 2^exp2. + // = (mant * pow/2^128) * 2^(exp+exp2). + p := (digits - 1) - mulLog10_2(63+exp) + pow, exp2, ok := pow10(p) + if !ok { + // This never happens due to the range of float32/float64 exponent + panic("fixedFtoa: pow10 out of range") + } + if -22 <= p && p < 0 { + // Special case: Let q=-p. q is in [1,22]. We are dividing by 10^q + // and the mantissa may be a multiple of 5^q (5^22 < 2^53), + // in which case the division must be computed exactly and + // recorded as exact for correct rounding. Our normal computation is: + // + // dm = floor(mant * floor(10^p * 2^s)) + // + // for some scaling shift s. To make this an exact division, + // it suffices to change the inner floor to a ceil: + // + // dm = floor(mant * ceil(10^p * 2^s)) + // + // In the range of values we are using, the floor and ceil + // cancel each other out and the high 64 bits of the product + // come out exactly right. + // (This is the same trick compilers use for division by constants. + // See Hacker's Delight, 2nd ed., Chapter 10.) 
+ pow.Lo++ + } + dm, lo1, lo0 := umul192(mant, pow) + de := exp + exp2 + + // Check whether any bits have been truncated from dm. + // If so, set dt != 0. If not, leave dt == 0 (meaning dm is exact). + var dt uint + switch { + default: + // Most powers of 10 use a truncated constant, + // meaning the result is also truncated. + dt = 1 + case 0 <= p && p <= 55: + // Small positive powers of 10 (up to 10⁵⁵) can be represented + // precisely in a 128-bit mantissa (5⁵⁵ ≤ 2¹²⁸), so the only truncation + // comes from discarding the low bits of the 192-bit product. + // + // TODO(rsc): The new proof mentioned above should also + // prove that we can't have lo1 == 0 and lo0 != 0. + // After proving that, drop computation and use of lo0 here. + dt = bool2uint(lo1|lo0 != 0) + case -22 <= p && p < 0 && divisiblePow5(mant, -p): + // If the original mantissa was a multiple of 5^p, + // the result is exact. (See comment above for pow.Lo++.) + dt = 0 + } + + // The value we want to format is dm * 2^de, where de < 0. + // Multiply by 2^de by shifting, but leave one extra bit for rounding. + // After the shift, the "integer part" of dm is dm>>1, + // the "rounding bit" (the first fractional bit) is dm&1, + // and the "truncated bit" (have any bits been discarded?) is dt. + shift := -de - 1 + dt |= bool2uint(dm&(1<<shift-1) != 0) + dm >>= shift + + // Set decimal point in eventual formatted digits, + // so we can update it as we adjust the digits. + d.dp = digits - p + + // Trim excess digit if any, updating truncation and decimal point. + // The << 1 is leaving room for the rounding bit. + max := uint64pow10[digits] << 1 + if dm >= max { + var r uint + dm, r = dm/10, uint(dm%10) + dt |= bool2uint(r != 0) + d.dp++ + } + + // Round and shift away rounding bit. + // We want to round up when + // (a) the fractional part is > 0.5 (dm&1 != 0 and dt == 1) + // (b) or the fractional part is ≥ 0.5 and the integer part is odd + // (dm&1 != 0 and dm&2 != 0). + // The bitwise expression encodes that logic. 
+ dm += uint64(uint(dm) & (dt | uint(dm)>>1) & 1) + dm >>= 1 + if dm == max>>1 { + // 999... rolled over to 1000... + dm = uint64pow10[digits-1] + d.dp++ + } + + // Format digits into d. + formatBase10(d.d[:digits], dm) + d.nd = digits + for d.d[d.nd-1] == '0' { + d.nd-- + } +} diff --git a/src/internal/strconv/ftoaryu.go b/src/internal/strconv/ftoaryu.go index 999af51502..9407bfec44 100644 --- a/src/internal/strconv/ftoaryu.go +++ b/src/internal/strconv/ftoaryu.go @@ -4,203 +4,11 @@ package strconv -import ( - "math/bits" -) +import "math/bits" // binary to decimal conversion using the Ryū algorithm. // // See Ulf Adams, "Ryū: Fast Float-to-String Conversion" (doi:10.1145/3192366.3192369) -// -// Fixed precision formatting is a variant of the original paper's -// algorithm, where a single multiplication by 10^k is required, -// sharing the same rounding guarantees. - -// ryuFtoaFixed32 formats mant*(2^exp) with prec decimal digits. -func ryuFtoaFixed32(d *decimalSlice, mant uint32, exp int, prec int) { - if prec < 0 { - panic("ryuFtoaFixed32 called with negative prec") - } - if prec > 9 { - panic("ryuFtoaFixed32 called with prec > 9") - } - // Zero input. - if mant == 0 { - d.nd, d.dp = 0, 0 - return - } - // Renormalize to a 25-bit mantissa. - e2 := exp - if b := bits.Len32(mant); b < 25 { - mant <<= uint(25 - b) - e2 += b - 25 - } - // Choose an exponent such that rounded mant*(2^e2)*(10^q) has - // at least prec decimal digits, i.e - // mant*(2^e2)*(10^q) >= 10^(prec-1) - // Because mant >= 2^24, it is enough to choose: - // 2^(e2+24) >= 10^(-q+prec-1) - // or q = -mulLog10_2(e2+24) + prec - 1 - q := -mulLog10_2(e2+24) + prec - 1 - - // Now compute mant*(2^e2)*(10^q). - // Is it an exact computation? - // Only small positive powers of 10 are exact (5^28 has 66 bits). 
- exact := q <= 27 && q >= 0 - - di, dexp2, d0 := mult64bitPow10(mant, e2, q) - if dexp2 >= 0 { - panic("not enough significant bits after mult64bitPow10") - } - // As a special case, computation might still be exact, if exponent - // was negative and if it amounts to computing an exact division. - // In that case, we ignore all lower bits. - // Note that division by 10^11 cannot be exact as 5^11 has 26 bits. - if q < 0 && q >= -10 && divisibleByPower5(uint64(mant), -q) { - exact = true - d0 = true - } - // Remove extra lower bits and keep rounding info. - extra := uint(-dexp2) - extraMask := uint32(1<<extra - 1) - - di, dfrac := di>>extra, di&extraMask - roundUp := false - if exact { - // If we computed an exact product, d + 1/2 - // should round to d+1 if 'd' is odd. - roundUp = dfrac > 1<<(extra-1) || - (dfrac == 1<<(extra-1) && !d0) || - (dfrac == 1<<(extra-1) && d0 && di&1 == 1) - } else { - // otherwise, d+1/2 always rounds up because - // we truncated below. - roundUp = dfrac>>(extra-1) == 1 - } - if dfrac != 0 { - d0 = false - } - // Proceed to the requested number of digits - formatDecimal(d, uint64(di), !d0, roundUp, prec) - // Adjust exponent - d.dp -= q -} - -// ryuFtoaFixed64 formats mant*(2^exp) with prec decimal digits. -func ryuFtoaFixed64(d *decimalSlice, mant uint64, exp int, prec int) { - if prec > 18 { - panic("ryuFtoaFixed64 called with prec > 18") - } - // Zero input. - if mant == 0 { - d.nd, d.dp = 0, 0 - return - } - // Renormalize to a 55-bit mantissa. 
- e2 := exp - if b := bits.Len64(mant); b < 55 { - mant = mant << uint(55-b) - e2 += b - 55 - } - // Choose an exponent such that rounded mant*(2^e2)*(10^q) has - // at least prec decimal digits, i.e - // mant*(2^e2)*(10^q) >= 10^(prec-1) - // Because mant >= 2^54, it is enough to choose: - // 2^(e2+54) >= 10^(-q+prec-1) - // or q = -mulLog10_2(e2+54) + prec - 1 - // - // The minimal required exponent is -mulLog10_2(1025)+18 = -291 - // The maximal required exponent is mulLog10_2(1074)+18 = 342 - q := -mulLog10_2(e2+54) + prec - 1 - - // Now compute mant*(2^e2)*(10^q). - // Is it an exact computation? - // Only small positive powers of 10 are exact (5^55 has 128 bits). - exact := q <= 55 && q >= 0 - - di, dexp2, d0 := mult128bitPow10(mant, e2, q) - if dexp2 >= 0 { - panic("not enough significant bits after mult128bitPow10") - } - // As a special case, computation might still be exact, if exponent - // was negative and if it amounts to computing an exact division. - // In that case, we ignore all lower bits. - // Note that division by 10^23 cannot be exact as 5^23 has 54 bits. - if q < 0 && q >= -22 && divisibleByPower5(mant, -q) { - exact = true - d0 = true - } - // Remove extra lower bits and keep rounding info. - extra := uint(-dexp2) - extraMask := uint64(1<<extra - 1) - - di, dfrac := di>>extra, di&extraMask - roundUp := false - if exact { - // If we computed an exact product, d + 1/2 - // should round to d+1 if 'd' is odd. - roundUp = dfrac > 1<<(extra-1) || - (dfrac == 1<<(extra-1) && !d0) || - (dfrac == 1<<(extra-1) && d0 && di&1 == 1) - } else { - // otherwise, d+1/2 always rounds up because - // we truncated below. 
- roundUp = dfrac>>(extra-1) == 1 - } - if dfrac != 0 { - d0 = false - } - // Proceed to the requested number of digits - formatDecimal(d, di, !d0, roundUp, prec) - // Adjust exponent - d.dp -= q -} - -var uint64pow10 = [...]uint64{ - 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, - 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, -} - -// formatDecimal fills d with at most prec decimal digits -// of mantissa m. The boolean trunc indicates whether m -// is truncated compared to the original number being formatted. -func formatDecimal(d *decimalSlice, m uint64, trunc bool, roundUp bool, prec int) { - max := uint64pow10[prec] - trimmed := 0 - for m >= max { - a, b := m/10, m%10 - m = a - trimmed++ - if b > 5 { - roundUp = true - } else if b < 5 { - roundUp = false - } else { // b == 5 - // round up if there are trailing digits, - // or if the new value of m is odd (round-to-even convention) - roundUp = trunc || m&1 == 1 - } - if b != 0 { - trunc = true - } - } - if roundUp { - m++ - } - if m >= max { - // Happens if di was originally 99999....xx - m /= 10 - trimmed++ - } - // render digits - formatBase10(d.d[:prec], m) - d.nd = prec - for d.d[d.nd-1] == '0' { - d.nd-- - trimmed++ - } - d.dp = d.nd + trimmed -} // ryuFtoaShortest formats mant*2^exp with prec decimal digits. func ryuFtoaShortest(d *decimalSlice, mant uint64, exp int, flt *floatInfo) { @@ -249,13 +57,13 @@ func ryuFtoaShortest(d *decimalSlice, mant uint64, exp int, flt *floatInfo) { if q < 0 && q >= -24 { // Division by a power of ten may be exact. // (note that 5^25 is a 59-bit number so division by 5^25 is never exact). 
- if divisibleByPower5(ml, -q) { + if divisiblePow5(ml, -q) { dl0 = true } - if divisibleByPower5(mc, -q) { + if divisiblePow5(mc, -q) { dc0 = true } - if divisibleByPower5(mu, -q) { + if divisiblePow5(mu, -q) { du0 = true } } @@ -497,16 +305,3 @@ func mult128bitPow10(m uint64, e2, q int) (resM uint64, resE int, exact bool) { hi, mid, lo := umul192(m, pow) return hi<<9 | mid>>55, e2, mid<<9 == 0 && lo == 0 } - -func divisibleByPower5(m uint64, k int) bool { - if m == 0 { - return true - } - for i := 0; i < k; i++ { - if m%5 != 0 { - return false - } - m /= 5 - } - return true -} diff --git a/src/internal/strconv/import_test.go b/src/internal/strconv/import_test.go index ed1015ee5d..3dab2bf9e5 100644 --- a/src/internal/strconv/import_test.go +++ b/src/internal/strconv/import_test.go @@ -20,4 +20,7 @@ var ( pow10 = Pow10 umul128 = Umul128 umul192 = Umul192 + div5Tab = Div5Tab + divisiblePow5 = DivisiblePow5 + trimZeros = TrimZeros ) diff --git a/src/internal/strconv/math.go b/src/internal/strconv/math.go index 37303d76db..3b884e846a 100644 --- a/src/internal/strconv/math.go +++ b/src/internal/strconv/math.go @@ -56,3 +56,124 @@ func mulLog2_10(x int) int { // log(10)/log(2) ≈ 3.32192809489 ≈ 108853 / 2^15 return (x * 108853) >> 15 } + +func bool2uint(b bool) uint { + if b { + return 1 + } + return 0 +} + +// Exact Division and Remainder Checking +// +// An exact division x/c (exact means x%c == 0) +// can be implemented by x*m where m is the multiplicative inverse of c (m*c == 1). +// +// Since c is also the multiplicative inverse of m, x*m is lossless, +// and all the exact multiples of c map to all of [0, maxUint64/c]. +// The non-multiples are forced to map to larger values. +// This also gives a quick test for whether x is an exact multiple of c: +// compute the exact division and check whether it's at most maxUint64/c: +// x%c == 0 => x*m <= maxUint64/c. +// +// Only odd c have multiplicative inverses mod powers of two. 
+// To do an exact divide x / (c<<s), we can use (x*m)>>s instead. +// And to check for remainder, we need to check that those low s +// bits are all zero before we shift them away. We can merge that +// with the <= for the exact odd remainder check by rotating the +// shifted bits into the high part instead: +// x%(c<<s) == 0 => bits.RotateLeft64(x*m, -s) <= maxUint64/(c<<s). +// +// The compiler does this transformation automatically in general, +// but we apply it here by hand in a few ways that the compiler can't help with. +// +// For a more detailed explanation, see +// Henry S. Warren, Jr., Hacker's Delight, 2nd ed., sections 10-16 and 10-17. + +// divisiblePow5 reports whether x is divisible by 5^p. +// It returns false for p not in [1, 22], +// because we only care about float64 mantissas, and 5^23 > 2^53. +func divisiblePow5(x uint64, p int) bool { + return 1 <= p && p <= 22 && x*div5Tab[p-1][0] <= div5Tab[p-1][1] +} + +const maxUint64 = 1<<64 - 1 + +// div5Tab[p-1] is the multiplicative inverse of 5^p and maxUint64/5^p. 
+var div5Tab = [22][2]uint64{ + {0xcccccccccccccccd, maxUint64 / 5}, + {0x8f5c28f5c28f5c29, maxUint64 / 5 / 5}, + {0x1cac083126e978d5, maxUint64 / 5 / 5 / 5}, + {0xd288ce703afb7e91, maxUint64 / 5 / 5 / 5 / 5}, + {0x5d4e8fb00bcbe61d, maxUint64 / 5 / 5 / 5 / 5 / 5}, + {0x790fb65668c26139, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5}, + {0xe5032477ae8d46a5, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0xc767074b22e90e21, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0x8e47ce423a2e9c6d, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0x4fa7f60d3ed61f49, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0x0fee64690c913975, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0x3662e0e1cf503eb1, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0xa47a2cf9f6433fbd, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0x54186f653140a659, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0x7738164770402145, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0xe4a4d1417cd9a041, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0xc75429d9e5c5200d, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0xc1773b91fac10669, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0x26b172506559ce15, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0xd489e3a9addec2d1, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0x90e860bb892c8d5d, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, + {0x502e79bf1b6f4f79, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5}, +} + +// trimZeros trims trailing zeros from x. +// It finds the largest p such that x % 10^p == 0 +// and then returns x / 10^p, p. 
+// +// This is here for reference and tested, because it is an optimization +// used by other ftoa algorithms, but in our implementations it has +// never been benchmarked to be faster than trimming zeros after +// formatting into decimal bytes. +func trimZeros(x uint64) (uint64, int) { + const ( + div1e8m = 0xc767074b22e90e21 + div1e8le = maxUint64 / 100000000 + + div1e4m = 0xd288ce703afb7e91 + div1e4le = maxUint64 / 10000 + + div1e2m = 0x8f5c28f5c28f5c29 + div1e2le = maxUint64 / 100 + + div1e1m = 0xcccccccccccccccd + div1e1le = maxUint64 / 10 + ) + + // _ = assert[x - y] asserts at compile time that x == y. + // Assert that the multiplicative inverses are correct + // by checking that (div1eNm * 5^N) % 1<<64 == 1. + var assert [1]struct{} + _ = assert[(div1e8m*5*5*5*5*5*5*5*5)%(1<<64)-1] + _ = assert[(div1e4m*5*5*5*5)%(1<<64)-1] + _ = assert[(div1e2m*5*5)%(1<<64)-1] + _ = assert[(div1e1m*5)%(1<<64)-1] + + // Cut 8 zeros, then 4, then 2, then 1. + p := 0 + for d := bits.RotateLeft64(x*div1e8m, -8); d <= div1e8le; d = bits.RotateLeft64(x*div1e8m, -8) { + x = d + p += 8 + } + if d := bits.RotateLeft64(x*div1e4m, -4); d <= div1e4le { + x = d + p += 4 + } + if d := bits.RotateLeft64(x*div1e2m, -2); d <= div1e2le { + x = d + p += 2 + } + if d := bits.RotateLeft64(x*div1e1m, -1); d <= div1e1le { + x = d + p += 1 + } + return x, p +} diff --git a/src/internal/strconv/math_test.go b/src/internal/strconv/math_test.go index 3a1ff3400c..55e25f98cf 100644 --- a/src/internal/strconv/math_test.go +++ b/src/internal/strconv/math_test.go @@ -93,3 +93,73 @@ func TestMulLog2_10(t *testing.T) { } } } + +func pow5(p int) uint64 { + x := uint64(1) + for range p { + x *= 5 + } + return x +} + +func TestDivisiblePow5(t *testing.T) { + for p := 1; p <= 22; p++ { + x := pow5(p) + if divisiblePow5(1, p) { + t.Errorf("divisiblePow5(1, %d) = true, want false", p) + } + if divisiblePow5(x-1, p) { + t.Errorf("divisiblePow5(%d, %d) = true, want false", x-1, p) + } + if divisiblePow5(x+1, p) { 
+ t.Errorf("divisiblePow5(%d, %d) = true, want false", x+1, p) + } + if divisiblePow5(x/5, p) { + t.Errorf("divisiblePow5(%d, %d) = true, want false", x/5, p) + } + if !divisiblePow5(0, p) { + t.Errorf("divisiblePow5(0, %d) = false, want true", p) + } + if !divisiblePow5(x, p) { + t.Errorf("divisiblePow5(%d, %d) = false, want true", x, p) + } + if 2*x > x && !divisiblePow5(2*x, p) { + t.Errorf("divisiblePow5(%d, %d) = false, want true", 2*x, p) + } + } +} + +func TestDiv5Tab(t *testing.T) { + for p := 1; p <= 22; p++ { + m := div5Tab[p-1][0] + le := div5Tab[p-1][1] + + // See comment in math.go on div5Tab. + // m needs to be multiplicative inverse of pow5(p). + if m*pow5(p) != 1 { + t.Errorf("div5Tab[%d-1][0] = %#x, but %#x * (5**%d) = %d, want 1", p, m, m, p, m*pow5(p)) + } + + // le needs to be ⌊(1<<64 - 1) / 5^p⌋. + want := (1<<64 - 1) / pow5(p) + if le != want { + t.Errorf("div5Tab[%d-1][1] = %#x, want %#x", p, le, want) + } + } +} + +func TestTrimZeros(t *testing.T) { + for _, x := range []uint64{1, 2, 3, 4, 101, 123} { + want := x + for p := range 20 { + haveX, haveP := trimZeros(x) + if haveX != want || haveP != p { + t.Errorf("trimZeros(%d) = %d, %d, want %d, %d", x, haveX, haveP, want, p) + } + if x >= (1<<64-1)/10 { + break + } + x *= 10 + } + } +}