From: Russ Cox <rsc@golang.org>
Date: Sat, 1 Nov 2025 13:41:40 +0000 (-0400)
Subject: internal/strconv: extract fixed-precision ftoa from ftoaryu.go
X-Git-Tag: go1.26rc1~384
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=34fec512ce;p=gostls13.git

internal/strconv: extract fixed-precision ftoa from ftoaryu.go

The fixed-precision ftoa algorithm is not actually
documented in the Ryū paper, and it is fairly
straightforward: multiply by a power of 10 to get
an integer that contains the digits we need.
There is also no need for separate float32 and float64
implementations.

This CL implements a new fixedFtoa, separate from Ryū.
The overall algorithm is the same, but the new code
is simpler, faster, and better documented.

Now ftoaryu.go is only about shortest-output formatting,
so if and when yet another algorithm comes along, it will
be clearer what should be replaced (all of ftoaryu.go)
and what should not (all of ftoafixed.go).

benchmark \ host                  linux-arm64    local  linux-amd64       s7  linux-386  s7:GOARCH=386
                                      vs base  vs base      vs base  vs base    vs base        vs base
AppendFloat/Decimal                    -0.18%        ~            ~   -0.68%     +0.49%         -0.79%
AppendFloat/Float                      +0.09%        ~       +1.50%   +0.84%     -0.37%         -0.69%
AppendFloat/Exp                        -0.51%        ~            ~   +1.20%     -1.27%         -1.01%
AppendFloat/NegExp                     -1.01%        ~       +3.43%   +1.35%     -2.33%              ~
AppendFloat/LongExp                    -1.22%   +0.77%            ~        ~     -1.48%              ~
AppendFloat/Big                        -2.07%        ~       -2.07%   -1.97%     -2.89%         -2.93%
AppendFloat/BinaryExp                  -0.28%   +1.06%            ~   +1.35%     -0.64%         -1.64%
AppendFloat/32Integer                       ~        ~            ~   -0.79%          ~         -0.66%
AppendFloat/32ExactFraction            -0.50%        ~       +5.69%        ~     -1.24%         +0.69%
AppendFloat/32Point                         ~   -1.19%       +2.59%   +1.03%     -1.37%         +0.80%
AppendFloat/32Exp                      -3.39%   -2.79%       -8.36%   -0.94%     -5.72%         -5.92%
AppendFloat/32NegExp                   -0.63%        ~            ~   +0.98%     -1.34%         -0.73%
AppendFloat/32Shortest                 -1.00%   +1.36%       +2.94%        ~          ~              ~
AppendFloat/32Fixed8Hard               -5.91%  -12.45%       -6.62%        ~    +18.46%        +11.61%
AppendFloat/32Fixed9Hard               -6.53%  -11.35%       -6.01%   -0.97%    -18.31%         -9.16%
AppendFloat/64Fixed1                  -13.84%  -16.90%      -13.13%  -10.71%    -24.52%        -18.94%
AppendFloat/64Fixed2                  -11.12%  -16.97%      -12.13%   -9.88%    -22.73%        -15.48%
AppendFloat/64Fixed2.5                -21.98%  -20.75%      -19.08%  -14.74%    -28.11%        -24.92%
AppendFloat/64Fixed3                  -11.53%  -16.21%      -10.75%   -7.53%    -23.11%        -15.78%
AppendFloat/64Fixed4                  -12.89%  -12.36%      -11.07%   -9.79%    -14.51%        -13.44%
AppendFloat/64Fixed5Hard              -47.62%  -38.59%      -40.83%  -37.06%    -60.51%        -55.29%
AppendFloat/64Fixed12                  -7.40%        ~       -8.56%   -4.31%    -13.82%         -8.61%
AppendFloat/64Fixed16                  -9.10%   -8.95%       -6.92%   -3.92%    -12.99%         -9.03%
AppendFloat/64Fixed12Hard              -9.14%   -5.24%       -6.23%   -4.82%    -13.58%         -8.99%
AppendFloat/64Fixed17Hard              -6.80%        ~       -4.03%   -2.84%    -19.81%        -10.27%
AppendFloat/64Fixed18Hard              -0.12%        ~            ~        ~          ~              ~
AppendFloat/64FixedF1                       ~        ~            ~        ~     -0.40%         +2.72%
AppendFloat/64FixedF2                  -0.18%        ~       -1.98%   -0.95%          ~         +1.25%
AppendFloat/64FixedF3                  -0.29%        ~            ~        ~          ~         +1.22%
AppendFloat/Slowpath64                 -1.16%        ~            ~        ~          ~         -2.16%
AppendFloat/SlowpathDenormal64         -1.09%        ~            ~   -0.88%     -0.83%              ~

host: linux-arm64
goos: linux
goarch: arm64
pkg: internal/strconv
cpu: unknown
                                 │ 14b7e09f493  │             f9bf7fcb8e2             │
                                 │    sec/op    │   sec/op     vs base                │
AppendFloat/Decimal-8               60.35n ± 0%   60.24n ± 0%   -0.18% (p=0.000 n=20)
AppendFloat/Float-8                 88.83n ± 0%   88.91n ± 0%   +0.09% (p=0.000 n=20)
AppendFloat/Exp-8                   93.55n ± 0%   93.06n ± 0%   -0.51% (p=0.000 n=20)
AppendFloat/NegExp-8                94.01n ± 0%   93.06n ± 0%   -1.01% (p=0.000 n=20)
AppendFloat/LongExp-8              101.00n ± 0%   99.77n ± 0%   -1.22% (p=0.000 n=20)
AppendFloat/Big-8                   106.1n ± 0%   103.9n ± 0%   -2.07% (p=0.000 n=20)
AppendFloat/BinaryExp-8             47.48n ± 0%   47.35n ± 0%   -0.28% (p=0.000 n=20)
AppendFloat/32Integer-8             60.45n ± 0%   60.43n ± 0%        ~ (p=0.150 n=20)
AppendFloat/32ExactFraction-8       86.65n ± 0%   86.22n ± 0%   -0.50% (p=0.000 n=20)
AppendFloat/32Point-8               83.26n ± 0%   83.21n ± 0%        ~ (p=0.046 n=20)
AppendFloat/32Exp-8                 92.55n ± 0%   89.42n ± 0%   -3.39% (p=0.000 n=20)
AppendFloat/32NegExp-8              87.89n ± 0%   87.34n ± 0%   -0.63% (p=0.000 n=20)
AppendFloat/32Shortest-8            77.05n ± 0%   76.28n ± 0%   -1.00% (p=0.000 n=20)
AppendFloat/32Fixed8Hard-8          55.73n ± 0%   52.44n ± 0%   -5.91% (p=0.000 n=20)
AppendFloat/32Fixed9Hard-8          64.80n ± 0%   60.57n ± 0%   -6.53% (p=0.000 n=20)
AppendFloat/64Fixed1-8              53.72n ± 0%   46.29n ± 0%  -13.84% (p=0.000 n=20)
AppendFloat/64Fixed2-8              52.64n ± 0%   46.79n ± 0%  -11.12% (p=0.000 n=20)
AppendFloat/64Fixed2.5-8            56.01n ± 0%   43.70n ± 0%  -21.98% (p=0.000 n=20)
AppendFloat/64Fixed3-8              53.38n ± 0%   47.23n ± 0%  -11.53% (p=0.000 n=20)
AppendFloat/64Fixed4-8              50.62n ± 0%   44.10n ± 0%  -12.89% (p=0.000 n=20)
AppendFloat/64Fixed5Hard-8          98.94n ± 0%   51.82n ± 0%  -47.62% (p=0.000 n=20)
AppendFloat/64Fixed12-8             84.70n ± 0%   78.44n ± 0%   -7.40% (p=0.000 n=20)
AppendFloat/64Fixed16-8             71.68n ± 0%   65.16n ± 0%   -9.10% (p=0.000 n=20)
AppendFloat/64Fixed12Hard-8         68.41n ± 0%   62.16n ± 0%   -9.14% (p=0.000 n=20)
AppendFloat/64Fixed17Hard-8         79.31n ± 0%   73.92n ± 0%   -6.80% (p=0.000 n=20)
AppendFloat/64Fixed18Hard-8         4.290µ ± 0%   4.285µ ± 0%   -0.12% (p=0.000 n=20)
AppendFloat/64FixedF1-8             216.0n ± 0%   216.1n ± 0%        ~ (p=0.090 n=20)
AppendFloat/64FixedF2-8             228.2n ± 0%   227.8n ± 0%   -0.18% (p=0.000 n=20)
AppendFloat/64FixedF3-8             208.8n ± 0%   208.2n ± 0%   -0.29% (p=0.000 n=20)
AppendFloat/Slowpath64-8            98.56n ± 0%   97.42n ± 0%   -1.16% (p=0.000 n=20)
AppendFloat/SlowpathDenormal64-8    95.81n ± 0%   94.77n ± 0%   -1.09% (p=0.000 n=20)
geomean                             93.81n        87.87n        -6.33%

host: local
goos: darwin
cpu: Apple M3 Pro
                                  │ 14b7e09f493 │             f9bf7fcb8e2              │
                                  │   sec/op    │    sec/op     vs base                │
AppendFloat/Decimal-12              21.14n ± 0%   21.15n ±  0%        ~ (p=0.963 n=20)
AppendFloat/Float-12                32.48n ± 1%   32.43n ±  0%        ~ (p=0.358 n=20)
AppendFloat/Exp-12                  31.85n ± 0%   31.94n ±  1%        ~ (p=0.634 n=20)
AppendFloat/NegExp-12               31.75n ± 0%   32.04n ±  0%        ~ (p=0.004 n=20)
AppendFloat/LongExp-12              33.55n ± 0%   33.81n ±  0%   +0.77% (p=0.000 n=20)
AppendFloat/Big-12                  35.62n ± 1%   35.73n ±  1%        ~ (p=0.888 n=20)
AppendFloat/BinaryExp-12            19.26n ± 0%   19.46n ±  1%   +1.06% (p=0.000 n=20)
AppendFloat/32Integer-12            21.41n ± 0%   21.46n ±  1%        ~ (p=0.733 n=20)
AppendFloat/32ExactFraction-12      31.23n ± 1%   31.30n ±  1%        ~ (p=0.857 n=20)
AppendFloat/32Point-12              31.39n ± 1%   31.02n ±  0%   -1.19% (p=0.000 n=20)
AppendFloat/32Exp-12                32.42n ± 1%   31.52n ±  1%   -2.79% (p=0.000 n=20)
AppendFloat/32NegExp-12             30.66n ± 1%   30.66n ±  1%        ~ (p=0.380 n=20)
AppendFloat/32Shortest-12           26.88n ± 1%   27.25n ±  1%   +1.36% (p=0.000 n=20)
AppendFloat/32Fixed8Hard-12         19.52n ± 0%   17.09n ±  1%  -12.45% (p=0.000 n=20)
AppendFloat/32Fixed9Hard-12         21.55n ± 2%   19.11n ±  1%  -11.35% (p=0.000 n=20)
AppendFloat/64Fixed1-12             18.64n ± 0%   15.49n ±  0%  -16.90% (p=0.000 n=20)
AppendFloat/64Fixed2-12             18.65n ± 0%   15.49n ±  0%  -16.97% (p=0.000 n=20)
AppendFloat/64Fixed2.5-12           19.23n ± 1%   15.24n ±  0%  -20.75% (p=0.000 n=20)
AppendFloat/64Fixed3-12             18.61n ± 0%   15.59n ±  1%  -16.21% (p=0.000 n=20)
AppendFloat/64Fixed4-12             17.55n ± 1%   15.38n ±  0%  -12.36% (p=0.000 n=20)
AppendFloat/64Fixed5Hard-12         29.27n ± 1%   17.97n ±  0%  -38.59% (p=0.000 n=20)
AppendFloat/64Fixed12-12            28.26n ± 1%   28.17n ± 10%        ~ (p=0.941 n=20)
AppendFloat/64Fixed16-12            23.56n ± 0%   21.46n ±  0%   -8.95% (p=0.000 n=20)
AppendFloat/64Fixed12Hard-12        21.85n ± 2%   20.70n ±  1%   -5.24% (p=0.000 n=20)
AppendFloat/64Fixed17Hard-12        26.91n ± 1%   27.10n ±  0%        ~ (p=0.059 n=20)
AppendFloat/64Fixed18Hard-12        2.197µ ± 1%   2.169µ ±  1%        ~ (p=0.013 n=20)
AppendFloat/64FixedF1-12            103.7n ± 1%   103.3n ±  0%        ~ (p=0.035 n=20)
AppendFloat/64FixedF2-12            114.8n ± 1%   114.1n ±  1%        ~ (p=0.234 n=20)
AppendFloat/64FixedF3-12            107.8n ± 1%   107.1n ±  1%        ~ (p=0.180 n=20)
AppendFloat/Slowpath64-12           32.05n ± 1%   32.00n ±  0%        ~ (p=0.952 n=20)
AppendFloat/SlowpathDenormal64-12   29.98n ± 1%   30.20n ±  0%        ~ (p=0.004 n=20)
geomean                             33.83n        31.91n         -5.68%

host: linux-amd64
goos: linux
goarch: amd64
cpu: Intel(R) Xeon(R) CPU @ 2.30GHz
                                  │ 14b7e09f493  │             f9bf7fcb8e2              │
                                  │    sec/op    │    sec/op     vs base                │
AppendFloat/Decimal-16               64.00n ± 1%    63.67n ± 1%        ~ (p=0.784 n=20)
AppendFloat/Float-16                 95.99n ± 1%    97.42n ± 1%   +1.50% (p=0.000 n=20)
AppendFloat/Exp-16                   97.59n ± 1%    97.72n ± 1%        ~ (p=0.984 n=20)
AppendFloat/NegExp-16                97.80n ± 1%   101.15n ± 1%   +3.43% (p=0.000 n=20)
AppendFloat/LongExp-16               103.1n ± 1%    104.5n ± 1%        ~ (p=0.006 n=20)
AppendFloat/Big-16                   110.8n ± 1%    108.5n ± 1%   -2.07% (p=0.000 n=20)
AppendFloat/BinaryExp-16             47.82n ± 1%    47.33n ± 1%        ~ (p=0.007 n=20)
AppendFloat/32Integer-16             63.65n ± 1%    63.51n ± 0%        ~ (p=0.560 n=20)
AppendFloat/32ExactFraction-16       91.81n ± 1%    97.03n ± 1%   +5.69% (p=0.000 n=20)
AppendFloat/32Point-16               89.84n ± 1%    92.16n ± 1%   +2.59% (p=0.000 n=20)
AppendFloat/32Exp-16                103.80n ± 1%    95.12n ± 1%   -8.36% (p=0.000 n=20)
AppendFloat/32NegExp-16              93.70n ± 1%    94.87n ± 1%        ~ (p=0.003 n=20)
AppendFloat/32Shortest-16            83.98n ± 1%    86.45n ± 1%   +2.94% (p=0.000 n=20)
AppendFloat/32Fixed8Hard-16          61.91n ± 1%    57.81n ± 1%   -6.62% (p=0.000 n=20)
AppendFloat/32Fixed9Hard-16          71.08n ± 0%    66.81n ± 1%   -6.01% (p=0.000 n=20)
AppendFloat/64Fixed1-16              59.27n ± 2%    51.49n ± 1%  -13.13% (p=0.000 n=20)
AppendFloat/64Fixed2-16              57.89n ± 1%    50.87n ± 1%  -12.13% (p=0.000 n=20)
AppendFloat/64Fixed2.5-16            61.04n ± 1%    49.40n ± 1%  -19.08% (p=0.000 n=20)
AppendFloat/64Fixed3-16              58.42n ± 1%    52.14n ± 1%  -10.75% (p=0.000 n=20)
AppendFloat/64Fixed4-16              56.52n ± 1%    50.27n ± 1%  -11.07% (p=0.000 n=20)
AppendFloat/64Fixed5Hard-16          97.79n ± 1%    57.86n ± 1%  -40.83% (p=0.000 n=20)
AppendFloat/64Fixed12-16             90.78n ± 1%    83.01n ± 1%   -8.56% (p=0.000 n=20)
AppendFloat/64Fixed16-16             76.11n ± 1%    70.84n ± 0%   -6.92% (p=0.000 n=20)
AppendFloat/64Fixed12Hard-16         73.56n ± 1%    68.98n ± 2%   -6.23% (p=0.000 n=20)
AppendFloat/64Fixed17Hard-16         83.20n ± 1%    79.85n ± 1%   -4.03% (p=0.000 n=20)
AppendFloat/64Fixed18Hard-16         4.947µ ± 1%    4.915µ ± 1%        ~ (p=0.229 n=20)
AppendFloat/64FixedF1-16             242.4n ± 1%    239.4n ± 1%        ~ (p=0.038 n=20)
AppendFloat/64FixedF2-16             257.7n ± 2%    252.6n ± 1%   -1.98% (p=0.000 n=20)
AppendFloat/64FixedF3-16             237.5n ± 0%    237.5n ± 1%        ~ (p=0.440 n=20)
AppendFloat/Slowpath64-16            99.75n ± 1%    99.78n ± 1%        ~ (p=0.995 n=20)
AppendFloat/SlowpathDenormal64-16    97.41n ± 1%    98.20n ± 1%        ~ (p=0.006 n=20)
geomean                              100.7n         95.60n        -5.05%

host: s7
cpu: AMD Ryzen 9 7950X 16-Core Processor
                                  │ 14b7e09f493 │             f9bf7fcb8e2             │
                                  │   sec/op    │   sec/op     vs base                │
AppendFloat/Decimal-32              22.19n ± 0%   22.04n ± 0%   -0.68% (p=0.000 n=20)
AppendFloat/Float-32                34.59n ± 0%   34.88n ± 0%   +0.84% (p=0.000 n=20)
AppendFloat/Exp-32                  34.47n ± 0%   34.88n ± 0%   +1.20% (p=0.000 n=20)
AppendFloat/NegExp-32               34.85n ± 0%   35.32n ± 0%   +1.35% (p=0.000 n=20)
AppendFloat/LongExp-32              37.23n ± 0%   37.09n ± 0%        ~ (p=0.003 n=20)
AppendFloat/Big-32                  39.27n ± 0%   38.50n ± 0%   -1.97% (p=0.000 n=20)
AppendFloat/BinaryExp-32            17.38n ± 0%   17.61n ± 0%   +1.35% (p=0.000 n=20)
AppendFloat/32Integer-32            22.26n ± 0%   22.08n ± 0%   -0.79% (p=0.000 n=20)
AppendFloat/32ExactFraction-32      32.82n ± 0%   32.91n ± 0%        ~ (p=0.018 n=20)
AppendFloat/32Point-32              32.88n ± 0%   33.22n ± 0%   +1.03% (p=0.000 n=20)
AppendFloat/32Exp-32                34.95n ± 0%   34.62n ± 0%   -0.94% (p=0.000 n=20)
AppendFloat/32NegExp-32             33.23n ± 0%   33.55n ± 0%   +0.98% (p=0.000 n=20)
AppendFloat/32Shortest-32           30.19n ± 0%   30.12n ± 0%        ~ (p=0.122 n=20)
AppendFloat/32Fixed8Hard-32         22.94n ± 0%   22.88n ± 0%        ~ (p=0.124 n=20)
AppendFloat/32Fixed9Hard-32         26.20n ± 0%   25.94n ± 1%   -0.97% (p=0.000 n=20)
AppendFloat/64Fixed1-32             21.10n ± 0%   18.84n ± 0%  -10.71% (p=0.000 n=20)
AppendFloat/64Fixed2-32             20.75n ± 0%   18.70n ± 0%   -9.88% (p=0.000 n=20)
AppendFloat/64Fixed2.5-32           21.07n ± 0%   17.96n ± 0%  -14.74% (p=0.000 n=20)
AppendFloat/64Fixed3-32             21.24n ± 0%   19.64n ± 0%   -7.53% (p=0.000 n=20)
AppendFloat/64Fixed4-32             20.63n ± 0%   18.61n ± 0%   -9.79% (p=0.000 n=20)
AppendFloat/64Fixed5Hard-32         34.48n ± 0%   21.70n ± 0%  -37.06% (p=0.000 n=20)
AppendFloat/64Fixed12-32            32.26n ± 0%   30.87n ± 1%   -4.31% (p=0.000 n=20)
AppendFloat/64Fixed16-32            27.95n ± 0%   26.86n ± 0%   -3.92% (p=0.000 n=20)
AppendFloat/64Fixed12Hard-32        27.30n ± 0%   25.98n ± 1%   -4.82% (p=0.000 n=20)
AppendFloat/64Fixed17Hard-32        30.80n ± 0%   29.93n ± 0%   -2.84% (p=0.000 n=20)
AppendFloat/64Fixed18Hard-32        1.833µ ± 0%   1.831µ ± 0%        ~ (p=0.663 n=20)
AppendFloat/64FixedF1-32            83.42n ± 1%   84.00n ± 1%        ~ (p=0.003 n=20)
AppendFloat/64FixedF2-32            90.10n ± 0%   89.23n ± 1%   -0.95% (p=0.001 n=20)
AppendFloat/64FixedF3-32            84.42n ± 1%   84.39n ± 0%        ~ (p=0.878 n=20)
AppendFloat/Slowpath64-32           35.72n ± 0%   35.59n ± 0%        ~ (p=0.007 n=20)
AppendFloat/SlowpathDenormal64-32   35.36n ± 0%   35.05n ± 0%   -0.88% (p=0.000 n=20)
geomean                             36.05n        34.69n        -3.77%

host: linux-386
goarch: 386
cpu: Intel(R) Xeon(R) CPU @ 2.30GHz
                                  │ 14b7e09f493 │             f9bf7fcb8e2             │
                                  │   sec/op    │   sec/op     vs base                │
AppendFloat/Decimal-16              132.8n ± 0%   133.5n ± 0%   +0.49% (p=0.001 n=20)
AppendFloat/Float-16                242.6n ± 0%   241.7n ± 0%   -0.37% (p=0.000 n=20)
AppendFloat/Exp-16                  252.2n ± 0%   249.1n ± 0%   -1.27% (p=0.000 n=20)
AppendFloat/NegExp-16               253.6n ± 0%   247.7n ± 0%   -2.33% (p=0.000 n=20)
AppendFloat/LongExp-16              260.9n ± 0%   257.1n ± 0%   -1.48% (p=0.000 n=20)
AppendFloat/Big-16                  293.7n ± 0%   285.2n ± 0%   -2.89% (p=0.000 n=20)
AppendFloat/BinaryExp-16            89.63n ± 1%   89.06n ± 0%   -0.64% (p=0.000 n=20)
AppendFloat/32Integer-16            132.6n ± 0%   133.2n ± 0%        ~ (p=0.016 n=20)
AppendFloat/32ExactFraction-16      216.9n ± 0%   214.2n ± 0%   -1.24% (p=0.000 n=20)
AppendFloat/32Point-16              205.0n ± 0%   202.2n ± 0%   -1.37% (p=0.000 n=20)
AppendFloat/32Exp-16                250.2n ± 0%   235.9n ± 0%   -5.72% (p=0.000 n=20)
AppendFloat/32NegExp-16             213.5n ± 0%   210.6n ± 0%   -1.34% (p=0.000 n=20)
AppendFloat/32Shortest-16           198.3n ± 0%   197.8n ± 0%        ~ (p=0.147 n=20)
AppendFloat/32Fixed8Hard-16         114.9n ± 1%   136.0n ± 1%  +18.46% (p=0.000 n=20)
AppendFloat/32Fixed9Hard-16         189.8n ± 0%   155.0n ± 1%  -18.31% (p=0.000 n=20)
AppendFloat/64Fixed1-16             175.8n ± 0%   132.7n ± 0%  -24.52% (p=0.000 n=20)
AppendFloat/64Fixed2-16             166.6n ± 0%   128.7n ± 0%  -22.73% (p=0.000 n=20)
AppendFloat/64Fixed2.5-16           176.5n ± 0%   126.8n ± 0%  -28.11% (p=0.000 n=20)
AppendFloat/64Fixed3-16             165.3n ± 0%   127.1n ± 0%  -23.11% (p=0.000 n=20)
AppendFloat/64Fixed4-16             141.3n ± 0%   120.8n ± 1%  -14.51% (p=0.000 n=20)
AppendFloat/64Fixed5Hard-16         344.6n ± 0%   136.0n ± 0%  -60.51% (p=0.000 n=20)
AppendFloat/64Fixed12-16            184.2n ± 0%   158.7n ± 0%  -13.82% (p=0.000 n=20)
AppendFloat/64Fixed16-16            174.0n ± 0%   151.3n ± 0%  -12.99% (p=0.000 n=20)
AppendFloat/64Fixed12Hard-16        169.7n ± 0%   146.7n ± 0%  -13.58% (p=0.000 n=20)
AppendFloat/64Fixed17Hard-16        207.7n ± 0%   166.6n ± 0%  -19.81% (p=0.000 n=20)
AppendFloat/64Fixed18Hard-16        10.66µ ± 0%   10.63µ ± 0%        ~ (p=0.030 n=20)
AppendFloat/64FixedF1-16            615.9n ± 0%   613.5n ± 0%   -0.40% (p=0.000 n=20)
AppendFloat/64FixedF2-16            846.6n ± 0%   847.4n ± 0%        ~ (p=0.551 n=20)
AppendFloat/64FixedF3-16            609.9n ± 0%   609.5n ± 0%        ~ (p=0.213 n=20)
AppendFloat/Slowpath64-16           254.1n ± 0%   252.6n ± 1%        ~ (p=0.048 n=20)
AppendFloat/SlowpathDenormal64-16   251.5n ± 0%   249.4n ± 0%   -0.83% (p=0.000 n=20)
geomean                             249.2n        225.4n        -9.54%

host: s7:GOARCH=386
cpu: AMD Ryzen 9 7950X 16-Core Processor
                                  │ 14b7e09f493 │             f9bf7fcb8e2             │
                                  │   sec/op    │   sec/op     vs base                │
AppendFloat/Decimal-32              42.65n ± 0%   42.31n ± 0%   -0.79% (p=0.000 n=20)
AppendFloat/Float-32                71.56n ± 0%   71.06n ± 0%   -0.69% (p=0.000 n=20)
AppendFloat/Exp-32                  75.61n ± 1%   74.85n ± 1%   -1.01% (p=0.000 n=20)
AppendFloat/NegExp-32               74.36n ± 0%   74.30n ± 0%        ~ (p=0.482 n=20)
AppendFloat/LongExp-32              75.82n ± 0%   75.73n ± 0%        ~ (p=0.490 n=20)
AppendFloat/Big-32                  85.10n ± 0%   82.61n ± 0%   -2.93% (p=0.000 n=20)
AppendFloat/BinaryExp-32            33.02n ± 0%   32.48n ± 1%   -1.64% (p=0.000 n=20)
AppendFloat/32Integer-32            41.54n ± 1%   41.27n ± 1%   -0.66% (p=0.000 n=20)
AppendFloat/32ExactFraction-32      62.48n ± 0%   62.91n ± 0%   +0.69% (p=0.000 n=20)
AppendFloat/32Point-32              60.17n ± 0%   60.65n ± 0%   +0.80% (p=0.000 n=20)
AppendFloat/32Exp-32                73.34n ± 0%   68.99n ± 0%   -5.92% (p=0.000 n=20)
AppendFloat/32NegExp-32             63.29n ± 0%   62.83n ± 0%   -0.73% (p=0.000 n=20)
AppendFloat/32Shortest-32           58.97n ± 0%   59.07n ± 0%        ~ (p=0.029 n=20)
AppendFloat/32Fixed8Hard-32         37.42n ± 0%   41.76n ± 1%  +11.61% (p=0.000 n=20)
AppendFloat/32Fixed9Hard-32         55.18n ± 0%   50.13n ± 1%   -9.16% (p=0.000 n=20)
AppendFloat/64Fixed1-32             50.89n ± 1%   41.25n ± 0%  -18.94% (p=0.000 n=20)
AppendFloat/64Fixed2-32             48.33n ± 1%   40.85n ± 1%  -15.48% (p=0.000 n=20)
AppendFloat/64Fixed2.5-32           52.46n ± 0%   39.39n ± 0%  -24.92% (p=0.000 n=20)
AppendFloat/64Fixed3-32             48.28n ± 1%   40.66n ± 0%  -15.78% (p=0.000 n=20)
AppendFloat/64Fixed4-32             44.57n ± 0%   38.58n ± 0%  -13.44% (p=0.000 n=20)
AppendFloat/64Fixed5Hard-32         96.16n ± 0%   42.99n ± 1%  -55.29% (p=0.000 n=20)
AppendFloat/64Fixed12-32            56.84n ± 0%   51.95n ± 1%   -8.61% (p=0.000 n=20)
AppendFloat/64Fixed16-32            54.23n ± 0%   49.33n ± 0%   -9.03% (p=0.000 n=20)
AppendFloat/64Fixed12Hard-32        53.47n ± 0%   48.67n ± 0%   -8.99% (p=0.000 n=20)
AppendFloat/64Fixed17Hard-32        61.76n ± 0%   55.42n ± 1%  -10.27% (p=0.000 n=20)
AppendFloat/64Fixed18Hard-32        3.998µ ± 1%   4.001µ ± 0%        ~ (p=0.449 n=20)
AppendFloat/64FixedF1-32            161.8n ± 0%   166.2n ± 1%   +2.72% (p=0.000 n=20)
AppendFloat/64FixedF2-32            223.4n ± 2%   226.2n ± 1%   +1.25% (p=0.000 n=20)
AppendFloat/64FixedF3-32            159.6n ± 0%   161.6n ± 1%   +1.22% (p=0.000 n=20)
AppendFloat/Slowpath64-32           76.69n ± 0%   75.03n ± 0%   -2.16% (p=0.000 n=20)
AppendFloat/SlowpathDenormal64-32   75.02n ± 0%   74.36n ± 1%        ~ (p=0.003 n=20)
geomean                             74.66n        69.39n        -7.06%

Change-Id: I9db46471a93bd2aab3c2796e563d154cb531d4cb
Reviewed-on: https://go-review.googlesource.com/c/go/+/717182
Reviewed-by: Alan Donovan <adonovan@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Russ Cox <rsc@golang.org>
---

diff --git a/src/internal/strconv/atoi.go b/src/internal/strconv/atoi.go
index 5bc259e7e5..4bbcb4f5da 100644
--- a/src/internal/strconv/atoi.go
+++ b/src/internal/strconv/atoi.go
@@ -41,8 +41,6 @@ const intSize = 32 << (^uint(0) >> 63)
 // IntSize is the size in bits of an int or uint value.
 const IntSize = intSize
 
-const maxUint64 = 1<<64 - 1
-
 // ParseUint is like [ParseInt] but for unsigned numbers.
 //
 // A sign prefix is not permitted.
diff --git a/src/internal/strconv/export_test.go b/src/internal/strconv/export_test.go
index 86435f66cf..c879f24480 100644
--- a/src/internal/strconv/export_test.go
+++ b/src/internal/strconv/export_test.go
@@ -18,6 +18,9 @@ var (
 	Pow10            = pow10
 	Umul128          = umul128
 	Umul192          = umul192
+	Div5Tab          = div5Tab
+	DivisiblePow5    = divisiblePow5
+	TrimZeros        = trimZeros
 )
 
 func NewDecimal(i uint64) *decimal {
diff --git a/src/internal/strconv/ftoa.go b/src/internal/strconv/ftoa.go
index 1aec5447ec..fd30f28289 100644
--- a/src/internal/strconv/ftoa.go
+++ b/src/internal/strconv/ftoa.go
@@ -123,16 +123,17 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte {
 		return bigFtoa(dst, prec, fmt, neg, mant, exp, flt)
 	}
 
-	var digs decimalSlice
-	ok := false
 	// Negative precision means "only as much as needed to be exact."
 	shortest := prec < 0
+	var digs decimalSlice
+	if mant == 0 {
+		return formatDigits(dst, shortest, neg, digs, prec, fmt)
+	}
 	if shortest {
 		// Use Ryu algorithm.
 		var buf [32]byte
 		digs.d = buf[:]
 		ryuFtoaShortest(&digs, mant, exp-int(flt.mantbits), flt)
-		ok = true
 		// Precision for shortest representation mode.
 		switch fmt {
 		case 'e', 'E':
@@ -142,7 +143,11 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte {
 		case 'g', 'G':
 			prec = digs.nd
 		}
-	} else if fmt != 'f' {
+		return formatDigits(dst, shortest, neg, digs, prec, fmt)
+	}
+
+	// TODO figure out when we can use fast code for f
+	if fmt != 'f' {
 		// Fixed number of digits.
 		digits := prec
 		switch fmt {
@@ -157,21 +162,15 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte {
 			// Invalid mode.
 			digits = 1
 		}
-		var buf [24]byte
-		if bitSize == 32 && digits <= 9 {
+		if digits <= 18 {
+			var buf [24]byte
 			digs.d = buf[:]
-			ryuFtoaFixed32(&digs, uint32(mant), exp-int(flt.mantbits), digits)
-			ok = true
-		} else if digits <= 18 {
-			digs.d = buf[:]
-			ryuFtoaFixed64(&digs, mant, exp-int(flt.mantbits), digits)
-			ok = true
+			fixedFtoa(&digs, mant, exp-int(flt.mantbits), digits)
+			return formatDigits(dst, false, neg, digs, prec, fmt)
 		}
 	}
-	if !ok {
-		return bigFtoa(dst, prec, fmt, neg, mant, exp, flt)
-	}
-	return formatDigits(dst, shortest, neg, digs, prec, fmt)
+
+	return bigFtoa(dst, prec, fmt, neg, mant, exp, flt)
 }
 
 // bigFtoa uses multiprecision computations to format a float.
diff --git a/src/internal/strconv/ftoa_test.go b/src/internal/strconv/ftoa_test.go
index d510629537..4e6f462928 100644
--- a/src/internal/strconv/ftoa_test.go
+++ b/src/internal/strconv/ftoa_test.go
@@ -177,6 +177,16 @@ var ftoatests = []ftoaTest{
 	{1.801439850948199e+16, 'g', -1, "1.801439850948199e+16"},
 	{5.960464477539063e-08, 'g', -1, "5.960464477539063e-08"},
 	{1.012e-320, 'g', -1, "1.012e-320"},
+
+	// Cases from TestFtoaRandom that caught bugs in fixedFtoa.
+	{8177880169308380. * (1 << 1), 'e', 14, "1.63557603386168e+16"},
+	{8393378656576888. * (1 << 1), 'e', 15, "1.678675731315378e+16"},
+	{8738676561280626. * (1 << 4), 'e', 16, "1.3981882498049002e+17"},
+	{8291032395191335. / (1 << 30), 'e', 5, "7.72163e+06"},
+
+	// Exercise divisiblePow5 case in fixedFtoa
+	{2384185791015625. * (1 << 12), 'e', 5, "9.76562e+18"},
+	{2384185791015625. * (1 << 13), 'e', 5, "1.95312e+19"},
 }
 
 func TestFtoa(t *testing.T) {
@@ -253,7 +263,7 @@ func TestFtoaRandom(t *testing.T) {
 		shortSlow = FormatFloat(x, 'e', prec, 64)
 		SetOptimize(true)
 		if shortSlow != shortFast {
-			t.Errorf("%b printed as %s, want %s", x, shortFast, shortSlow)
+			t.Errorf("%b printed with %%.%de as %s, want %s", x, prec, shortFast, shortSlow)
 		}
 	}
 }
diff --git a/src/internal/strconv/ftoafixed.go b/src/internal/strconv/ftoafixed.go
new file mode 100644
index 0000000000..f3542d1cf5
--- /dev/null
+++ b/src/internal/strconv/ftoafixed.go
@@ -0,0 +1,156 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package strconv
+
+import "math/bits"
+
+var uint64pow10 = [...]uint64{
+	1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
+	1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
+}
+
+// fixedFtoa formats mant*(2^exp), rounded to digits decimal digits, into d,
+// where mant > 0 and 1 ≤ digits ≤ 18. Trailing zeros are trimmed from d.
+func fixedFtoa(d *decimalSlice, mant uint64, exp, digits int) {
+	// The strategy here is to multiply (mant * 2^exp) by a power of 10
+	// to make the resulting integer be the number of digits we want.
+	//
+	// Adams proved in the Ryu paper that 128-bit precision in the
+	// power-of-10 constant is sufficient to produce correctly
+	// rounded output for all float64s, up to 18 digits.
+	// https://dl.acm.org/doi/10.1145/3192366.3192369
+	//
+	// TODO(rsc): The paper is not focused on, nor terribly clear about,
+	// this fact in this context, and the proof seems too complicated.
+	// Post a shorter, more direct proof and link to it here.
+
+	if digits > 18 {
+		panic("fixedFtoa called with digits > 18")
+	}
+
+	// Shift mantissa to have 64 bits,
+	// so that the 192-bit product below will
+	// have at least 63 bits in its top word.
+	b := 64 - bits.Len64(mant)
+	mant <<= b
+	exp -= b
+
+	// We have f = mant * 2^exp ≥ 2^(63+exp)
+	// and we want to multiply it by some 10^p
+	// to make it have the number of digits plus one rounding bit:
+	//
+	//	2 * 10^(digits-1) ≤ f * 10^p < ~2 * 10^digits
+	//
+	// The lower bound is required, but the upper bound is approximate:
+	// we must not have too few digits, but we can round away extra ones.
+	//
+	//	f * 10^p ≥ 2 * 10^(digits-1)
+	//	10^p ≥ 2 * 10^(digits-1) / f                         [dividing by f]
+	//	p ≥ (log₁₀ 2) + (digits-1) - log₁₀ f                 [taking log₁₀]
+	//	p ≥ (log₁₀ 2) + (digits-1) - log₁₀ (mant * 2^exp)    [expanding f]
+	//	p ≥ (log₁₀ 2) + (digits-1) - (log₁₀ 2) * (64 + exp)  [mant < 2⁶⁴]
+	//	p ≥ (digits - 1) - (log₁₀ 2) * (63 + exp)            [refactoring]
+	//
+	// Once we have p, we can compute the scaled value:
+	//
+	//	dm * 2^de = mant * 2^exp * 10^p
+	//	          = mant * 2^exp * pow/2^128 * 2^exp2.
+	//	          = (mant * pow/2^128) * 2^(exp+exp2).
+	p := (digits - 1) - mulLog10_2(63+exp)
+	pow, exp2, ok := pow10(p)
+	if !ok {
+		// This never happens due to the range of float32/float64 exponent
+		panic("fixedFtoa: pow10 out of range")
+	}
+	if -22 <= p && p < 0 {
+		// Special case: Let q=-p. q is in [1,22]. We are dividing by 10^q
+		// and the mantissa may be a multiple of 5^q (5^22 < 2^53),
+		// in which case the division must be computed exactly and
+		// recorded as exact for correct rounding. Our normal computation is:
+		//
+		//	dm = floor(mant * floor(10^p * 2^s))
+		//
+		// for some scaling shift s. To make this an exact division,
+		// it suffices to change the inner floor to a ceil:
+		//
+		//	dm = floor(mant * ceil(10^p * 2^s))
+		//
+		// In the range of values we are using, the floor and ceil
+		// cancel each other out and the high 64 bits of the product
+		// come out exactly right.
+		// (This is the same trick compilers use for division by constants.
+		// See Hacker's Delight, 2nd ed., Chapter 10.)
+		pow.Lo++
+	}
+	dm, lo1, lo0 := umul192(mant, pow)
+	de := exp + exp2
+
+	// Check whether any bits have been truncated from dm.
+	// If so, set dt != 0. If not, leave dt == 0 (meaning dm is exact).
+	var dt uint
+	switch {
+	default:
+		// Most powers of 10 use a truncated constant,
+		// meaning the result is also truncated.
+		dt = 1
+	case 0 <= p && p <= 55:
+		// Small positive powers of 10 (up to 10⁵⁵) can be represented
+		// precisely in a 128-bit mantissa (5⁵⁵ ≤ 2¹²⁸), so the only truncation
+		// comes from discarding the low bits of the 192-bit product.
+		//
+		// TODO(rsc): The new proof mentioned above should also
+		// prove that we can't have lo1 == 0 and lo0 != 0.
+		// After proving that, drop computation and use of lo0 here.
+		dt = bool2uint(lo1|lo0 != 0)
+	case -22 <= p && p < 0 && divisiblePow5(mant, -p):
+		// If the original mantissa was a multiple of 5^(-p),
+		// the result is exact. (See comment above for pow.Lo++.)
+		dt = 0
+	}
+
+	// The value we want to format is dm * 2^de, where de < 0.
+	// Multiply by 2^de by shifting, but leave one extra bit for rounding.
+	// After the shift, the "integer part" of dm is dm>>1,
+	// the "rounding bit" (the first fractional bit) is dm&1,
+	// and the "truncated bit" (have any bits been discarded?) is dt.
+	shift := -de - 1
+	dt |= bool2uint(dm&(1<<shift-1) != 0)
+	dm >>= shift
+
+	// Set decimal point in eventual formatted digits,
+	// so we can update it as we adjust the digits.
+	d.dp = digits - p
+
+	// Trim excess digit if any, updating truncation and decimal point.
+	// The << 1 is leaving room for the rounding bit.
+	max := uint64pow10[digits] << 1
+	if dm >= max {
+		var r uint
+		dm, r = dm/10, uint(dm%10)
+		dt |= bool2uint(r != 0)
+		d.dp++
+	}
+
+	// Round and shift away rounding bit.
+	// We want to round up when
+	// (a) the fractional part is > 0.5 (dm&1 != 0 and dt == 1)
+	// (b) or the fractional part is ≥ 0.5 and the integer part is odd
+	//     (dm&1 != 0 and dm&2 != 0).
+	// The bitwise expression encodes that logic.
+	dm += uint64(uint(dm) & (dt | uint(dm)>>1) & 1)
+	dm >>= 1
+	if dm == max>>1 {
+		// 999... rolled over to 1000...
+		dm = uint64pow10[digits-1]
+		d.dp++
+	}
+
+	// Format digits into d.
+	formatBase10(d.d[:digits], dm)
+	d.nd = digits
+	for d.d[d.nd-1] == '0' {
+		d.nd--
+	}
+}
diff --git a/src/internal/strconv/ftoaryu.go b/src/internal/strconv/ftoaryu.go
index 999af51502..9407bfec44 100644
--- a/src/internal/strconv/ftoaryu.go
+++ b/src/internal/strconv/ftoaryu.go
@@ -4,203 +4,11 @@
 
 package strconv
 
-import (
-	"math/bits"
-)
+import "math/bits"
 
 // binary to decimal conversion using the Ryū algorithm.
 //
 // See Ulf Adams, "Ryū: Fast Float-to-String Conversion" (doi:10.1145/3192366.3192369)
-//
-// Fixed precision formatting is a variant of the original paper's
-// algorithm, where a single multiplication by 10^k is required,
-// sharing the same rounding guarantees.
-
-// ryuFtoaFixed32 formats mant*(2^exp) with prec decimal digits.
-func ryuFtoaFixed32(d *decimalSlice, mant uint32, exp int, prec int) {
-	if prec < 0 {
-		panic("ryuFtoaFixed32 called with negative prec")
-	}
-	if prec > 9 {
-		panic("ryuFtoaFixed32 called with prec > 9")
-	}
-	// Zero input.
-	if mant == 0 {
-		d.nd, d.dp = 0, 0
-		return
-	}
-	// Renormalize to a 25-bit mantissa.
-	e2 := exp
-	if b := bits.Len32(mant); b < 25 {
-		mant <<= uint(25 - b)
-		e2 += b - 25
-	}
-	// Choose an exponent such that rounded mant*(2^e2)*(10^q) has
-	// at least prec decimal digits, i.e
-	//     mant*(2^e2)*(10^q) >= 10^(prec-1)
-	// Because mant >= 2^24, it is enough to choose:
-	//     2^(e2+24) >= 10^(-q+prec-1)
-	// or q = -mulLog10_2(e2+24) + prec - 1
-	q := -mulLog10_2(e2+24) + prec - 1
-
-	// Now compute mant*(2^e2)*(10^q).
-	// Is it an exact computation?
-	// Only small positive powers of 10 are exact (5^28 has 66 bits).
-	exact := q <= 27 && q >= 0
-
-	di, dexp2, d0 := mult64bitPow10(mant, e2, q)
-	if dexp2 >= 0 {
-		panic("not enough significant bits after mult64bitPow10")
-	}
-	// As a special case, computation might still be exact, if exponent
-	// was negative and if it amounts to computing an exact division.
-	// In that case, we ignore all lower bits.
-	// Note that division by 10^11 cannot be exact as 5^11 has 26 bits.
-	if q < 0 && q >= -10 && divisibleByPower5(uint64(mant), -q) {
-		exact = true
-		d0 = true
-	}
-	// Remove extra lower bits and keep rounding info.
-	extra := uint(-dexp2)
-	extraMask := uint32(1<<extra - 1)
-
-	di, dfrac := di>>extra, di&extraMask
-	roundUp := false
-	if exact {
-		// If we computed an exact product, d + 1/2
-		// should round to d+1 if 'd' is odd.
-		roundUp = dfrac > 1<<(extra-1) ||
-			(dfrac == 1<<(extra-1) && !d0) ||
-			(dfrac == 1<<(extra-1) && d0 && di&1 == 1)
-	} else {
-		// otherwise, d+1/2 always rounds up because
-		// we truncated below.
-		roundUp = dfrac>>(extra-1) == 1
-	}
-	if dfrac != 0 {
-		d0 = false
-	}
-	// Proceed to the requested number of digits
-	formatDecimal(d, uint64(di), !d0, roundUp, prec)
-	// Adjust exponent
-	d.dp -= q
-}
-
-// ryuFtoaFixed64 formats mant*(2^exp) with prec decimal digits.
-func ryuFtoaFixed64(d *decimalSlice, mant uint64, exp int, prec int) {
-	if prec > 18 {
-		panic("ryuFtoaFixed64 called with prec > 18")
-	}
-	// Zero input.
-	if mant == 0 {
-		d.nd, d.dp = 0, 0
-		return
-	}
-	// Renormalize to a 55-bit mantissa.
-	e2 := exp
-	if b := bits.Len64(mant); b < 55 {
-		mant = mant << uint(55-b)
-		e2 += b - 55
-	}
-	// Choose an exponent such that rounded mant*(2^e2)*(10^q) has
-	// at least prec decimal digits, i.e
-	//     mant*(2^e2)*(10^q) >= 10^(prec-1)
-	// Because mant >= 2^54, it is enough to choose:
-	//     2^(e2+54) >= 10^(-q+prec-1)
-	// or q = -mulLog10_2(e2+54) + prec - 1
-	//
-	// The minimal required exponent is -mulLog10_2(1025)+18 = -291
-	// The maximal required exponent is mulLog10_2(1074)+18 = 342
-	q := -mulLog10_2(e2+54) + prec - 1
-
-	// Now compute mant*(2^e2)*(10^q).
-	// Is it an exact computation?
-	// Only small positive powers of 10 are exact (5^55 has 128 bits).
-	exact := q <= 55 && q >= 0
-
-	di, dexp2, d0 := mult128bitPow10(mant, e2, q)
-	if dexp2 >= 0 {
-		panic("not enough significant bits after mult128bitPow10")
-	}
-	// As a special case, computation might still be exact, if exponent
-	// was negative and if it amounts to computing an exact division.
-	// In that case, we ignore all lower bits.
-	// Note that division by 10^23 cannot be exact as 5^23 has 54 bits.
-	if q < 0 && q >= -22 && divisibleByPower5(mant, -q) {
-		exact = true
-		d0 = true
-	}
-	// Remove extra lower bits and keep rounding info.
-	extra := uint(-dexp2)
-	extraMask := uint64(1<<extra - 1)
-
-	di, dfrac := di>>extra, di&extraMask
-	roundUp := false
-	if exact {
-		// If we computed an exact product, d + 1/2
-		// should round to d+1 if 'd' is odd.
-		roundUp = dfrac > 1<<(extra-1) ||
-			(dfrac == 1<<(extra-1) && !d0) ||
-			(dfrac == 1<<(extra-1) && d0 && di&1 == 1)
-	} else {
-		// otherwise, d+1/2 always rounds up because
-		// we truncated below.
-		roundUp = dfrac>>(extra-1) == 1
-	}
-	if dfrac != 0 {
-		d0 = false
-	}
-	// Proceed to the requested number of digits
-	formatDecimal(d, di, !d0, roundUp, prec)
-	// Adjust exponent
-	d.dp -= q
-}
-
-var uint64pow10 = [...]uint64{
-	1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
-	1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
-}
-
-// formatDecimal fills d with at most prec decimal digits
-// of mantissa m. The boolean trunc indicates whether m
-// is truncated compared to the original number being formatted.
-func formatDecimal(d *decimalSlice, m uint64, trunc bool, roundUp bool, prec int) {
-	max := uint64pow10[prec]
-	trimmed := 0
-	for m >= max {
-		a, b := m/10, m%10
-		m = a
-		trimmed++
-		if b > 5 {
-			roundUp = true
-		} else if b < 5 {
-			roundUp = false
-		} else { // b == 5
-			// round up if there are trailing digits,
-			// or if the new value of m is odd (round-to-even convention)
-			roundUp = trunc || m&1 == 1
-		}
-		if b != 0 {
-			trunc = true
-		}
-	}
-	if roundUp {
-		m++
-	}
-	if m >= max {
-		// Happens if di was originally 99999....xx
-		m /= 10
-		trimmed++
-	}
-	// render digits
-	formatBase10(d.d[:prec], m)
-	d.nd = prec
-	for d.d[d.nd-1] == '0' {
-		d.nd--
-		trimmed++
-	}
-	d.dp = d.nd + trimmed
-}
 
 // ryuFtoaShortest formats mant*2^exp with prec decimal digits.
 func ryuFtoaShortest(d *decimalSlice, mant uint64, exp int, flt *floatInfo) {
@@ -249,13 +57,13 @@ func ryuFtoaShortest(d *decimalSlice, mant uint64, exp int, flt *floatInfo) {
 	if q < 0 && q >= -24 {
 		// Division by a power of ten may be exact.
 		// (note that 5^25 is a 59-bit number so division by 5^25 is never exact).
-		if divisibleByPower5(ml, -q) {
+		if divisiblePow5(ml, -q) {
 			dl0 = true
 		}
-		if divisibleByPower5(mc, -q) {
+		if divisiblePow5(mc, -q) {
 			dc0 = true
 		}
-		if divisibleByPower5(mu, -q) {
+		if divisiblePow5(mu, -q) {
 			du0 = true
 		}
 	}
@@ -497,16 +305,3 @@ func mult128bitPow10(m uint64, e2, q int) (resM uint64, resE int, exact bool) {
 	hi, mid, lo := umul192(m, pow)
 	return hi<<9 | mid>>55, e2, mid<<9 == 0 && lo == 0
 }
-
-func divisibleByPower5(m uint64, k int) bool {
-	if m == 0 {
-		return true
-	}
-	for i := 0; i < k; i++ {
-		if m%5 != 0 {
-			return false
-		}
-		m /= 5
-	}
-	return true
-}
diff --git a/src/internal/strconv/import_test.go b/src/internal/strconv/import_test.go
index ed1015ee5d..3dab2bf9e5 100644
--- a/src/internal/strconv/import_test.go
+++ b/src/internal/strconv/import_test.go
@@ -20,4 +20,7 @@ var (
 	pow10            = Pow10
 	umul128          = Umul128
 	umul192          = Umul192
+	div5Tab          = Div5Tab
+	divisiblePow5    = DivisiblePow5
+	trimZeros        = TrimZeros
 )
diff --git a/src/internal/strconv/math.go b/src/internal/strconv/math.go
index 37303d76db..3b884e846a 100644
--- a/src/internal/strconv/math.go
+++ b/src/internal/strconv/math.go
@@ -56,3 +56,124 @@ func mulLog2_10(x int) int {
 	// log(10)/log(2) ≈ 3.32192809489 ≈ 108853 / 2^15
 	return (x * 108853) >> 15
 }
+
+func bool2uint(b bool) uint {
+	if b {
+		return 1
+	}
+	return 0
+}
+
+// Exact Division and Remainder Checking
+//
+// An exact division x/c (exact means x%c == 0)
+// can be implemented by x*m where m is the multiplicative inverse of c mod 2^64 (m*c == 1 mod 2^64).
+//
+// Since c is also the multiplicative inverse of m, x*m is lossless,
+// and all the exact multiples of c map to all of [0, maxUint64/c].
+// The non-multiples are forced to map to larger values.
+// This also gives a quick test for whether x is an exact multiple of c:
+// compute the exact division and check whether it's at most maxUint64/c:
+//	x%c == 0 <=> x*m <= maxUint64/c.
+//
+// Only odd c have multiplicative inverses mod powers of two.
+// To do an exact divide x / (c<<s) we can use (x/c)>>s instead.
+// And to check for remainder, we need to check that those low s
+// bits are all zero before we shift them away. We can merge that
+// with the <= for the exact odd remainder check by rotating the
+// shifted bits into the high part instead:
+// 	x%(c<<s) == 0 <=> bits.RotateLeft64(x*m, -s) <= maxUint64/(c<<s).
+//
+// The compiler does this transformation automatically in general,
+// but we apply it here by hand in a few ways that the compiler can't help with.
+//
+// For a more detailed explanation, see
+// Henry S. Warren, Jr., Hacker's Delight, 2nd ed., sections 10-16 and 10-17.
+
+// divisiblePow5 reports whether x is divisible by 5^p.
+// It returns false for p not in [1, 22],
+// because we only care about float64 mantissas, and 5^23 > 2^53.
+func divisiblePow5(x uint64, p int) bool {
+	return 1 <= p && p <= 22 && x*div5Tab[p-1][0] <= div5Tab[p-1][1]
+}
+
+const maxUint64 = 1<<64 - 1
+
+// div5Tab[p-1] holds the multiplicative inverse of 5^p mod 2^64 and ⌊maxUint64/5^p⌋.
+var div5Tab = [22][2]uint64{
+	{0xcccccccccccccccd, maxUint64 / 5},
+	{0x8f5c28f5c28f5c29, maxUint64 / 5 / 5},
+	{0x1cac083126e978d5, maxUint64 / 5 / 5 / 5},
+	{0xd288ce703afb7e91, maxUint64 / 5 / 5 / 5 / 5},
+	{0x5d4e8fb00bcbe61d, maxUint64 / 5 / 5 / 5 / 5 / 5},
+	{0x790fb65668c26139, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0xe5032477ae8d46a5, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0xc767074b22e90e21, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0x8e47ce423a2e9c6d, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0x4fa7f60d3ed61f49, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0x0fee64690c913975, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0x3662e0e1cf503eb1, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0xa47a2cf9f6433fbd, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0x54186f653140a659, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0x7738164770402145, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0xe4a4d1417cd9a041, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0xc75429d9e5c5200d, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0xc1773b91fac10669, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0x26b172506559ce15, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0xd489e3a9addec2d1, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0x90e860bb892c8d5d, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+	{0x502e79bf1b6f4f79, maxUint64 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5 / 5},
+}
+
+// trimZeros trims trailing zeros from x, which must be nonzero (the loop below never ends for x == 0).
+// It finds the largest p such that x % 10^p == 0
+// and then returns x / 10^p, p.
+//
+// This is here for reference and tested, because it is an optimization
+// used by other ftoa algorithms, but in our implementations it has
+// never been benchmarked to be faster than trimming zeros after
+// formatting into decimal bytes.
+func trimZeros(x uint64) (uint64, int) {
+	const (
+		div1e8m  = 0xc767074b22e90e21
+		div1e8le = maxUint64 / 100000000
+
+		div1e4m  = 0xd288ce703afb7e91
+		div1e4le = maxUint64 / 10000
+
+		div1e2m  = 0x8f5c28f5c28f5c29
+		div1e2le = maxUint64 / 100
+
+		div1e1m  = 0xcccccccccccccccd
+		div1e1le = maxUint64 / 10
+	)
+
+	// _ = assert[x - y] asserts at compile time that x == y.
+	// Assert that the multiplicative inverses are correct
+	// by checking that (div1eNm * 5^N) % 1<<64 == 1.
+	var assert [1]struct{}
+	_ = assert[(div1e8m*5*5*5*5*5*5*5*5)%(1<<64)-1]
+	_ = assert[(div1e4m*5*5*5*5)%(1<<64)-1]
+	_ = assert[(div1e2m*5*5)%(1<<64)-1]
+	_ = assert[(div1e1m*5)%(1<<64)-1]
+
+	// Cut 8 zeros, then 4, then 2, then 1.
+	p := 0
+	for d := bits.RotateLeft64(x*div1e8m, -8); d <= div1e8le; d = bits.RotateLeft64(x*div1e8m, -8) {
+		x = d
+		p += 8
+	}
+	if d := bits.RotateLeft64(x*div1e4m, -4); d <= div1e4le {
+		x = d
+		p += 4
+	}
+	if d := bits.RotateLeft64(x*div1e2m, -2); d <= div1e2le {
+		x = d
+		p += 2
+	}
+	if d := bits.RotateLeft64(x*div1e1m, -1); d <= div1e1le {
+		x = d
+		p += 1
+	}
+	return x, p
+}
diff --git a/src/internal/strconv/math_test.go b/src/internal/strconv/math_test.go
index 3a1ff3400c..55e25f98cf 100644
--- a/src/internal/strconv/math_test.go
+++ b/src/internal/strconv/math_test.go
@@ -93,3 +93,73 @@ func TestMulLog2_10(t *testing.T) {
 		}
 	}
 }
+
+func pow5(p int) uint64 {
+	x := uint64(1)
+	for range p {
+		x *= 5
+	}
+	return x
+}
+
+func TestDivisiblePow5(t *testing.T) { // checks divisiblePow5 for every p in [1, 22]
+	for p := 1; p <= 22; p++ {
+		x := pow5(p)
+		if divisiblePow5(1, p) {
+			t.Errorf("divisiblePow5(1, %d) = true, want false", p)
+		}
+		if divisiblePow5(x-1, p) {
+			t.Errorf("divisiblePow5(%d, %d) = true, want false", x-1, p)
+		}
+		if divisiblePow5(x+1, p) {
+			t.Errorf("divisiblePow5(%d, %d) = true, want false", x+1, p)
+		}
+		if divisiblePow5(x/5, p) { // 5^(p-1) is one factor of 5 short
+			t.Errorf("divisiblePow5(%d, %d) = true, want false", x/5, p)
+		}
+		if !divisiblePow5(0, p) {
+			t.Errorf("divisiblePow5(0, %d) = false, want true", p)
+		}
+		if !divisiblePow5(x, p) {
+			t.Errorf("divisiblePow5(%d, %d) = false, want true", x, p)
+		}
+		if 2*x > x && !divisiblePow5(2*x, p) { // 2*x > x skips the case if 2*x wrapped around
+			t.Errorf("divisiblePow5(%d, %d) = false, want true", 2*x, p)
+		}
+	}
+}
+
+func TestDiv5Tab(t *testing.T) { // checks both columns of every div5Tab row
+	for p := 1; p <= 22; p++ {
+		m := div5Tab[p-1][0]
+		le := div5Tab[p-1][1]
+
+		// See comment in math.go on div5Tab.
+		// m needs to be multiplicative inverse of pow5(p) (uint64 arithmetic wraps mod 2^64).
+		if m*pow5(p) != 1 {
+			t.Errorf("div5Tab[%d-1][0] = %#x, but %#x * (5**%d) = %d, want 1", p, m, m, p, m*pow5(p))
+		}
+
+		// le needs to be ⌊(1<<64 - 1) / 5^p⌋.
+		want := (1<<64 - 1) / pow5(p)
+		if le != want {
+			t.Errorf("div5Tab[%d-1][1] = %#x, want %#x", p, le, want)
+		}
+	}
+}
+
+func TestTrimZeros(t *testing.T) {
+	for _, x := range []uint64{1, 2, 3, 4, 101, 123} {
+		want := x
+		for p := range 20 {
+			haveX, haveP := trimZeros(x)
+			if haveX != want || haveP != p {
+				t.Errorf("trimZeros(%d) = %d, %d, want %d, %d", x, haveX, haveP, want, p)
+			}
+			if x >= (1<<64-1)/10 {
+				break
+			}
+			x *= 10
+		}
+	}
+}