From: Russ Cox
Date: Tue, 4 Nov 2025 03:09:48 +0000 (-0500)
Subject: cmd/compile: implement Avg64u, Hmul64, Hmul64u for wasm
X-Git-Tag: go1.26rc1~382
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6e165b4d17;p=gostls13.git

cmd/compile: implement Avg64u, Hmul64, Hmul64u for wasm

This lets us remove useAvg and useHmul from the division rules.
The compiler is simpler and the generated code is faster.
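For context, a minimal Go sketch of what the newly lowered ops compute
(illustrative only, not the compiler's code). The Avg64u lowering added
below relies on x >= y, which holds at its uses in the division rules:

    package main

    import (
        "fmt"
        "math/bits"
    )

    // avg64u matches the new wasm rule: ((x-y)>>1)+y equals (x+y)/2
    // without overflow, provided x >= y.
    func avg64u(x, y uint64) uint64 { return (x-y)>>1 + y }

    // hmul64u is the high 64 bits of the full 128-bit product.
    func hmul64u(x, y uint64) uint64 { hi, _ := bits.Mul64(x, y); return hi }

    func main() {
        fmt.Println(avg64u(10, 4))     // 7
        fmt.Println(hmul64u(1<<63, 6)) // 3
    }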
goos: wasip1
goarch: wasm
pkg: internal/strconv
                                │   old.txt   │              new.txt                │
                                │   sec/op    │    sec/op     vs base               │
AppendFloat/Decimal               192.8n ± 1%   194.6n ± 0%    +0.91% (p=0.000 n=10)
AppendFloat/Float                 328.6n ± 0%   279.6n ± 0%   -14.93% (p=0.000 n=10)
AppendFloat/Exp                   335.6n ± 1%   289.2n ± 1%   -13.80% (p=0.000 n=10)
AppendFloat/NegExp                336.0n ± 0%   289.1n ± 1%   -13.97% (p=0.000 n=10)
AppendFloat/LongExp               332.4n ± 0%   285.2n ± 1%   -14.20% (p=0.000 n=10)
AppendFloat/Big                   348.2n ± 0%   300.1n ± 0%   -13.83% (p=0.000 n=10)
AppendFloat/BinaryExp             137.4n ± 0%   138.2n ± 0%    +0.55% (p=0.001 n=10)
AppendFloat/32Integer             193.3n ± 1%   196.5n ± 0%    +1.66% (p=0.000 n=10)
AppendFloat/32ExactFraction       283.3n ± 0%   268.9n ± 1%    -5.08% (p=0.000 n=10)
AppendFloat/32Point               279.9n ± 0%   266.5n ± 0%    -4.80% (p=0.000 n=10)
AppendFloat/32Exp                 300.1n ± 0%   288.3n ± 1%    -3.90% (p=0.000 n=10)
AppendFloat/32NegExp              288.2n ± 1%   277.9n ± 1%    -3.59% (p=0.000 n=10)
AppendFloat/32Shortest            261.7n ± 0%   250.2n ± 0%    -4.39% (p=0.000 n=10)
AppendFloat/32Fixed8Hard          173.3n ± 1%   158.9n ± 1%    -8.31% (p=0.000 n=10)
AppendFloat/32Fixed9Hard          180.0n ± 0%   167.9n ± 2%    -6.70% (p=0.000 n=10)
AppendFloat/64Fixed1              167.1n ± 0%   149.6n ± 1%   -10.50% (p=0.000 n=10)
AppendFloat/64Fixed2              162.4n ± 1%   146.5n ± 0%    -9.73% (p=0.000 n=10)
AppendFloat/64Fixed2.5            165.5n ± 0%   149.4n ± 1%    -9.70% (p=0.000 n=10)
AppendFloat/64Fixed3              166.4n ± 1%   150.2n ± 0%    -9.74% (p=0.000 n=10)
AppendFloat/64Fixed4              163.7n ± 0%   149.6n ± 1%    -8.62% (p=0.000 n=10)
AppendFloat/64Fixed5Hard          182.8n ± 1%   167.1n ± 1%    -8.61% (p=0.000 n=10)
AppendFloat/64Fixed12             222.2n ± 0%   208.8n ± 0%    -6.05% (p=0.000 n=10)
AppendFloat/64Fixed16             197.6n ± 1%   181.7n ± 0%    -8.02% (p=0.000 n=10)
AppendFloat/64Fixed12Hard         194.5n ± 0%   181.0n ± 0%    -6.99% (p=0.000 n=10)
AppendFloat/64Fixed17Hard         205.1n ± 1%   191.9n ± 0%    -6.44% (p=0.000 n=10)
AppendFloat/64Fixed18Hard         6.269µ ± 0%   6.643µ ± 0%    +5.97% (p=0.000 n=10)
AppendFloat/64FixedF1             211.7n ± 1%   197.0n ± 0%    -6.95% (p=0.000 n=10)
AppendFloat/64FixedF2             189.4n ± 0%   174.2n ± 0%    -8.08% (p=0.000 n=10)
AppendFloat/64FixedF3             169.0n ± 0%   154.9n ± 0%    -8.32% (p=0.000 n=10)
AppendFloat/Slowpath64            321.2n ± 0%   274.2n ± 1%   -14.63% (p=0.000 n=10)
AppendFloat/SlowpathDenormal64    307.4n ± 1%   261.2n ± 0%   -15.03% (p=0.000 n=10)
AppendInt                         3.367µ ± 1%   3.376µ ± 0%         ~ (p=0.517 n=10)
AppendUint                        675.5n ± 0%   676.9n ± 0%         ~ (p=0.196 n=10)
AppendIntSmall                    28.13n ± 1%   28.17n ± 0%    +0.14% (p=0.015 n=10)
AppendUintVarlen/digits=1         20.70n ± 0%   20.51n ± 1%    -0.89% (p=0.018 n=10)
AppendUintVarlen/digits=2         20.43n ± 0%   20.27n ± 0%    -0.81% (p=0.001 n=10)
AppendUintVarlen/digits=3         38.48n ± 0%   37.93n ± 0%    -1.43% (p=0.000 n=10)
AppendUintVarlen/digits=4         41.10n ± 0%   38.78n ± 1%    -5.62% (p=0.000 n=10)
AppendUintVarlen/digits=5         42.25n ± 1%   42.11n ± 0%    -0.32% (p=0.041 n=10)
AppendUintVarlen/digits=6         45.40n ± 1%   43.14n ± 0%    -4.98% (p=0.000 n=10)
AppendUintVarlen/digits=7         46.81n ± 1%   46.03n ± 0%    -1.66% (p=0.000 n=10)
AppendUintVarlen/digits=8         48.88n ± 1%   46.59n ± 1%    -4.68% (p=0.000 n=10)
AppendUintVarlen/digits=9         49.94n ± 2%   49.41n ± 1%    -1.06% (p=0.000 n=10)
AppendUintVarlen/digits=10        57.28n ± 1%   56.92n ± 1%    -0.62% (p=0.045 n=10)
AppendUintVarlen/digits=11        60.09n ± 1%   58.11n ± 2%    -3.30% (p=0.000 n=10)
AppendUintVarlen/digits=12        62.22n ± 0%   61.85n ± 0%    -0.59% (p=0.000 n=10)
AppendUintVarlen/digits=13        64.94n ± 0%   62.92n ± 0%    -3.10% (p=0.000 n=10)
AppendUintVarlen/digits=14        65.42n ± 1%   65.19n ± 1%    -0.34% (p=0.005 n=10)
AppendUintVarlen/digits=15        68.17n ± 0%   66.13n ± 0%    -2.99% (p=0.000 n=10)
AppendUintVarlen/digits=16        70.21n ± 1%   70.09n ± 1%         ~ (p=0.517 n=10)
AppendUintVarlen/digits=17        72.93n ± 0%   70.49n ± 0%    -3.34% (p=0.000 n=10)
AppendUintVarlen/digits=18        73.01n ± 0%   72.75n ± 0%    -0.35% (p=0.000 n=10)
AppendUintVarlen/digits=19        79.27n ± 1%   79.49n ± 1%         ~ (p=0.671 n=10)
AppendUintVarlen/digits=20        82.18n ± 0%   80.43n ± 1%    -2.14% (p=0.000 n=10)
geomean                           143.4n        136.0n         -5.20%

Change-Id: I8245814a0259ad13cf9225f57db8e9fe3d2e4267
Reviewed-on: https://go-review.googlesource.com/c/go/+/717407
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Cherry Mui
---

diff --git a/src/cmd/compile/internal/ssa/_gen/Wasm.rules b/src/cmd/compile/internal/ssa/_gen/Wasm.rules
index f632a01109..6028152253 100644
--- a/src/cmd/compile/internal/ssa/_gen/Wasm.rules
+++ b/src/cmd/compile/internal/ssa/_gen/Wasm.rules
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+(Last ___) => v.Args[len(v.Args)-1]
+
 // Lowering arithmetic
 (Add(64|32|16|8|Ptr) ...) => (I64Add ...)
 (Add(64|32)F ...) => (F(64|32)Add ...)
@@ -44,6 +46,37 @@
 
 (Not ...) => (I64Eqz ...)
 
+(Avg64u x y) => (I64Add (I64ShrU (I64Sub x y) (I64Const [1])) y)
+
+// High word of multiply without carry bits; see Hacker's Delight, 2nd ed., Figure 8-2, p. 174.
+(Hmul64 x y) =>
+    (Last
+        x0: (ZeroExt32to64 x)
+        x1: (I64ShrS x (I64Const [32]))
+        y0: (ZeroExt32to64 y)
+        y1: (I64ShrS y (I64Const [32]))
+        x0y0: (I64Mul x0 y0)
+        tt: (I64Add (I64Mul x1 y0) (I64ShrU x0y0 (I64Const [32])))
+        w1: (I64Add (I64Mul x0 y1) (ZeroExt32to64 tt))
+        w2: (I64ShrS tt (I64Const [32]))
+        (I64Add (I64Add (I64Mul x1 y1) w2) (I64ShrS w1 (I64Const [32]))))
+
+// Same as Hmul64, but with the signed shifts now unsigned.
+(Hmul64u x y) =>
+    (Last
+        x0: (ZeroExt32to64 x)
+        x1: (I64ShrU x (I64Const [32]))
+        y0: (ZeroExt32to64 y)
+        y1: (I64ShrU y (I64Const [32]))
+        w0: (I64Mul x0 y0)
+        tt: (I64Add (I64Mul x1 y0) (I64ShrU w0 (I64Const [32])))
+        w1: (I64Add (I64Mul x0 y1) (ZeroExt32to64 tt))
+        w2: (I64ShrU tt (I64Const [32]))
+        hi: (I64Add (I64Add (I64Mul x1 y1) w2) (I64ShrU w1 (I64Const [32]))))
+
+(Select0 (Mul64uhilo x y)) => (Hmul64u x y)
+(Select1 (Mul64uhilo x y)) => (I64Mul x y)
+
 // Lowering pointer arithmetic
 (OffPtr ...) => (I64AddConst ...)
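Note how Last lets a rule bind intermediates to names (x0:, tt:, ...) and
reuse them; the (Last ___) rule at the top of the file then replaces the
Last value with its final argument. A Go transcription of Hmul64u for
checking the algebra (a sketch, not the compiler's code; Hmul64 is the
same except that the shifts of x, y, tt, and w1 are arithmetic):

    package main

    import (
        "fmt"
        "math/bits"
        "math/rand"
    )

    // hmul64u mirrors the Hmul64u rule above (Hacker's Delight, 2nd ed.,
    // Figure 8-2): split both operands into 32-bit halves and recombine.
    func hmul64u(x, y uint64) uint64 {
        x0, x1 := x&0xFFFFFFFF, x>>32 // ZeroExt32to64 x, I64ShrU x 32
        y0, y1 := y&0xFFFFFFFF, y>>32
        w0 := x0 * y0
        tt := x1*y0 + w0>>32
        w1 := x0*y1 + tt&0xFFFFFFFF // ZeroExt32to64 tt
        w2 := tt >> 32
        return x1*y1 + w2 + w1>>32
    }

    func main() {
        for i := 0; i < 1000; i++ {
            x, y := rand.Uint64(), rand.Uint64()
            if hi, _ := bits.Mul64(x, y); hmul64u(x, y) != hi {
                fmt.Println("mismatch:", x, y)
            }
        }
        fmt.Println("ok")
    }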
diff --git a/src/cmd/compile/internal/ssa/_gen/divmod.rules b/src/cmd/compile/internal/ssa/_gen/divmod.rules
index 21e0a19406..7dd7d245bd 100644
--- a/src/cmd/compile/internal/ssa/_gen/divmod.rules
+++ b/src/cmd/compile/internal/ssa/_gen/divmod.rules
@@ -79,17 +79,9 @@
 // The magic number m for c is ⌈2^k/c⌉, so we can use
 // (m+1)/2 = ⌈2^k/(c/2)⌉ instead.
 //
-// 8. An unsigned divide on systems with an avg instruction.
+// 8. A general unsigned divide using an avg instruction.
 // We noted above that (x*((1<<N)+m))>>N>>s = ((x*m)>>N+x)>>s.
 // Let hi = (x*m)>>N, so we want (hi+x) >> s = avg(hi, x) >> (s-1).
-//
-// 9. Unsigned 64-bit divide by 16-bit constant on 32-bit systems.
-// Use long division with 16-bit digits.
-//
-// Note: All systems have Hmul and Avg except for wasm, and the
-// wasm JITs may well apply all these optimizations already anyway,
-// so it may be worth looking into avoiding this pass entirely on wasm
-// and dropping all the useAvg useHmul uncertainty.
 
 // Case 1. Signed divides where 2N ≤ register size.
 (Div8 x (Const8 [c])) && smagicOK8(c) =>
@@ -112,13 +104,13 @@
     (Rsh64x64 (SignExt32to64 x) (Const64 [63])))
 
 // Case 2. Signed divides where m is even.
-(Div32 x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0 && config.useHmul =>
+(Div32 x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0 =>
   (Sub32
     (Rsh32x64
       (Hmul32 x (Const32 [int32(smagic32(c).m/2)]))
       (Const64 [smagic32(c).s - 1]))
     (Rsh32x64 x (Const64 [31])))
-(Div64 x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 == 0 && config.useHmul =>
+(Div64 x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 == 0 =>
   (Sub64
     (Rsh64x64
       (Hmul64 x (Const64 [int64(smagic64(c).m/2)]))
@@ -126,13 +118,13 @@
     (Rsh64x64 x (Const64 [63])))
 
 // Case 3. Signed divides where m is odd.
-(Div32 x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0 && config.useHmul =>
+(Div32 x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0 =>
   (Sub32
     (Rsh32x64
      (Add32 x (Hmul32 x (Const32 [int32(smagic32(c).m)])))
      (Const64 [smagic32(c).s]))
    (Rsh32x64 x (Const64 [31])))
-(Div64 x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 != 0 && config.useHmul =>
+(Div64 x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 != 0 =>
   (Sub64
     (Rsh64x64
       (Add64 x (Hmul64 x (Const64 [int64(smagic64(c).m)])))
@@ -149,11 +141,11 @@
   (Rsh64Ux64
     (Mul64 (SignExt32to64 x) (Const64 [int64(smagic32(c).m)]))
     (Const64 [32 + smagic32(c).s]))
-(Div32u x (Const32 [c])) && t.IsSigned() && smagicOK32(c) && config.RegSize == 4 && config.useHmul =>
+(Div32u x (Const32 [c])) && t.IsSigned() && smagicOK32(c) && config.RegSize == 4 =>
   (Rsh32Ux64
     (Hmul32u x (Const32 [int32(smagic32(c).m)]))
     (Const64 [smagic32(c).s]))
-(Div64u x (Const64 [c])) && t.IsSigned() && smagicOK64(c) && config.useHmul =>
+(Div64u x (Const64 [c])) && t.IsSigned() && smagicOK64(c) =>
   (Rsh64Ux64
     (Hmul64u x (Const64 [int64(smagic64(c).m)]))
     (Const64 [smagic64(c).s]))
@@ -181,11 +173,11 @@
     (Rsh64Ux64
       (Mul64 (ZeroExt32to64 x) (Const64 [int64(1<<31 + umagic32(c).m/2)]))
       (Const64 [32 + umagic32(c).s - 1])))
-(Div32u x (Const32 [c])) && umagicOK32(c) && umagic32(c).m&1 == 0 && config.RegSize == 4 && config.useHmul =>
+(Div32u x (Const32 [c])) && umagicOK32(c) && umagic32(c).m&1 == 0 && config.RegSize == 4 =>
   (Rsh32Ux64
     (Hmul32u x (Const32 [int32(1<<31 + umagic32(c).m/2)]))
     (Const64 [umagic32(c).s - 1]))
-(Div64u x (Const64 [c])) && umagicOK64(c) && umagic64(c).m&1 == 0 && config.useHmul =>
+(Div64u x (Const64 [c])) && umagicOK64(c) && umagic64(c).m&1 == 0 =>
   (Rsh64Ux64
     (Hmul64u x (Const64 [int64(1<<63 + umagic64(c).m/2)]))
     (Const64 [umagic64(c).s - 1]))
@@ -205,39 +197,39 @@
       (Rsh64Ux64 (ZeroExt32to64 x) (Const64 [1]))
       (Const64 [int64(1<<31 + (umagic32(c).m+1)/2)]))
     (Const64 [32 + umagic32(c).s - 2])))
-(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && c&1 == 0 && config.useHmul =>
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && c&1 == 0 =>
   (Rsh32Ux64
     (Hmul32u
       (Rsh32Ux64 x (Const64 [1]))
       (Const32 [int32(1<<31 + (umagic32(c).m+1)/2)]))
     (Const64 [umagic32(c).s - 2]))
-(Div64u x (Const64 [c])) && umagicOK64(c) && c&1 == 0 && config.useHmul =>
+(Div64u x (Const64 [c])) && umagicOK64(c) && c&1 == 0 =>
   (Rsh64Ux64
     (Hmul64u
       (Rsh64Ux64 x (Const64 [1]))
       (Const64 [int64(1<<63 + (umagic64(c).m+1)/2)]))
     (Const64 [umagic64(c).s - 2]))
 
-// Case 8. Unsigned divide on systems with avg.
-(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && config.useAvg =>
+// Case 8. Unsigned divide using avg.
+(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 =>
   (Trunc32to16
     (Rsh32Ux64
       (Avg32u
         (Lsh32x64 (ZeroExt16to32 x) (Const64 [16]))
         (Mul32 (ZeroExt16to32 x) (Const32 [int32(umagic16(c).m)])))
       (Const64 [16 + umagic16(c).s - 1])))
-(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && config.useAvg =>
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 =>
   (Trunc64to32
     (Rsh64Ux64
       (Avg64u
         (Lsh64x64 (ZeroExt32to64 x) (Const64 [32]))
         (Mul64 (ZeroExt32to64 x) (Const64 [int64(umagic32(c).m)])))
       (Const64 [32 + umagic32(c).s - 1])))
-(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && config.useAvg && config.useHmul =>
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 =>
   (Rsh32Ux64
     (Avg32u x (Hmul32u x (Const32 [int32(umagic32(c).m)])))
     (Const64 [umagic32(c).s - 1]))
-(Div64u x (Const64 [c])) && umagicOK64(c) && config.useAvg && config.useHmul =>
+(Div64u x (Const64 [c])) && umagicOK64(c) =>
   (Rsh64Ux64
     (Avg64u x (Hmul64u x (Const64 [int64(umagic64(c).m)])))
     (Const64 [umagic64(c).s - 1]))
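To make the now-unconditional case 8 concrete, a worked Go instance for
c = 7 (a sketch: the magic constant is assumed to be the low 64 bits of
⌈2^67/7⌉ with s = 3, per the comment above; the same constant's halves
appear in the div7_uint64 codegen checks later in this CL):

    package main

    import (
        "fmt"
        "math/bits"
    )

    func div7(x uint64) uint64 {
        const m = 0x2492492492492493 // low 64 bits of ⌈2^67/7⌉ (assumed umagic64(7).m)
        hi, _ := bits.Mul64(x, m)    // hi = (x*m)>>64
        return (hi + (x-hi)>>1) >> 2 // avg(hi, x) >> (s-1); x-hi is safe since hi <= x
    }

    func main() {
        for _, x := range []uint64{0, 6, 7, 13, 14, 1<<64 - 1} {
            fmt.Println(x, div7(x), x/7) // the two results always agree
        }
    }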
diff --git a/src/cmd/compile/internal/ssa/config.go b/src/cmd/compile/internal/ssa/config.go
index 819d77e420..ec0240941c 100644
--- a/src/cmd/compile/internal/ssa/config.go
+++ b/src/cmd/compile/internal/ssa/config.go
@@ -41,8 +41,6 @@ type Config struct {
     hasGReg   bool      // has hardware g register
     ctxt      *obj.Link // Generic arch information
     optimize  bool      // Do optimization
-    useAvg    bool      // Use optimizations that need Avg* operations
-    useHmul   bool      // Use optimizations that need Hmul* operations
     SoftFloat bool      //
     Race      bool      // race detector enabled
     BigEndian bool      //
@@ -168,8 +166,6 @@ type Frontend interface {
 // NewConfig returns a new configuration object for the given architecture.
 func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat bool) *Config {
     c := &Config{arch: arch, Types: types}
-    c.useAvg = true
-    c.useHmul = true
     switch arch {
     case "amd64":
         c.PtrSize = 8
@@ -359,8 +355,6 @@ func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat boo
         c.FPReg = framepointerRegWasm
         c.LinkReg = linkRegWasm
         c.hasGReg = true
-        c.useAvg = false
-        c.useHmul = false
         c.unalignedOK = true
         c.haveCondSelect = true
     default:
diff --git a/src/cmd/compile/internal/ssa/rewriteWasm.go b/src/cmd/compile/internal/ssa/rewriteWasm.go
index a164a6eee5..faba41b3e5 100644
--- a/src/cmd/compile/internal/ssa/rewriteWasm.go
+++ b/src/cmd/compile/internal/ssa/rewriteWasm.go
@@ -48,6 +48,8 @@ func rewriteValueWasm(v *Value) bool {
     case OpAndB:
         v.Op = OpWasmI64And
         return true
+    case OpAvg64u:
+        return rewriteValueWasm_OpAvg64u(v)
     case OpBitLen16:
         return rewriteValueWasm_OpBitLen16(v)
     case OpBitLen32:
@@ -228,6 +230,10 @@ func rewriteValueWasm(v *Value) bool {
     case OpGetClosurePtr:
         v.Op = OpWasmLoweredGetClosurePtr
         return true
+    case OpHmul64:
+        return rewriteValueWasm_OpHmul64(v)
+    case OpHmul64u:
+        return rewriteValueWasm_OpHmul64u(v)
     case OpInterCall:
         v.Op = OpWasmLoweredInterCall
         return true
@@ -239,6 +245,8 @@ func rewriteValueWasm(v *Value) bool {
     case OpIsSliceInBounds:
         v.Op = OpWasmI64LeU
         return true
+    case OpLast:
+        return rewriteValueWasm_OpLast(v)
     case OpLeq16:
         return rewriteValueWasm_OpLeq16(v)
     case OpLeq16U:
@@ -514,6 +522,10 @@ func rewriteValueWasm(v *Value) bool {
     case OpRsh8x64:
         return rewriteValueWasm_OpRsh8x64(v)
     case OpRsh8x8:
         return rewriteValueWasm_OpRsh8x8(v)
+    case OpSelect0:
+        return rewriteValueWasm_OpSelect0(v)
+    case OpSelect1:
+        return rewriteValueWasm_OpSelect1(v)
     case OpSignExt16to32:
         return rewriteValueWasm_OpSignExt16to32(v)
     case OpSignExt16to64:
@@ -684,6 +696,27 @@ func rewriteValueWasm_OpAddr(v *Value) bool {
         return true
     }
 }
+func rewriteValueWasm_OpAvg64u(v *Value) bool {
+    v_1 := v.Args[1]
+    v_0 := v.Args[0]
+    b := v.Block
+    typ := &b.Func.Config.Types
+    // match: (Avg64u x y)
+    // result: (I64Add (I64ShrU (I64Sub x y) (I64Const [1])) y)
+    for {
+        x := v_0
+        y := v_1
+        v.reset(OpWasmI64Add)
+        v0 := b.NewValue0(v.Pos, OpWasmI64ShrU, typ.Int64)
+        v1 := b.NewValue0(v.Pos, OpWasmI64Sub, typ.Int64)
+        v1.AddArg2(x, y)
+        v2 := b.NewValue0(v.Pos, OpWasmI64Const, typ.Int64)
+        v2.AuxInt = int64ToAuxInt(1)
+        v0.AddArg2(v1, v2)
+        v.AddArg2(v0, y)
+        return true
+    }
+}
 func rewriteValueWasm_OpBitLen16(v *Value) bool {
     v_0 := v.Args[0]
     b := v.Block
@@ -1162,6 +1195,108 @@ func rewriteValueWasm_OpEq8(v *Value) bool {
         return true
     }
 }
+func rewriteValueWasm_OpHmul64(v *Value) bool {
+    v_1 := v.Args[1]
+    v_0 := v.Args[0]
+    b := v.Block
+    typ := &b.Func.Config.Types
+    // match: (Hmul64 x y)
+    // result: (Last x0: (ZeroExt32to64 x) x1: (I64ShrS x (I64Const [32])) y0: (ZeroExt32to64 y) y1: (I64ShrS y (I64Const [32])) x0y0: (I64Mul x0 y0) tt: (I64Add (I64Mul x1 y0) (I64ShrU x0y0 (I64Const [32]))) w1: (I64Add (I64Mul x0 y1) (ZeroExt32to64 tt)) w2: (I64ShrS tt (I64Const [32])) (I64Add (I64Add (I64Mul x1 y1) w2) (I64ShrS w1 (I64Const [32]))))
+    for {
+        t := v.Type
+        x := v_0
+        y := v_1
+        v.reset(OpLast)
+        v.Type = t
+        x0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
+        x0.AddArg(x)
+        x1 := b.NewValue0(v.Pos, OpWasmI64ShrS, typ.Int64)
+        v2 := b.NewValue0(v.Pos, OpWasmI64Const, typ.Int64)
+        v2.AuxInt = int64ToAuxInt(32)
+        x1.AddArg2(x, v2)
+        y0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
+        y0.AddArg(y)
+        y1 := b.NewValue0(v.Pos, OpWasmI64ShrS, typ.Int64)
+        y1.AddArg2(y, v2)
+        x0y0 := b.NewValue0(v.Pos, OpWasmI64Mul, typ.Int64)
+        x0y0.AddArg2(x0, y0)
+        tt := b.NewValue0(v.Pos, OpWasmI64Add, typ.Int64)
+        v7 := b.NewValue0(v.Pos, OpWasmI64Mul, typ.Int64)
+        v7.AddArg2(x1, y0)
+        v8 := b.NewValue0(v.Pos, OpWasmI64ShrU, typ.Int64)
+        v8.AddArg2(x0y0, v2)
+        tt.AddArg2(v7, v8)
+        w1 := b.NewValue0(v.Pos, OpWasmI64Add, typ.Int64)
+        v10 := b.NewValue0(v.Pos, OpWasmI64Mul, typ.Int64)
+        v10.AddArg2(x0, y1)
+        v11 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
+        v11.AddArg(tt)
+        w1.AddArg2(v10, v11)
+        w2 := b.NewValue0(v.Pos, OpWasmI64ShrS, typ.Int64)
+        w2.AddArg2(tt, v2)
+        v13 := b.NewValue0(v.Pos, OpWasmI64Add, typ.Int64)
+        v14 := b.NewValue0(v.Pos, OpWasmI64Add, typ.Int64)
+        v15 := b.NewValue0(v.Pos, OpWasmI64Mul, typ.Int64)
+        v15.AddArg2(x1, y1)
+        v14.AddArg2(v15, w2)
+        v16 := b.NewValue0(v.Pos, OpWasmI64ShrS, typ.Int64)
+        v16.AddArg2(w1, v2)
+        v13.AddArg2(v14, v16)
+        v.AddArgs(x0, x1, y0, y1, x0y0, tt, w1, w2, v13)
+        return true
+    }
+}
+func rewriteValueWasm_OpHmul64u(v *Value) bool {
+    v_1 := v.Args[1]
+    v_0 := v.Args[0]
+    b := v.Block
+    typ := &b.Func.Config.Types
+    // match: (Hmul64u x y)
+    // result: (Last x0: (ZeroExt32to64 x) x1: (I64ShrU x (I64Const [32])) y0: (ZeroExt32to64 y) y1: (I64ShrU y (I64Const [32])) w0: (I64Mul x0 y0) tt: (I64Add (I64Mul x1 y0) (I64ShrU w0 (I64Const [32]))) w1: (I64Add (I64Mul x0 y1) (ZeroExt32to64 tt)) w2: (I64ShrU tt (I64Const [32])) hi: (I64Add (I64Add (I64Mul x1 y1) w2) (I64ShrU w1 (I64Const [32]))))
+    for {
+        t := v.Type
+        x := v_0
+        y := v_1
+        v.reset(OpLast)
+        v.Type = t
+        x0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
+        x0.AddArg(x)
+        x1 := b.NewValue0(v.Pos, OpWasmI64ShrU, typ.Int64)
+        v2 := b.NewValue0(v.Pos, OpWasmI64Const, typ.Int64)
+        v2.AuxInt = int64ToAuxInt(32)
+        x1.AddArg2(x, v2)
+        y0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
+        y0.AddArg(y)
+        y1 := b.NewValue0(v.Pos, OpWasmI64ShrU, typ.Int64)
+        y1.AddArg2(y, v2)
+        w0 := b.NewValue0(v.Pos, OpWasmI64Mul, typ.Int64)
+        w0.AddArg2(x0, y0)
+        tt := b.NewValue0(v.Pos, OpWasmI64Add, typ.Int64)
+        v7 := b.NewValue0(v.Pos, OpWasmI64Mul, typ.Int64)
+        v7.AddArg2(x1, y0)
+        v8 := b.NewValue0(v.Pos, OpWasmI64ShrU, typ.Int64)
+        v8.AddArg2(w0, v2)
+        tt.AddArg2(v7, v8)
+        w1 := b.NewValue0(v.Pos, OpWasmI64Add, typ.Int64)
+        v10 := b.NewValue0(v.Pos, OpWasmI64Mul, typ.Int64)
+        v10.AddArg2(x0, y1)
+        v11 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
+        v11.AddArg(tt)
+        w1.AddArg2(v10, v11)
+        w2 := b.NewValue0(v.Pos, OpWasmI64ShrU, typ.Int64)
+        w2.AddArg2(tt, v2)
+        hi := b.NewValue0(v.Pos, OpWasmI64Add, typ.Int64)
+        v14 := b.NewValue0(v.Pos, OpWasmI64Add, typ.Int64)
+        v15 := b.NewValue0(v.Pos, OpWasmI64Mul, typ.Int64)
+        v15.AddArg2(x1, y1)
+        v14.AddArg2(v15, w2)
+        v16 := b.NewValue0(v.Pos, OpWasmI64ShrU, typ.Int64)
+        v16.AddArg2(w1, v2)
+        hi.AddArg2(v14, v16)
+        v.AddArgs(x0, x1, y0, y1, w0, tt, w1, w2, hi)
+        return true
+    }
+}
 func rewriteValueWasm_OpIsNonNil(v *Value) bool {
     v_0 := v.Args[0]
     b := v.Block
@@ -1177,6 +1312,14 @@ func rewriteValueWasm_OpIsNonNil(v *Value) bool {
         return true
     }
 }
+func rewriteValueWasm_OpLast(v *Value) bool {
+    // match: (Last ___)
+    // result: v.Args[len(v.Args)-1]
+    for {
+        v.copyOf(v.Args[len(v.Args)-1])
+        return true
+    }
+}
 func rewriteValueWasm_OpLeq16(v *Value) bool {
     v_1 := v.Args[1]
     v_0 := v.Args[0]
@@ -3199,6 +3342,40 @@ func rewriteValueWasm_OpRsh8x8(v *Value) bool {
         return true
     }
 }
+func rewriteValueWasm_OpSelect0(v *Value) bool {
+    v_0 := v.Args[0]
+    // match: (Select0 (Mul64uhilo x y))
+    // result: (Hmul64u x y)
+    for {
+        t := v.Type
+        if v_0.Op != OpMul64uhilo {
+            break
+        }
+        y := v_0.Args[1]
+        x := v_0.Args[0]
+        v.reset(OpHmul64u)
+        v.Type = t
+        v.AddArg2(x, y)
+        return true
+    }
+    return false
+}
+func rewriteValueWasm_OpSelect1(v *Value) bool {
+    v_0 := v.Args[0]
+    // match: (Select1 (Mul64uhilo x y))
+    // result: (I64Mul x y)
+    for {
+        if v_0.Op != OpMul64uhilo {
+            break
+        }
+        y := v_0.Args[1]
+        x := v_0.Args[0]
+        v.reset(OpWasmI64Mul)
+        v.AddArg2(x, y)
+        return true
+    }
+    return false
+}
 func rewriteValueWasm_OpSignExt16to32(v *Value) bool {
     v_0 := v.Args[0]
     // match: (SignExt16to32 x:(I64Load16S _ _))
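For the signed variant, a Go transcription of the generated rewrite above
(a sketch for checking the algebra, not the compiler's code). Compared
with the unsigned version shown earlier, x, y, tt, and w1 are shifted
arithmetically; only the shift of the low product x0*y0 stays logical:

    // hmul64 mirrors rewriteValueWasm_OpHmul64 (Hacker's Delight mulhs).
    // Go's signed arithmetic wraps, matching wasm's i64 semantics.
    func hmul64(x, y int64) int64 {
        x0 := int64(uint64(x) & 0xFFFFFFFF) // ZeroExt32to64 x
        x1 := x >> 32                       // I64ShrS x 32
        y0 := int64(uint64(y) & 0xFFFFFFFF)
        y1 := y >> 32
        x0y0 := x0 * y0
        tt := x1*y0 + int64(uint64(x0y0)>>32) // I64ShrU: logical shift
        w1 := x0*y1 + int64(uint64(tt)&0xFFFFFFFF)
        w2 := tt >> 32
        return x1*y1 + w2 + w1>>32
    }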
diff --git a/src/cmd/compile/internal/ssa/rewritedivmod.go b/src/cmd/compile/internal/ssa/rewritedivmod.go
index 02978075a8..ab5cf7d676 100644
--- a/src/cmd/compile/internal/ssa/rewritedivmod.go
+++ b/src/cmd/compile/internal/ssa/rewritedivmod.go
@@ -212,7 +212,7 @@ func rewriteValuedivmod_OpDiv16u(v *Value) bool {
         return true
     }
     // match: (Div16u x (Const16 [c]))
-    // cond: umagicOK16(c) && config.RegSize == 4 && config.useAvg
+    // cond: umagicOK16(c) && config.RegSize == 4
     // result: (Trunc32to16 (Rsh32Ux64 (Avg32u (Lsh32x64 (ZeroExt16to32 x) (Const64 [16])) (Mul32 (ZeroExt16to32 x) (Const32 [int32(umagic16(c).m)]))) (Const64 [16 + umagic16(c).s - 1])))
     for {
         t := v.Type
@@ -221,7 +221,7 @@ func rewriteValuedivmod_OpDiv16u(v *Value) bool {
             break
         }
         c := auxIntToInt16(v_1.AuxInt)
-        if !(umagicOK16(c) && config.RegSize == 4 && config.useAvg) {
+        if !(umagicOK16(c) && config.RegSize == 4) {
             break
         }
         v.reset(OpTrunc32to16)
@@ -315,7 +315,7 @@ func rewriteValuedivmod_OpDiv32(v *Value) bool {
         return true
     }
     // match: (Div32 x (Const32 [c]))
-    // cond: smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0 && config.useHmul
+    // cond: smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0
     // result: (Sub32 (Rsh32x64 (Hmul32 x (Const32 [int32(smagic32(c).m/2)])) (Const64 [smagic32(c).s - 1])) (Rsh32x64 x (Const64 [31])))
     for {
         t := v.Type
@@ -324,7 +324,7 @@ func rewriteValuedivmod_OpDiv32(v *Value) bool {
             break
         }
         c := auxIntToInt32(v_1.AuxInt)
-        if !(smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0 && config.useHmul) {
+        if !(smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0) {
            break
        }
        v.reset(OpSub32)
@@ -345,7 +345,7 @@ func rewriteValuedivmod_OpDiv32(v *Value) bool {
         return true
     }
     // match: (Div32 x (Const32 [c]))
-    // cond: smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0 && config.useHmul
+    // cond: smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0
     // result: (Sub32 (Rsh32x64 (Add32 x (Hmul32 x (Const32 [int32(smagic32(c).m)]))) (Const64 [smagic32(c).s])) (Rsh32x64 x (Const64 [31])))
     for {
         t := v.Type
@@ -354,7 +354,7 @@ func rewriteValuedivmod_OpDiv32(v *Value) bool {
             break
         }
         c := auxIntToInt32(v_1.AuxInt)
-        if !(smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0 && config.useHmul) {
+        if !(smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0) {
             break
         }
         v.reset(OpSub32)
@@ -411,7 +411,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
         return true
     }
     // match: (Div32u x (Const32 [c]))
-    // cond: t.IsSigned() && smagicOK32(c) && config.RegSize == 4 && config.useHmul
+    // cond: t.IsSigned() && smagicOK32(c) && config.RegSize == 4
     // result: (Rsh32Ux64 (Hmul32u x (Const32 [int32(smagic32(c).m)])) (Const64 [smagic32(c).s]))
     for {
         t := v.Type
@@ -420,7 +420,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
             break
         }
         c := auxIntToInt32(v_1.AuxInt)
-        if !(t.IsSigned() && smagicOK32(c) && config.RegSize == 4 && config.useHmul) {
+        if !(t.IsSigned() && smagicOK32(c) && config.RegSize == 4) {
             break
         }
         v.reset(OpRsh32Ux64)
@@ -463,7 +463,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
         return true
     }
     // match: (Div32u x (Const32 [c]))
-    // cond: umagicOK32(c) && umagic32(c).m&1 == 0 && config.RegSize == 4 && config.useHmul
+    // cond: umagicOK32(c) && umagic32(c).m&1 == 0 && config.RegSize == 4
     // result: (Rsh32Ux64 (Hmul32u x (Const32 [int32(1<<31 + umagic32(c).m/2)])) (Const64 [umagic32(c).s - 1]))
     for {
         t := v.Type
@@ -472,7 +472,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
             break
         }
         c := auxIntToInt32(v_1.AuxInt)
-        if !(umagicOK32(c) && umagic32(c).m&1 == 0 && config.RegSize == 4 && config.useHmul) {
+        if !(umagicOK32(c) && umagic32(c).m&1 == 0 && config.RegSize == 4) {
             break
         }
         v.reset(OpRsh32Ux64)
@@ -519,7 +519,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
         return true
     }
     // match: (Div32u x (Const32 [c]))
-    // cond: umagicOK32(c) && config.RegSize == 4 && c&1 == 0 && config.useHmul
+    // cond: umagicOK32(c) && config.RegSize == 4 && c&1 == 0
     // result: (Rsh32Ux64 (Hmul32u (Rsh32Ux64 x (Const64 [1])) (Const32 [int32(1<<31 + (umagic32(c).m+1)/2)])) (Const64 [umagic32(c).s - 2]))
     for {
         t := v.Type
@@ -528,7 +528,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
             break
         }
         c := auxIntToInt32(v_1.AuxInt)
-        if !(umagicOK32(c) && config.RegSize == 4 && c&1 == 0 && config.useHmul) {
+        if !(umagicOK32(c) && config.RegSize == 4 && c&1 == 0) {
             break
         }
         v.reset(OpRsh32Ux64)
@@ -547,7 +547,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
         return true
     }
     // match: (Div32u x (Const32 [c]))
-    // cond: umagicOK32(c) && config.RegSize == 8 && config.useAvg
+    // cond: umagicOK32(c) && config.RegSize == 8
     // result: (Trunc64to32 (Rsh64Ux64 (Avg64u (Lsh64x64 (ZeroExt32to64 x) (Const64 [32])) (Mul64 (ZeroExt32to64 x) (Const64 [int64(umagic32(c).m)]))) (Const64 [32 + umagic32(c).s - 1])))
     for {
         t := v.Type
@@ -556,7 +556,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
             break
         }
         c := auxIntToInt32(v_1.AuxInt)
-        if !(umagicOK32(c) && config.RegSize == 8 && config.useAvg) {
+        if !(umagicOK32(c) && config.RegSize == 8) {
             break
         }
         v.reset(OpTrunc64to32)
@@ -581,7 +581,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
         return true
     }
     // match: (Div32u x (Const32 [c]))
-    // cond: umagicOK32(c) && config.RegSize == 4 && config.useAvg && config.useHmul
+    // cond: umagicOK32(c) && config.RegSize == 4
     // result: (Rsh32Ux64 (Avg32u x (Hmul32u x (Const32 [int32(umagic32(c).m)]))) (Const64 [umagic32(c).s - 1]))
     for {
         t := v.Type
@@ -590,7 +590,7 @@ func rewriteValuedivmod_OpDiv32u(v *Value) bool {
             break
         }
         c := auxIntToInt32(v_1.AuxInt)
-        if !(umagicOK32(c) && config.RegSize == 4 && config.useAvg && config.useHmul) {
+        if !(umagicOK32(c) && config.RegSize == 4) {
             break
         }
         v.reset(OpRsh32Ux64)
@@ -612,7 +612,6 @@ func rewriteValuedivmod_OpDiv64(v *Value) bool {
     v_1 := v.Args[1]
     v_0 := v.Args[0]
     b := v.Block
-    config := b.Func.Config
     typ := &b.Func.Config.Types
     // match: (Div64 n (Const64 [c]))
     // cond: isPowerOfTwo(c)
@@ -644,7 +643,7 @@ func rewriteValuedivmod_OpDiv64(v *Value) bool {
         return true
     }
     // match: (Div64 x (Const64 [c]))
-    // cond: smagicOK64(c) && smagic64(c).m&1 == 0 && config.useHmul
+    // cond: smagicOK64(c) && smagic64(c).m&1 == 0
     // result: (Sub64 (Rsh64x64 (Hmul64 x (Const64 [int64(smagic64(c).m/2)])) (Const64 [smagic64(c).s - 1])) (Rsh64x64 x (Const64 [63])))
     for {
         t := v.Type
@@ -653,7 +652,7 @@ func rewriteValuedivmod_OpDiv64(v *Value) bool {
             break
         }
         c := auxIntToInt64(v_1.AuxInt)
-        if !(smagicOK64(c) && smagic64(c).m&1 == 0 && config.useHmul) {
+        if !(smagicOK64(c) && smagic64(c).m&1 == 0) {
             break
         }
         v.reset(OpSub64)
@@ -674,7 +673,7 @@ func rewriteValuedivmod_OpDiv64(v *Value) bool {
         return true
     }
     // match: (Div64 x (Const64 [c]))
-    // cond: smagicOK64(c) && smagic64(c).m&1 != 0 && config.useHmul
+    // cond: smagicOK64(c) && smagic64(c).m&1 != 0
     // result: (Sub64 (Rsh64x64 (Add64 x (Hmul64 x (Const64 [int64(smagic64(c).m)]))) (Const64 [smagic64(c).s])) (Rsh64x64 x (Const64 [63])))
     for {
         t := v.Type
@@ -683,7 +682,7 @@ func rewriteValuedivmod_OpDiv64(v *Value) bool {
             break
         }
         c := auxIntToInt64(v_1.AuxInt)
-        if !(smagicOK64(c) && smagic64(c).m&1 != 0 && config.useHmul) {
+        if !(smagicOK64(c) && smagic64(c).m&1 != 0) {
             break
         }
         v.reset(OpSub64)
@@ -711,10 +710,9 @@ func rewriteValuedivmod_OpDiv64u(v *Value) bool {
     v_1 := v.Args[1]
     v_0 := v.Args[0]
     b := v.Block
-    config := b.Func.Config
     typ := &b.Func.Config.Types
     // match: (Div64u x (Const64 [c]))
-    // cond: t.IsSigned() && smagicOK64(c) && config.useHmul
+    // cond: t.IsSigned() && smagicOK64(c)
     // result: (Rsh64Ux64 (Hmul64u x (Const64 [int64(smagic64(c).m)])) (Const64 [smagic64(c).s]))
     for {
         t := v.Type
@@ -723,7 +721,7 @@ func rewriteValuedivmod_OpDiv64u(v *Value) bool {
             break
         }
         c := auxIntToInt64(v_1.AuxInt)
-        if !(t.IsSigned() && smagicOK64(c) && config.useHmul) {
+        if !(t.IsSigned() && smagicOK64(c)) {
             break
         }
         v.reset(OpRsh64Ux64)
@@ -738,7 +736,7 @@ func rewriteValuedivmod_OpDiv64u(v *Value) bool {
         return true
     }
     // match: (Div64u x (Const64 [c]))
-    // cond: umagicOK64(c) && umagic64(c).m&1 == 0 && config.useHmul
+    // cond: umagicOK64(c) && umagic64(c).m&1 == 0
     // result: (Rsh64Ux64 (Hmul64u x (Const64 [int64(1<<63 + umagic64(c).m/2)])) (Const64 [umagic64(c).s - 1]))
     for {
         t := v.Type
@@ -747,7 +745,7 @@ func rewriteValuedivmod_OpDiv64u(v *Value) bool {
             break
         }
         c := auxIntToInt64(v_1.AuxInt)
-        if !(umagicOK64(c) && umagic64(c).m&1 == 0 && config.useHmul) {
+        if !(umagicOK64(c) && umagic64(c).m&1 == 0) {
             break
         }
         v.reset(OpRsh64Ux64)
@@ -762,7 +760,7 @@ func rewriteValuedivmod_OpDiv64u(v *Value) bool {
         return true
     }
     // match: (Div64u x (Const64 [c]))
-    // cond: umagicOK64(c) && c&1 == 0 && config.useHmul
+    // cond: umagicOK64(c) && c&1 == 0
     // result: (Rsh64Ux64 (Hmul64u (Rsh64Ux64 x (Const64 [1])) (Const64 [int64(1<<63 + (umagic64(c).m+1)/2)])) (Const64 [umagic64(c).s - 2]))
     for {
         t := v.Type
@@ -771,7 +769,7 @@ func rewriteValuedivmod_OpDiv64u(v *Value) bool {
             break
         }
         c := auxIntToInt64(v_1.AuxInt)
-        if !(umagicOK64(c) && c&1 == 0 && config.useHmul) {
+        if !(umagicOK64(c) && c&1 == 0) {
             break
         }
         v.reset(OpRsh64Ux64)
@@ -790,7 +788,7 @@ func rewriteValuedivmod_OpDiv64u(v *Value) bool {
         return true
     }
     // match: (Div64u x (Const64 [c]))
-    // cond: umagicOK64(c) && config.useAvg && config.useHmul
+    // cond: umagicOK64(c)
     // result: (Rsh64Ux64 (Avg64u x (Hmul64u x (Const64 [int64(umagic64(c).m)]))) (Const64 [umagic64(c).s - 1]))
     for {
         t := v.Type
@@ -799,7 +797,7 @@ func rewriteValuedivmod_OpDiv64u(v *Value) bool {
             break
         }
         c := auxIntToInt64(v_1.AuxInt)
-        if !(umagicOK64(c) && config.useAvg && config.useHmul) {
+        if !(umagicOK64(c)) {
             break
         }
         v.reset(OpRsh64Ux64)
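The intrinsics change below registers math/bits.Mul64 on every
architecture (add with all... instead of an explicit architecture list),
so on wasm a 64x64->128 multiply now compiles, through Mul64uhilo and the
Select0/Select1 rules above, into one Hmul64u plus one I64Mul rather than
a call. For example:

    package main

    import (
        "fmt"
        "math/bits"
    )

    func main() {
        // On wasm after this CL: hi lowers via Hmul64u (Select0 rule),
        // lo lowers to a single I64Mul (Select1 rule).
        hi, lo := bits.Mul64(1<<40+3, 1<<40+5)
        fmt.Println(hi, lo) // 65536 8796093022223
    }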
"OnesCount64", p8...) - addF("math/bits", "Mul64", + add("math/bits", "Mul64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1]) }, - sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.PPC64, sys.S390X, sys.MIPS64, sys.MIPS, sys.RISCV64, sys.Loong64) + all...) alias("math/bits", "Mul", "math/bits", "Mul64", p8...) alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...) addF("math/bits", "Add64", diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go index 782426215c..713adc0e8b 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics_test.go +++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go @@ -1328,6 +1328,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ {"s390x", "sync/atomic", "SwapUint64"}: struct{}{}, {"s390x", "sync/atomic", "SwapUintptr"}: struct{}{}, {"s390x", "crypto/internal/constanttime", "boolToUint8"}: struct{}{}, + {"wasm", "internal/runtime/math", "Mul64"}: struct{}{}, {"wasm", "internal/runtime/sys", "GetCallerPC"}: struct{}{}, {"wasm", "internal/runtime/sys", "GetCallerSP"}: struct{}{}, {"wasm", "internal/runtime/sys", "GetClosurePtr"}: struct{}{}, @@ -1344,11 +1345,14 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ {"wasm", "math", "RoundToEven"}: struct{}{}, {"wasm", "math", "Trunc"}: struct{}{}, {"wasm", "math", "sqrt"}: struct{}{}, + {"wasm", "math/big", "mulWW"}: struct{}{}, {"wasm", "math/bits", "Len"}: struct{}{}, {"wasm", "math/bits", "Len16"}: struct{}{}, {"wasm", "math/bits", "Len32"}: struct{}{}, {"wasm", "math/bits", "Len64"}: struct{}{}, {"wasm", "math/bits", "Len8"}: struct{}{}, + {"wasm", "math/bits", "Mul"}: struct{}{}, + {"wasm", "math/bits", "Mul64"}: struct{}{}, {"wasm", "math/bits", "OnesCount"}: struct{}{}, {"wasm", "math/bits", "OnesCount16"}: struct{}{}, {"wasm", "math/bits", "OnesCount32"}: struct{}{}, diff --git a/test/codegen/divmod.go b/test/codegen/divmod.go index 98d0852398..9de091af7a 100644 --- a/test/codegen/divmod.go +++ b/test/codegen/divmod.go @@ -124,6 +124,7 @@ func div7_int8(i int8) int8 { // arm64: "MULW" // arm64: "SBFX [$]10, R[0-9]+, [$]22," // arm64: "SUB R[0-9]+->31," + // wasm: "I64Const [$]147" return i / 7 } @@ -136,6 +137,7 @@ func div7_int16(i int16) int16 { // arm64: "MULW" // arm64: "SBFX [$]18, R[0-9]+, [$]14," // arm64: "SUB R[0-9]+->31," + // wasm: "I64Const [$]37450" return i / 7 } @@ -145,6 +147,7 @@ func div7_int32(i int32) int32 { // arm64: "MUL " // arm64: "ASR [$]34," // arm64: "SUB R[0-9]+->63," + // wasm: "I64Const [$]2454267027" return i / 7 } @@ -160,6 +163,7 @@ func div9_int32(i int32) int32 { // arm64: "MUL " // arm64: "ASR [$]35," // arm64: "SUB R[0-9]+->63," + // wasm: "I64Const [$]3817748708" return i / 9 } @@ -170,6 +174,8 @@ func div7_int64(i int64) int64 { // arm64: "SMULH" // arm64: "ASR [$]1," // arm64: "SUB R[0-9]+->63," + // wasm: "I64Const [$]613566757" + // wasm: "I64Const [$]1227133513" return i / 7 } @@ -185,6 +191,7 @@ func div3_int32(i int32) int32 { // arm64: "MUL" // arm64: "ASR [$]33," // arm64: "SUB R[0-9]+->63," + // wasm: "I64Const [$]2863311531" return i / 3 } @@ -195,6 +202,8 @@ func div3_int64(i int64) int64 { // arm64: "ADD" // arm64: "ASR [$]1," // arm64: "SUB R[0-9]+->63," + // wasm: "I64Const [$]-1431655766" + // wasm: "I64Const [$]2863311531" return i / 3 } @@ -211,6 +220,8 @@ func div7_int16u(i int16) int16 { // arm64: "MULW" // arm64: "UBFX [$]18, 
@@ -211,6 +220,8 @@ func div7_int16u(i int16) int16 {
     // arm64: "MULW"
     // arm64: "UBFX [$]18, R[0-9]+, [$]14,"
     // arm64: -"SUB"
+    // wasm: "I64Const [$]37450"
+    // wasm: -"I64Sub"
     return i / 7
 }
@@ -226,6 +237,8 @@ func div7_int32u(i int32) int32 {
     // arm64: "MUL"
     // arm64: "LSR [$]34,"
     // arm64: -"SUB"
+    // wasm: "I64Const [$]2454267027"
+    // wasm: -"I64Sub"
     return i / 7
 }
@@ -238,6 +251,9 @@ func div7_int64u(i int64) int64 {
     // arm64: "UMULH"
     // arm64: "LSR [$]2,"
     // arm64: -"SUB"
+    // wasm: "I64Const [$]1227133514"
+    // wasm: "I64Const [$]2454267026"
+    // wasm: -"I64Sub"
     return i / 7
 }
@@ -249,6 +265,7 @@ func div7_uint8(i uint8) uint8 {
     // arm64: "MOVD [$]293,"
     // arm64: "MULW"
     // arm64: "UBFX [$]11, R[0-9]+, [$]21,"
+    // wasm: "I64Const [$]293"
     return i / 7
 }
@@ -257,6 +274,7 @@ func div7_uint16(i uint16) uint16 {
     // arm64: "MOVD [$]74899,"
     // arm64: "MUL"
     // arm64: "LSR [$]19,"
+    // wasm: "I64Const [$]74899"
     return i / 7
 }
@@ -267,6 +285,7 @@ func div3_uint16(i uint16) uint16 {
     // arm64: "MOVD [$]87382,"
     // arm64: "MUL"
     // arm64: "LSR [$]18,"
+    // wasm: "I64Const [$]87382"
     return i / 3
 }
@@ -275,6 +294,7 @@ func div3_uint32(i uint32) uint32 {
     // arm64: "MOVD [$]2863311531,"
     // arm64: "MUL"
     // arm64: "LSR [$]33,"
+    // wasm: "I64Const [$]2863311531"
     return i / 3
 }
@@ -286,6 +306,8 @@ func div3_uint64(i uint64) uint64 {
     // arm64: "MOVD [$]-6148914691236517205,"
     // arm64: "UMULH"
     // arm64: "LSR [$]1,"
+    // wasm: "I64Const [$]2863311530"
+    // wasm: "I64Const [$]2863311531"
     return i / 3
 }
@@ -307,6 +329,7 @@ func div14_uint32(i uint32) uint32 {
     // arm64: "MOVD [$]2454267027,"
     // arm64: "MUL"
     // arm64: "LSR [$]34,"
+    // wasm: "I64Const [$]2454267027"
     return i / 14
 }
@@ -318,6 +341,8 @@ func div14_uint64(i uint64) uint64 {
     // arm64: "MOVD [$]-7905747460161236406,"
     // arm64: "UMULH"
     // arm64: "LSR [$]2,"
+    // wasm: "I64Const [$]1227133514"
+    // wasm: "I64Const [$]2454267026"
     return i / 14
 }
@@ -345,6 +370,7 @@ func div7_uint32(i uint32) uint32 {
     // arm64: "SUB"
     // arm64: "ADD R[0-9]+>>1,"
     // arm64: "LSR [$]34,"
+    // wasm: "I64Const [$]613566757"
     return i / 7
 }
@@ -358,6 +384,8 @@ func div7_uint64(i uint64) uint64 {
     // arm64: "SUB"
     // arm64: "ADD R[0-9]+>>1,"
     // arm64: "LSR [$]2,"
+    // wasm: "I64Const [$]613566756"
+    // wasm: "I64Const [$]2454267027"
     return i / 7
 }
@@ -370,6 +398,8 @@ func div12345_uint64(i uint64) uint64 {
     // arm64: "MOVD [$]-6205696892516465602,"
     // arm64: "UMULH"
     // arm64: "LSR [$]13,"
+    // wasm: "I64Const [$]835683390"
+    // wasm: "I64Const [$]2850090894"
     return i / 12345
 }
@@ -480,7 +510,7 @@ func div_divis32_uint8(i uint8) (uint8, bool) {
     // arm64: "UBFX [$]5, R[0-9]+, [$]3"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET EQ"
-    return i/32, i%32 == 0
+    return i / 32, i%32 == 0
 }
@@ -490,7 +520,7 @@ func div_ndivis32_uint8(i uint8) (uint8, bool) {
     // arm64: "UBFX [$]5, R[0-9]+, [$]3"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET NE"
-    return i/32, i%32 != 0
+    return i / 32, i%32 != 0
 }
@@ -500,7 +530,7 @@ func div_divis32_uint16(i uint16) (uint16, bool) {
     // arm64: "UBFX [$]5, R[0-9]+, [$]11"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET EQ"
-    return i/32, i%32 == 0
+    return i / 32, i%32 == 0
 }
@@ -510,7 +540,7 @@ func div_ndivis32_uint16(i uint16) (uint16, bool) {
     // arm64: "UBFX [$]5, R[0-9]+, [$]11,"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET NE"
-    return i/32, i%32 != 0
+    return i / 32, i%32 != 0
 }
@@ -520,7 +550,7 @@ func div_divis32_uint32(i uint32) (uint32, bool) {
     // arm64: "UBFX [$]5, R[0-9]+, [$]27,"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET EQ"
-    return i/32, i%32 == 0
+    return i / 32, i%32 == 0
 }
@@ -530,7 +560,7 @@ func div_ndivis32_uint32(i uint32) (uint32, bool) {
     // arm64: "UBFX [$]5, R[0-9]+, [$]27,"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET NE"
-    return i/32, i%32 != 0
+    return i / 32, i%32 != 0
 }
@@ -541,7 +571,7 @@ func div_divis32_uint64(i uint64) (uint64, bool) {
     // arm64: "LSR [$]5,"
     // arm64: "TST [$]31,"
     // arm64: "CSET EQ"
-    return i/32, i%32 == 0
+    return i / 32, i%32 == 0
 }
@@ -552,7 +582,7 @@ func div_ndivis32_uint64(i uint64) (uint64, bool) {
     // arm64: "LSR [$]5,"
     // arm64: "TST [$]31,"
     // arm64: "CSET NE"
-    return i/32, i%32 != 0
+    return i / 32, i%32 != 0
 }
@@ -566,7 +596,7 @@ func div_divis32_int8(i int8) (int8, bool) {
     // arm64: "SBFX [$]5, R[0-9]+, [$]3,"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET EQ"
-    return i/32, i%32 == 0
+    return i / 32, i%32 == 0
 }
@@ -580,7 +610,7 @@ func div_ndivis32_int8(i int8) (int8, bool) {
     // arm64: "SBFX [$]5, R[0-9]+, [$]3,"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET NE"
-    return i/32, i%32 != 0
+    return i / 32, i%32 != 0
 }
@@ -594,7 +624,7 @@ func div_divis32_int16(i int16) (int16, bool) {
     // arm64: "SBFX [$]5, R[0-9]+, [$]11,"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET EQ"
-    return i/32, i%32 == 0
+    return i / 32, i%32 == 0
 }
@@ -608,7 +638,7 @@ func div_ndivis32_int16(i int16) (int16, bool) {
     // arm64: "SBFX [$]5, R[0-9]+, [$]11,"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET NE"
-    return i/32, i%32 != 0
+    return i / 32, i%32 != 0
 }
@@ -622,7 +652,7 @@ func div_divis32_int32(i int32) (int32, bool) {
     // arm64: "SBFX [$]5, R[0-9]+, [$]27,"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET EQ"
-    return i/32, i%32 == 0
+    return i / 32, i%32 == 0
 }
@@ -636,7 +666,7 @@ func div_ndivis32_int32(i int32) (int32, bool) {
     // arm64: "SBFX [$]5, R[0-9]+, [$]27,"
     // arm64: "TSTW [$]31,"
     // arm64: "CSET NE"
-    return i/32, i%32 != 0
+    return i / 32, i%32 != 0
 }
@@ -651,7 +681,7 @@ func div_divis32_int64(i int64) (int64, bool) {
     // arm64: "ASR [$]5,"
     // arm64: "TST [$]31,"
     // arm64: "CSET EQ"
-    return i/32, i%32 == 0
+    return i / 32, i%32 == 0
 }
@@ -666,7 +696,7 @@ func div_ndivis32_int64(i int64) (int64, bool) {
     // arm64: "ASR [$]5,"
     // arm64: "TST [$]31,"
     // arm64: "CSET NE"
-    return i/32, i%32 != 0
+    return i / 32, i%32 != 0
 }
 
 // Divisibility and non-divisibility by non-power-of-two.
@@ -923,7 +953,7 @@ func div_divis6_uint8(i uint8) (uint8, bool) {
     // arm64: "UBFX [$]11, R[0-9]+, [$]21,"
     // arm64: "CSET EQ"
     // arm64: -"RO[RL]"
-    return i/6, i%6 == 0
+    return i / 6, i%6 == 0
 }
@@ -936,7 +966,7 @@ func div_ndivis6_uint8(i uint8) (uint8, bool) {
     // arm64: "UBFX [$]11, R[0-9]+, [$]21,"
     // arm64: "CSET NE"
     // arm64: -"RO[RL]"
-    return i/6, i%6 != 0
+    return i / 6, i%6 != 0
 }
@@ -950,7 +980,7 @@ func div_divis6_uint16(i uint16) (uint16, bool) {
     // arm64: "LSR [$]19,"
     // arm64: "CSET EQ"
     // arm64: -"RO[RL]"
-    return i/6, i%6 == 0
+    return i / 6, i%6 == 0
 }
@@ -964,7 +994,7 @@ func div_ndivis6_uint16(i uint16) (uint16, bool) {
     // arm64: "LSR [$]19,"
     // arm64: "CSET NE"
     // arm64: -"RO[RL]"
-    return i/6, i%6 != 0
+    return i / 6, i%6 != 0
 }
@@ -978,7 +1008,7 @@ func div_divis6_uint32(i uint32) (uint32, bool) {
     // arm64: "LSR [$]34,"
     // arm64: "CSET EQ"
     // arm64: -"RO[RL]"
-    return i/6, i%6 == 0
+    return i / 6, i%6 == 0
 }
@@ -992,7 +1022,7 @@ func div_ndivis6_uint32(i uint32) (uint32, bool) {
     // arm64: "LSR [$]34,"
     // arm64: "CSET NE"
     // arm64: -"RO[RL]"
-    return i/6, i%6 != 0
+    return i / 6, i%6 != 0
 }
@@ -1009,7 +1039,7 @@ func div_divis6_uint64(i uint64) (uint64, bool) {
     // arm64: "LSR [$]2,"
     // arm64: "CSET EQ"
     // arm64: -"RO[RL]"
-    return i/6, i%6 == 0
+    return i / 6, i%6 == 0
 }
@@ -1026,7 +1056,7 @@ func div_ndivis6_uint64(i uint64) (uint64, bool) {
     // arm64: "LSR [$]2,"
     // arm64: "CSET NE"
     // arm64: -"RO[RL]"
-    return i/6, i%6 != 0
+    return i / 6, i%6 != 0
 }
@@ -1042,7 +1072,7 @@ func div_divis6_int8(i int8) (int8, bool) {
     // arm64: "SUB R[0-9]+->31,"
     // arm64: "CSET EQ"
     // arm64: -"RO[RL]"
-    return i/6, i%6 == 0
+    return i / 6, i%6 == 0
 }
@@ -1058,7 +1088,7 @@ func div_ndivis6_int8(i int8) (int8, bool) {
     // arm64: "SUB R[0-9]+->31,"
     // arm64: "CSET NE"
     // arm64: -"RO[RL]"
-    return i/6, i%6 != 0
+    return i / 6, i%6 != 0
 }
@@ -1074,7 +1104,7 @@ func div_divis6_int16(i int16) (int16, bool) {
     // arm64: "SUB R[0-9]+->31,"
     // arm64: "CSET EQ"
     // arm64: -"RO[RL]"
-    return i/6, i%6 == 0
+    return i / 6, i%6 == 0
 }
@@ -1090,7 +1120,7 @@ func div_ndivis6_int16(i int16) (int16, bool) {
     // arm64: "SUB R[0-9]+->31,"
     // arm64: "CSET NE"
     // arm64: -"RO[RL]"
-    return i/6, i%6 != 0
+    return i / 6, i%6 != 0
 }
@@ -1107,7 +1137,7 @@ func div_divis6_int32(i int32) (int32, bool) {
     // arm64: "SUB R[0-9]+->63,"
     // arm64: "CSET EQ"
     // arm64: -"RO[RL]"
-    return i/6, i%6 == 0
+    return i / 6, i%6 == 0
 }
@@ -1124,7 +1154,7 @@ func div_ndivis6_int32(i int32) (int32, bool) {
     // arm64: "SUB R[0-9]+->63,"
     // arm64: "CSET NE"
     // arm64: -"RO[RL]"
-    return i/6, i%6 != 0
+    return i / 6, i%6 != 0
 }
@@ -1145,7 +1175,7 @@ func div_divis6_int64(i int64) (int64, bool) {
     // arm64: "SUB R[0-9]+->63,"
     // arm64: "CSET EQ"
     // arm64: -"RO[RL]"
-    return i/6, i%6 == 0
+    return i / 6, i%6 == 0
 }
@@ -1166,5 +1196,5 @@ func div_ndivis6_int64(i int64) (int64, bool) {
     // arm64: "SUB R[0-9]+->63,"
     // arm64: "CSET NE"
     // arm64: -"RO[RL]"
-    return i/6, i%6 != 0
+    return i / 6, i%6 != 0
 }