From 6e876f19857a8fbd259571080f7f91bc03276559 Mon Sep 17 00:00:00 2001
From: Michael Munday <mike.munday@ibm.com>
Date: Thu, 4 Jun 2020 10:55:01 -0700
Subject: [PATCH] cmd/compile: clean up and optimize s390x multiplication rules
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Some of the existing optimizations aren't triggered because they
are handled by the generic rules so this CL removes them. Also
some constraints were copied without much thought from the amd64
rules and they don't make sense on s390x, so we remove those
constraints.

Finally, add a 'multiply by the sum of two powers of two'
optimization. This makes sense on s390x as shifts are low latency
and can also sometimes be optimized further (especially if we add
support for RISBG instructions).

name                   old time/op  new time/op  delta
IntMulByConst/3-8      1.70ns Â±11%  1.10ns Â± 5%  -35.26%  (p=0.000 n=10+10)
IntMulByConst/5-8      1.64ns Â± 7%  1.10ns Â± 4%  -32.94%  (p=0.000 n=10+9)
IntMulByConst/12-8     1.65ns Â± 6%  1.20ns Â± 4%  -27.16%  (p=0.000 n=10+9)
IntMulByConst/120-8    1.66ns Â± 4%  1.22ns Â±13%  -26.43%  (p=0.000 n=10+10)
IntMulByConst/-120-8   1.65ns Â± 7%  1.19ns Â± 4%  -28.06%  (p=0.000 n=9+10)
IntMulByConst/65537-8  0.86ns Â± 9%  1.12ns Â±12%  +30.41%  (p=0.000 n=10+10)
IntMulByConst/65538-8  1.65ns Â± 5%  1.23ns Â± 5%  -25.11%  (p=0.000 n=10+10)

Change-Id: Ib196e6bff1e97febfd266134d0a2b2a62897989f
Reviewed-on: https://go-review.googlesource.com/c/go/+/248937
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
---
 src/cmd/compile/internal/ssa/gen/S390X.rules  |  51 +++-
 src/cmd/compile/internal/ssa/rewriteS390X.go  | 268 ++++++++++--------
 .../compile/internal/test/mulconst_test.go    | 242 ++++++++++++++++
 test/codegen/arithmetic.go                    |   6 +
 4 files changed, 441 insertions(+), 126 deletions(-)
 create mode 100644 src/cmd/compile/internal/test/mulconst_test.go
diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules
index d3234c1a00..5e4c436ca1 100644
--- a/src/cmd/compile/internal/ssa/gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
@@ -716,20 +716,40 @@
 (ANDWconst [0xFF] x) => (MOVBZreg x)
 (ANDWconst [0xFFFF] x) => (MOVHZreg x)
 
-// strength reduction
-(MULLDconst [-1] x) => (NEG x)
-(MULLDconst [0] _) => (MOVDconst [0])
-(MULLDconst [1] x) => x
-(MULLDconst [c] x) && isPowerOfTwo(c) -> (SLDconst [log2(c)] x)
-(MULLDconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUB (SLDconst <v.Type> [log2(c+1)] x) x)
-(MULLDconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADD (SLDconst <v.Type> [log2(c-1)] x) x)
-
-(MULLWconst [-1] x) => (NEGW x)
-(MULLWconst [0] _) => (MOVDconst [0])
-(MULLWconst [1] x) => x
-(MULLWconst [c] x) && isPowerOfTwo(c) -> (SLWconst [log2(c)] x)
-(MULLWconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUBW (SLWconst <v.Type> [log2(c+1)] x) x)
-(MULLWconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADDW (SLWconst <v.Type> [log2(c-1)] x) x)
+// Strength reduce multiplication to the sum (or difference) of two powers of two.
+//
+// Examples:
+//     5x -> 4x + 1x
+//    10x -> 8x + 2x
+//   120x -> 128x - 8x
+//  -120x -> 8x - 128x
+//
+// We know that the rightmost bit of any positive value, once isolated, must either
+// be a power of 2 (because it is a single bit) or 0 (if the original value is 0).
+// In all of these rules we use a rightmost bit calculation to determine one operand
+// for the addition or subtraction. We then just need to calculate if the other
+// operand is a valid power of 2 before we can match the rule.
+//
+// Notes:
+//   - the generic rules have already matched single powers of two so we ignore them here
+//   - isPowerOfTwo32 asserts that its argument is greater than 0
+//   - c&(c-1) = clear rightmost bit
+//   - c&^(c-1) = isolate rightmost bit
+
+// c = 2Ë£ + 2Ê¸ => c - 2Ë£ = 2Ê¸
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c&(c-1))
+  => ((ADD|ADDW) (SL(D|W)const <t> x [int8(log32(c&(c-1)))])
+                 (SL(D|W)const <t> x [int8(log32(c&^(c-1)))]))
+
+// c = 2Ê¸ - 2Ë£ => c + 2Ë£ = 2Ê¸
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c+(c&^(c-1)))
+  => ((SUB|SUBW) (SL(D|W)const <t> x [int8(log32(c+(c&^(c-1))))])
+                 (SL(D|W)const <t> x [int8(log32(c&^(c-1)))]))
+
+// c = 2Ë£ - 2Ê¸ => -c + 2Ë£ = 2Ê¸
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(-c+(-c&^(-c-1)))
+  => ((SUB|SUBW) (SL(D|W)const <t> x [int8(log32(-c&^(-c-1)))])
+                 (SL(D|W)const <t> x [int8(log32(-c+(-c&^(-c-1))))]))
 
 // Fold ADD into MOVDaddr. Odd offsets from SB shouldn't be folded (LARL can't handle them).
 (ADDconst [c] (MOVDaddr [d] {s} x:(SB))) && ((c+d)&1 == 0) && is32Bit(c+d) -> (MOVDaddr [c+d] {s} x)
@@ -1133,6 +1153,9 @@
 (XORconst [0] x)                  => x
 (XORWconst [c] x) && int32(c)==0   => x
 
+// Shifts by zero (may be inserted during multiplication strength reduction).
+((SLD|SLW|SRD|SRW|SRAD|SRAW)const x [0]) => x
+
 // Convert constant subtracts to constant adds.
 (SUBconst [c] x) && c != -(1<<31) => (ADDconst [-c] x)
 (SUBWconst [c] x) -> (ADDWconst [int64(int32(-c))] x)
diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go
index dc9b143562..536f8db320 100644
--- a/src/cmd/compile/internal/ssa/rewriteS390X.go
+++ b/src/cmd/compile/internal/ssa/rewriteS390X.go
@@ -732,8 +732,12 @@ func rewriteValueS390X(v *Value) bool {
 		return rewriteValueS390X_OpS390XRLLG(v)
 	case OpS390XSLD:
 		return rewriteValueS390X_OpS390XSLD(v)
+	case OpS390XSLDconst:
+		return rewriteValueS390X_OpS390XSLDconst(v)
 	case OpS390XSLW:
 		return rewriteValueS390X_OpS390XSLW(v)
+	case OpS390XSLWconst:
+		return rewriteValueS390X_OpS390XSLWconst(v)
 	case OpS390XSRAD:
 		return rewriteValueS390X_OpS390XSRAD(v)
 	case OpS390XSRADconst:
@@ -748,6 +752,8 @@ func rewriteValueS390X(v *Value) bool {
 		return rewriteValueS390X_OpS390XSRDconst(v)
 	case OpS390XSRW:
 		return rewriteValueS390X_OpS390XSRW(v)
+	case OpS390XSRWconst:
+		return rewriteValueS390X_OpS390XSRWconst(v)
 	case OpS390XSTM2:
 		return rewriteValueS390X_OpS390XSTM2(v)
 	case OpS390XSTMG2:
@@ -13853,81 +13859,64 @@ func rewriteValueS390X_OpS390XMULLD(v *Value) bool {
 func rewriteValueS390X_OpS390XMULLDconst(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (MULLDconst [-1] x)
-	// result: (NEG x)
+	// match: (MULLDconst <t> x [c])
+	// cond: isPowerOfTwo32(c&(c-1))
+	// result: (ADD (SLDconst <t> x [int8(log32(c&(c-1)))]) (SLDconst <t> x [int8(log32(c&^(c-1)))]))
 	for {
-		if auxIntToInt32(v.AuxInt) != -1 {
-			break
-		}
-		x := v_0
-		v.reset(OpS390XNEG)
-		v.AddArg(x)
-		return true
-	}
-	// match: (MULLDconst [0] _)
-	// result: (MOVDconst [0])
-	for {
-		if auxIntToInt32(v.AuxInt) != 0 {
-			break
-		}
-		v.reset(OpS390XMOVDconst)
-		v.AuxInt = int64ToAuxInt(0)
-		return true
-	}
-	// match: (MULLDconst [1] x)
-	// result: x
-	for {
-		if auxIntToInt32(v.AuxInt) != 1 {
-			break
-		}
-		x := v_0
-		v.copyOf(x)
-		return true
-	}
-	// match: (MULLDconst [c] x)
-	// cond: isPowerOfTwo(c)
-	// result: (SLDconst [log2(c)] x)
-	for {
-		c := v.AuxInt
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
 		x := v_0
-		if !(isPowerOfTwo(c)) {
+		if !(isPowerOfTwo32(c & (c - 1))) {
 			break
 		}
-		v.reset(OpS390XSLDconst)
-		v.AuxInt = log2(c)
-		v.AddArg(x)
+		v.reset(OpS390XADD)
+		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(c & (c - 1))))
+		v0.AddArg(x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
 		return true
 	}
-	// match: (MULLDconst [c] x)
-	// cond: isPowerOfTwo(c+1) && c >= 15
-	// result: (SUB (SLDconst <v.Type> [log2(c+1)] x) x)
+	// match: (MULLDconst <t> x [c])
+	// cond: isPowerOfTwo32(c+(c&^(c-1)))
+	// result: (SUB (SLDconst <t> x [int8(log32(c+(c&^(c-1))))]) (SLDconst <t> x [int8(log32(c&^(c-1)))]))
 	for {
-		c := v.AuxInt
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
 		x := v_0
-		if !(isPowerOfTwo(c+1) && c >= 15) {
+		if !(isPowerOfTwo32(c + (c &^ (c - 1)))) {
 			break
 		}
 		v.reset(OpS390XSUB)
-		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, v.Type)
-		v0.AuxInt = log2(c + 1)
+		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(c + (c &^ (c - 1)))))
 		v0.AddArg(x)
-		v.AddArg2(v0, x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
 		return true
 	}
-	// match: (MULLDconst [c] x)
-	// cond: isPowerOfTwo(c-1) && c >= 17
-	// result: (ADD (SLDconst <v.Type> [log2(c-1)] x) x)
+	// match: (MULLDconst <t> x [c])
+	// cond: isPowerOfTwo32(-c+(-c&^(-c-1)))
+	// result: (SUB (SLDconst <t> x [int8(log32(-c&^(-c-1)))]) (SLDconst <t> x [int8(log32(-c+(-c&^(-c-1))))]))
 	for {
-		c := v.AuxInt
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
 		x := v_0
-		if !(isPowerOfTwo(c-1) && c >= 17) {
+		if !(isPowerOfTwo32(-c + (-c &^ (-c - 1)))) {
 			break
 		}
-		v.reset(OpS390XADD)
-		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, v.Type)
-		v0.AuxInt = log2(c - 1)
+		v.reset(OpS390XSUB)
+		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(-c &^ (-c - 1))))
 		v0.AddArg(x)
-		v.AddArg2(v0, x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(-c + (-c &^ (-c - 1)))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
 		return true
 	}
 	// match: (MULLDconst [c] (MOVDconst [d]))
@@ -14097,81 +14086,64 @@ func rewriteValueS390X_OpS390XMULLW(v *Value) bool {
 func rewriteValueS390X_OpS390XMULLWconst(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (MULLWconst [-1] x)
-	// result: (NEGW x)
-	for {
-		if auxIntToInt32(v.AuxInt) != -1 {
-			break
-		}
-		x := v_0
-		v.reset(OpS390XNEGW)
-		v.AddArg(x)
-		return true
-	}
-	// match: (MULLWconst [0] _)
-	// result: (MOVDconst [0])
-	for {
-		if auxIntToInt32(v.AuxInt) != 0 {
-			break
-		}
-		v.reset(OpS390XMOVDconst)
-		v.AuxInt = int64ToAuxInt(0)
-		return true
-	}
-	// match: (MULLWconst [1] x)
-	// result: x
+	// match: (MULLWconst <t> x [c])
+	// cond: isPowerOfTwo32(c&(c-1))
+	// result: (ADDW (SLWconst <t> x [int8(log32(c&(c-1)))]) (SLWconst <t> x [int8(log32(c&^(c-1)))]))
 	for {
-		if auxIntToInt32(v.AuxInt) != 1 {
-			break
-		}
-		x := v_0
-		v.copyOf(x)
-		return true
-	}
-	// match: (MULLWconst [c] x)
-	// cond: isPowerOfTwo(c)
-	// result: (SLWconst [log2(c)] x)
-	for {
-		c := v.AuxInt
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
 		x := v_0
-		if !(isPowerOfTwo(c)) {
+		if !(isPowerOfTwo32(c & (c - 1))) {
 			break
 		}
-		v.reset(OpS390XSLWconst)
-		v.AuxInt = log2(c)
-		v.AddArg(x)
+		v.reset(OpS390XADDW)
+		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(c & (c - 1))))
+		v0.AddArg(x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
 		return true
 	}
-	// match: (MULLWconst [c] x)
-	// cond: isPowerOfTwo(c+1) && c >= 15
-	// result: (SUBW (SLWconst <v.Type> [log2(c+1)] x) x)
+	// match: (MULLWconst <t> x [c])
+	// cond: isPowerOfTwo32(c+(c&^(c-1)))
+	// result: (SUBW (SLWconst <t> x [int8(log32(c+(c&^(c-1))))]) (SLWconst <t> x [int8(log32(c&^(c-1)))]))
 	for {
-		c := v.AuxInt
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
 		x := v_0
-		if !(isPowerOfTwo(c+1) && c >= 15) {
+		if !(isPowerOfTwo32(c + (c &^ (c - 1)))) {
 			break
 		}
 		v.reset(OpS390XSUBW)
-		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, v.Type)
-		v0.AuxInt = log2(c + 1)
+		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(c + (c &^ (c - 1)))))
 		v0.AddArg(x)
-		v.AddArg2(v0, x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
 		return true
 	}
-	// match: (MULLWconst [c] x)
-	// cond: isPowerOfTwo(c-1) && c >= 17
-	// result: (ADDW (SLWconst <v.Type> [log2(c-1)] x) x)
+	// match: (MULLWconst <t> x [c])
+	// cond: isPowerOfTwo32(-c+(-c&^(-c-1)))
+	// result: (SUBW (SLWconst <t> x [int8(log32(-c&^(-c-1)))]) (SLWconst <t> x [int8(log32(-c+(-c&^(-c-1))))]))
 	for {
-		c := v.AuxInt
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
 		x := v_0
-		if !(isPowerOfTwo(c-1) && c >= 17) {
+		if !(isPowerOfTwo32(-c + (-c &^ (-c - 1)))) {
 			break
 		}
-		v.reset(OpS390XADDW)
-		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, v.Type)
-		v0.AuxInt = log2(c - 1)
+		v.reset(OpS390XSUBW)
+		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(-c &^ (-c - 1))))
 		v0.AddArg(x)
-		v.AddArg2(v0, x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(-c + (-c &^ (-c - 1)))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
 		return true
 	}
 	// match: (MULLWconst [c] (MOVDconst [d]))
@@ -16826,6 +16798,20 @@ func rewriteValueS390X_OpS390XSLD(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueS390X_OpS390XSLDconst(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (SLDconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
+	return false
+}
 func rewriteValueS390X_OpS390XSLW(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
@@ -16960,6 +16946,20 @@ func rewriteValueS390X_OpS390XSLW(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueS390X_OpS390XSLWconst(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (SLWconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
+	return false
+}
 func rewriteValueS390X_OpS390XSRAD(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
@@ -17096,6 +17096,16 @@ func rewriteValueS390X_OpS390XSRAD(v *Value) bool {
 }
 func rewriteValueS390X_OpS390XSRADconst(v *Value) bool {
 	v_0 := v.Args[0]
+	// match: (SRADconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
 	// match: (SRADconst [c] (MOVDconst [d]))
 	// result: (MOVDconst [d>>uint64(c)])
 	for {
@@ -17246,6 +17256,16 @@ func rewriteValueS390X_OpS390XSRAW(v *Value) bool {
 }
 func rewriteValueS390X_OpS390XSRAWconst(v *Value) bool {
 	v_0 := v.Args[0]
+	// match: (SRAWconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
 	// match: (SRAWconst [c] (MOVDconst [d]))
 	// result: (MOVDconst [int64(int32(d))>>uint64(c)])
 	for {
@@ -17416,6 +17436,16 @@ func rewriteValueS390X_OpS390XSRDconst(v *Value) bool {
 		v.AddArg(v0)
 		return true
 	}
+	// match: (SRDconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
 	return false
 }
 func rewriteValueS390X_OpS390XSRW(v *Value) bool {
@@ -17552,6 +17582,20 @@ func rewriteValueS390X_OpS390XSRW(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueS390X_OpS390XSRWconst(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (SRWconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
+	return false
+}
 func rewriteValueS390X_OpS390XSTM2(v *Value) bool {
 	v_3 := v.Args[3]
 	v_2 := v.Args[2]
diff --git a/src/cmd/compile/internal/test/mulconst_test.go b/src/cmd/compile/internal/test/mulconst_test.go
new file mode 100644
index 0000000000..314cab32de
--- /dev/null
+++ b/src/cmd/compile/internal/test/mulconst_test.go
@@ -0,0 +1,242 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package test
+
+import "testing"
+
+// Benchmark multiplication of an integer by various constants.
+//
+// The comment above each sub-benchmark provides an example of how the
+// target multiplication operation might be implemented using shift
+// (multiplication by a power of 2), addition and subtraction
+// operations. It is platform-dependent whether these transformations
+// are actually applied.
+
+var (
+	mulSinkI32 int32
+	mulSinkI64 int64
+	mulSinkU32 uint32
+	mulSinkU64 uint64
+)
+
+func BenchmarkMulconstI32(b *testing.B) {
+	// 3x = 2x + x
+	b.Run("3", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 3
+		}
+		mulSinkI32 = x
+	})
+	// 5x = 4x + x
+	b.Run("5", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 5
+		}
+		mulSinkI32 = x
+	})
+	// 12x = 8x + 4x
+	b.Run("12", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 12
+		}
+		mulSinkI32 = x
+	})
+	// 120x = 128x - 8x
+	b.Run("120", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 120
+		}
+		mulSinkI32 = x
+	})
+	// -120x = 8x - 120x
+	b.Run("-120", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= -120
+		}
+		mulSinkI32 = x
+	})
+	// 65537x = 65536x + x
+	b.Run("65537", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65537
+		}
+		mulSinkI32 = x
+	})
+	// 65538x = 65536x + 2x
+	b.Run("65538", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65538
+		}
+		mulSinkI32 = x
+	})
+}
+
+func BenchmarkMulconstI64(b *testing.B) {
+	// 3x = 2x + x
+	b.Run("3", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 3
+		}
+		mulSinkI64 = x
+	})
+	// 5x = 4x + x
+	b.Run("5", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 5
+		}
+		mulSinkI64 = x
+	})
+	// 12x = 8x + 4x
+	b.Run("12", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 12
+		}
+		mulSinkI64 = x
+	})
+	// 120x = 128x - 8x
+	b.Run("120", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 120
+		}
+		mulSinkI64 = x
+	})
+	// -120x = 8x - 120x
+	b.Run("-120", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= -120
+		}
+		mulSinkI64 = x
+	})
+	// 65537x = 65536x + x
+	b.Run("65537", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65537
+		}
+		mulSinkI64 = x
+	})
+	// 65538x = 65536x + 2x
+	b.Run("65538", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65538
+		}
+		mulSinkI64 = x
+	})
+}
+
+func BenchmarkMulconstU32(b *testing.B) {
+	// 3x = 2x + x
+	b.Run("3", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 3
+		}
+		mulSinkU32 = x
+	})
+	// 5x = 4x + x
+	b.Run("5", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 5
+		}
+		mulSinkU32 = x
+	})
+	// 12x = 8x + 4x
+	b.Run("12", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 12
+		}
+		mulSinkU32 = x
+	})
+	// 120x = 128x - 8x
+	b.Run("120", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 120
+		}
+		mulSinkU32 = x
+	})
+	// 65537x = 65536x + x
+	b.Run("65537", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65537
+		}
+		mulSinkU32 = x
+	})
+	// 65538x = 65536x + 2x
+	b.Run("65538", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65538
+		}
+		mulSinkU32 = x
+	})
+}
+
+func BenchmarkMulconstU64(b *testing.B) {
+	// 3x = 2x + x
+	b.Run("3", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 3
+		}
+		mulSinkU64 = x
+	})
+	// 5x = 4x + x
+	b.Run("5", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 5
+		}
+		mulSinkU64 = x
+	})
+	// 12x = 8x + 4x
+	b.Run("12", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 12
+		}
+		mulSinkU64 = x
+	})
+	// 120x = 128x - 8x
+	b.Run("120", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 120
+		}
+		mulSinkU64 = x
+	})
+	// 65537x = 65536x + x
+	b.Run("65537", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65537
+		}
+		mulSinkU64 = x
+	})
+	// 65538x = 65536x + 2x
+	b.Run("65538", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65538
+		}
+		mulSinkU64 = x
+	})
+}
diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go
index 8f25974376..9f30ec8ce4 100644
--- a/test/codegen/arithmetic.go
+++ b/test/codegen/arithmetic.go
@@ -71,9 +71,15 @@ func Mul_96(n int) int {
 	// 386:`SHLL\t[$]5`,`LEAL\t\(.*\)\(.*\*2\),`,-`IMULL`
 	// arm64:`LSL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
 	// arm:`SLL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
+	// s390x:`SLD\t[$]5`,`SLD\t[$]6`,-`MULLD`
 	return n * 96
 }
 
+func Mul_n120(n int) int {
+	// s390x:`SLD\t[$]3`,`SLD\t[$]7`,-`MULLD`
+	return n * -120
+}
+
 func MulMemSrc(a []uint32, b []float32) {
 	// 386:`IMULL\s4\([A-Z]+\),\s[A-Z]+`
 	a[0] *= a[1]
-- 
2.52.0