From 478d86446e88dc9e0b46e08914cb564d7c705d1e Mon Sep 17 00:00:00 2001
From: Jorropo
Date: Sat, 20 Dec 2025 14:11:35 +0100
Subject: [PATCH] cmd/compile: on amd64 use 32-bit copies for 64-bit copies of 32-bit values

Fixes #76449

This saves a single byte (the REX prefix) per OpCopy it triggers on.

Change-Id: I1eab364d07354555ba2f23ffd2f9c522d4a04bd0
Reviewed-on: https://go-review.googlesource.com/c/go/+/731640
Reviewed-by: Michael Pratt
Reviewed-by: Carlos Amedee
Reviewed-by: Keith Randall
Auto-Submit: Jorropo
LUCI-TryBot-Result: Go LUCI
---
 src/cmd/compile/internal/amd64/ssa.go          | 19 ++++++++++++++++---
 .../internal/ssa/_gen/AMD64latelower.rules     |  6 +++---
 .../internal/ssa/_gen/ARM64latelower.rules     |  2 +-
 src/cmd/compile/internal/ssa/rewrite.go        | 22 ++++++++++++--------
 .../internal/ssa/rewriteAMD64latelower.go      | 12 ++++++------
 .../internal/ssa/rewriteARM64latelower.go      |  4 ++--
 test/codegen/constants.go                      |  9 +++++++++
 7 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index e9a566d759..381a91e228 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -43,6 +43,10 @@ func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
 	}
 }
 
+func isGPReg(r int16) bool {
+	return x86.REG_AL <= r && r <= x86.REG_R15
+}
+
 func isFPReg(r int16) bool {
 	return x86.REG_X0 <= r && r <= x86.REG_Z31
 }
@@ -1225,14 +1229,23 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		if v.Type.IsMemory() {
 			return
 		}
-		x := v.Args[0].Reg()
+		arg := v.Args[0]
+		x := arg.Reg()
 		y := v.Reg()
 		if v.Type.IsSIMD() {
-			x = simdOrMaskReg(v.Args[0])
+			x = simdOrMaskReg(arg)
 			y = simdOrMaskReg(v)
 		}
 		if x != y {
-			opregreg(s, moveByRegsWidth(y, x, v.Type.Size()), y, x)
+			width := v.Type.Size()
+			if width == 8 && isGPReg(y) && ssa.ZeroUpper32Bits(arg, 3) {
+				// The source was naturally zero-extended from 32 to 64 bits,
+				// but we are asked to do a full 64-bit copy.
+				// Save the REX prefix byte in I-CACHE by using a 32-bit move,
+				// since it zeroes the upper 32 bits anyway.
+				width = 4
+			}
+			opregreg(s, moveByRegsWidth(y, x, width), y, x)
 		}
 	case ssa.OpLoadReg:
 		if v.Type.IsFlags() {
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules b/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules
index ead4ec45f1..9bdb5f8d80 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules
@@ -8,6 +8,6 @@
 (SHR(Q|L) x y) && buildcfg.GOAMD64 >= 3 => (SHRX(Q|L) x y)
 
 // See comments in ARM64latelower.rules for why these are here.
-(MOVLQZX x) && zeroUpper32Bits(x,3) => x
-(MOVWQZX x) && zeroUpper48Bits(x,3) => x
-(MOVBQZX x) && zeroUpper56Bits(x,3) => x
+(MOVLQZX x) && ZeroUpper32Bits(x,3) => x
+(MOVWQZX x) && ZeroUpper48Bits(x,3) => x
+(MOVBQZX x) && ZeroUpper56Bits(x,3) => x
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules b/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules
index 8c43b960b9..7945a5454d 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules
@@ -29,7 +29,7 @@
 (MOVBUreg x:((Equal|NotEqual|LessThan|LessThanU|LessThanF|LessEqual|LessEqualU|LessEqualF|GreaterThan|GreaterThanU|GreaterThanF|GreaterEqual|GreaterEqualU|GreaterEqualF) _)) => x
 
 // omit unsigned extension
-(MOVWUreg x) && zeroUpper32Bits(x, 3) => x
+(MOVWUreg x) && ZeroUpper32Bits(x, 3) => x
 
 // don't extend after proper load
 (MOVBreg x:(MOVBload _ _)) => (MOVDreg x)
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index b4e1a7fd33..4b13d65618 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -1351,7 +1351,7 @@ func overlap(offset1, size1, offset2, size2 int64) bool {
 // check if value zeroes out upper 32-bit of 64-bit register.
 // depth limits recursion depth. In AMD64.rules 3 is used as limit,
 // because it catches same amount of cases as 4.
-func zeroUpper32Bits(x *Value, depth int) bool {
+func ZeroUpper32Bits(x *Value, depth int) bool {
 	if x.Type.IsSigned() && x.Type.Size() < 8 {
 		// If the value is signed, it might get re-sign-extended
 		// during spill and restore. See issue 68227.
@@ -1368,6 +1368,8 @@ func zeroUpper32Bits(x *Value, depth int) bool {
 		OpAMD64SHRL, OpAMD64SHRLconst, OpAMD64SARL, OpAMD64SARLconst,
 		OpAMD64SHLL, OpAMD64SHLLconst:
 		return true
+	case OpAMD64MOVQconst:
+		return uint64(uint32(x.AuxInt)) == uint64(x.AuxInt)
 	case OpARM64REV16W, OpARM64REVW, OpARM64RBITW, OpARM64CLZW,
 		OpARM64EXTRWconst, OpARM64MULW, OpARM64MNEGW, OpARM64UDIVW, OpARM64DIVW,
 		OpARM64UMODW, OpARM64MADDW, OpARM64MSUBW, OpARM64RORW, OpARM64RORWconst:
@@ -1383,7 +1385,7 @@ func zeroUpper32Bits(x *Value, depth int) bool {
 			return false
 		}
 		for i := range x.Args {
-			if !zeroUpper32Bits(x.Args[i], depth-1) {
+			if !ZeroUpper32Bits(x.Args[i], depth-1) {
 				return false
 			}
 		}
@@ -1393,14 +1395,16 @@ func zeroUpper32Bits(x *Value, depth int) bool {
 	return false
 }
 
-// zeroUpper48Bits is similar to zeroUpper32Bits, but for upper 48 bits.
-func zeroUpper48Bits(x *Value, depth int) bool {
+// ZeroUpper48Bits is similar to ZeroUpper32Bits, but for upper 48 bits.
+func ZeroUpper48Bits(x *Value, depth int) bool {
 	if x.Type.IsSigned() && x.Type.Size() < 8 {
 		return false
 	}
 	switch x.Op {
 	case OpAMD64MOVWQZX, OpAMD64MOVWload, OpAMD64MOVWloadidx1, OpAMD64MOVWloadidx2:
 		return true
+	case OpAMD64MOVQconst, OpAMD64MOVLconst:
+		return uint64(uint16(x.AuxInt)) == uint64(x.AuxInt)
 	case OpArg: // note: but not ArgIntReg
 		return x.Type.Size() == 2 && x.Block.Func.Config.arch == "amd64"
 	case OpPhi, OpSelect0, OpSelect1:
@@ -1410,7 +1414,7 @@ func zeroUpper48Bits(x *Value, depth int) bool {
 			return false
 		}
 		for i := range x.Args {
-			if !zeroUpper48Bits(x.Args[i], depth-1) {
+			if !ZeroUpper48Bits(x.Args[i], depth-1) {
 				return false
 			}
 		}
@@ -1420,14 +1424,16 @@ func zeroUpper48Bits(x *Value, depth int) bool {
 	return false
 }
 
-// zeroUpper56Bits is similar to zeroUpper32Bits, but for upper 56 bits.
-func zeroUpper56Bits(x *Value, depth int) bool {
+// ZeroUpper56Bits is similar to ZeroUpper32Bits, but for upper 56 bits.
+func ZeroUpper56Bits(x *Value, depth int) bool {
 	if x.Type.IsSigned() && x.Type.Size() < 8 {
 		return false
 	}
 	switch x.Op {
 	case OpAMD64MOVBQZX, OpAMD64MOVBload, OpAMD64MOVBloadidx1:
 		return true
+	case OpAMD64MOVQconst, OpAMD64MOVLconst:
+		return uint64(uint8(x.AuxInt)) == uint64(x.AuxInt)
 	case OpArg: // note: but not ArgIntReg
 		return x.Type.Size() == 1 && x.Block.Func.Config.arch == "amd64"
 	case OpPhi, OpSelect0, OpSelect1:
@@ -1437,7 +1443,7 @@ func zeroUpper56Bits(x *Value, depth int) bool {
 			return false
 		}
 		for i := range x.Args {
-			if !zeroUpper56Bits(x.Args[i], depth-1) {
+			if !ZeroUpper56Bits(x.Args[i], depth-1) {
 				return false
 			}
 		}
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64latelower.go b/src/cmd/compile/internal/ssa/rewriteAMD64latelower.go
index 11ecb0b285..531fbe1dd0 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64latelower.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64latelower.go
@@ -30,11 +30,11 @@ func rewriteValueAMD64latelower(v *Value) bool {
 func rewriteValueAMD64latelower_OpAMD64MOVBQZX(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (MOVBQZX x)
-	// cond: zeroUpper56Bits(x,3)
+	// cond: ZeroUpper56Bits(x,3)
 	// result: x
 	for {
 		x := v_0
-		if !(zeroUpper56Bits(x, 3)) {
+		if !(ZeroUpper56Bits(x, 3)) {
 			break
 		}
 		v.copyOf(x)
@@ -45,11 +45,11 @@ func rewriteValueAMD64latelower_OpAMD64MOVBQZX(v *Value) bool {
 func rewriteValueAMD64latelower_OpAMD64MOVLQZX(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (MOVLQZX x)
-	// cond: zeroUpper32Bits(x,3)
+	// cond: ZeroUpper32Bits(x,3)
 	// result: x
 	for {
 		x := v_0
-		if !(zeroUpper32Bits(x, 3)) {
+		if !(ZeroUpper32Bits(x, 3)) {
 			break
 		}
 		v.copyOf(x)
@@ -60,11 +60,11 @@ func rewriteValueAMD64latelower_OpAMD64MOVLQZX(v *Value) bool {
 func rewriteValueAMD64latelower_OpAMD64MOVWQZX(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (MOVWQZX x)
-	// cond: zeroUpper48Bits(x,3)
+	// cond: ZeroUpper48Bits(x,3)
 	// result: x
 	for {
 		x := v_0
-		if !(zeroUpper48Bits(x, 3)) {
+		if !(ZeroUpper48Bits(x, 3)) {
 			break
 		}
 		v.copyOf(x)
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64latelower.go b/src/cmd/compile/internal/ssa/rewriteARM64latelower.go
index 0fa5e26e93..43ddb34b30 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64latelower.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64latelower.go
@@ -653,11 +653,11 @@ func rewriteValueARM64latelower_OpARM64MOVHreg(v *Value) bool {
 func rewriteValueARM64latelower_OpARM64MOVWUreg(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (MOVWUreg x)
-	// cond: zeroUpper32Bits(x, 3)
+	// cond: ZeroUpper32Bits(x, 3)
 	// result: x
 	for {
 		x := v_0
-		if !(zeroUpper32Bits(x, 3)) {
+		if !(ZeroUpper32Bits(x, 3)) {
 			break
 		}
 		v.copyOf(x)
diff --git a/test/codegen/constants.go b/test/codegen/constants.go
index 178a106552..9b014b54b1 100644
--- a/test/codegen/constants.go
+++ b/test/codegen/constants.go
@@ -33,3 +33,12 @@ func contiguousMaskConstants() (out [64]uint64) {
 	out[3] = 0xFFFFFFFE00000001
 	return
 }
+
+func issue76449_1() (_, _, _ uint64) {
+	// amd64:-"MOVQ"
+	return 0, 0, 0
+}
+func issue76449_2() (_, _, _ uint64) {
+	// amd64:-"MOVQ"
+	return 1, 2, 1
+}
-- 
2.52.0
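
For context, a minimal, self-contained Go sketch of the pattern the new OpCopy path targets. The function name, the register choice, and whether a particular build really leaves a register-to-register copy here are illustrative assumptions; what is fixed by the x86-64 encoding is that a 32-bit register-to-register MOV omits the REX.W prefix required by the 64-bit form, while still zeroing the upper 32 bits of the destination.

package main

// copyZext returns, twice, a value whose upper 32 bits are known to be zero
// (it is a zero-extended uint32). If the compiler satisfies the second result
// with a register-to-register copy, that copy may now be emitted as, e.g.:
//
//	MOVL AX, BX   // 89 C3     (2 bytes)
//
// instead of:
//
//	MOVQ AX, BX   // 48 89 C3  (3 bytes, REX.W prefix)
//
// The registers here are illustrative; only the missing REX byte matters.
func copyZext(x uint32) (uint64, uint64) {
	y := uint64(x) // zero-extension: the upper 32 bits of y are zero
	return y, y
}

func main() {
	a, b := copyZext(7)
	println(a, b)
}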