From 4c311aa38f6e354ec4d9f5882a16c36a2e4b0f36 Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Thu, 21 Aug 2025 14:37:18 -0400 Subject: [PATCH] [dev.simd] cmd/compile: ensure the whole X15 register is zeroed On AMD64, we reserve the X15 register as the zero register. Currently we use an SSE instruction to zero it, and we only use it in SSE contexts. When the machine supports AVX, the high bits of the register are not necessarily zeroed. Now that the compiler generates AVX code for SIMD, it would be great to have a zero register in the AVX context. This CL zeroes the whole X15 register if AVX is supported. Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7 Reviewed-on: https://go-review.googlesource.com/c/go/+/698237 LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao Reviewed-by: David Chase --- src/cmd/compile/internal/amd64/ssa.go | 33 +++++++++++++++++-- src/cmd/compile/internal/ir/symtab.go | 1 + src/cmd/compile/internal/ssagen/ssa.go | 4 +-- .../internal/typecheck/_builtin/runtime.go | 3 +- src/cmd/compile/internal/typecheck/builtin.go | 3 +- src/runtime/asm_amd64.s | 6 ++++ src/runtime/cpuflags.go | 3 +- src/runtime/proc.go | 3 +- src/runtime/race_amd64.s | 3 ++ src/runtime/sys_darwin_amd64.s | 3 ++ src/runtime/sys_dragonfly_amd64.s | 3 ++ src/runtime/sys_freebsd_amd64.s | 6 ++++ src/runtime/sys_linux_amd64.s | 6 ++++ src/runtime/sys_netbsd_amd64.s | 3 ++ src/runtime/sys_openbsd_amd64.s | 3 ++ src/runtime/sys_windows_amd64.s | 3 ++ 16 files changed, 78 insertions(+), 8 deletions(-) diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 3ae3c61764..f511e75e97 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -18,6 +18,7 @@ import ( "cmd/internal/obj" "cmd/internal/obj/x86" "internal/abi" + "internal/buildcfg" ) // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags. 
@@ -1290,7 +1291,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail: if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal { // zeroing X15 when entering ABIInternal from ABI0 - opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + zeroX15(s) // set G register from TLS getgFromTLS(s, x86.REG_R14) } @@ -1301,7 +1302,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { s.Call(v) if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 { // zeroing X15 when entering ABIInternal from ABI0 - opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + zeroX15(s) // set G register from TLS getgFromTLS(s, x86.REG_R14) } @@ -1829,6 +1830,34 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { } } +// zeroX15 zeroes the X15 register. +func zeroX15(s *ssagen.State) { + vxorps := func(s *ssagen.State) { + p := s.Prog(x86.AVXORPS) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_X15 + p.AddRestSourceReg(x86.REG_X15) + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_X15 + } + if buildcfg.GOAMD64 >= 3 { + vxorps(s) + return + } + // AVX may not be available, check before zeroing the high bits. 
+ p := s.Prog(x86.ACMPB) + p.From.Type = obj.TYPE_MEM + p.From.Name = obj.NAME_EXTERN + p.From.Sym = ir.Syms.X86HasAVX + p.To.Type = obj.TYPE_CONST + p.To.Offset = 1 + jmp := s.Prog(x86.AJNE) + jmp.To.Type = obj.TYPE_BRANCH + vxorps(s) + sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + jmp.To.SetTarget(sse) +} + // Example instruction: VRSQRTPS X1, X1 func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog { p := s.Prog(v.Op.Asm()) diff --git a/src/cmd/compile/internal/ir/symtab.go b/src/cmd/compile/internal/ir/symtab.go index ee0f52fbf3..2222a5444a 100644 --- a/src/cmd/compile/internal/ir/symtab.go +++ b/src/cmd/compile/internal/ir/symtab.go @@ -68,6 +68,7 @@ type symsStruct struct { Loong64HasLAM_BH *obj.LSym Loong64HasLSX *obj.LSym RISCV64HasZbb *obj.LSym + X86HasAVX *obj.LSym X86HasFMA *obj.LSym X86HasPOPCNT *obj.LSym X86HasSSE41 *obj.LSym diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index abb6370a15..57129817f6 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -150,9 +150,10 @@ func InitConfig() { ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert") ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero") ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove") + ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool + ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool - ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool @@ -7714,4 +7715,3 @@ func isStructNotSIMD(t *types.Type) bool { } var BoundsCheckFunc 
[ssa.BoundsKindCount]*obj.LSym - diff --git a/src/cmd/compile/internal/typecheck/_builtin/runtime.go b/src/cmd/compile/internal/typecheck/_builtin/runtime.go index 296bfdc281..1e4d0b7db6 100644 --- a/src/cmd/compile/internal/typecheck/_builtin/runtime.go +++ b/src/cmd/compile/internal/typecheck/_builtin/runtime.go @@ -284,9 +284,10 @@ func libfuzzerHookEqualFold(string, string, uint) func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32 // architecture variants +var x86HasAVX bool +var x86HasFMA bool var x86HasPOPCNT bool var x86HasSSE41 bool -var x86HasFMA bool var armHasVFPv4 bool var arm64HasATOMICS bool var loong64HasLAMCAS bool diff --git a/src/cmd/compile/internal/typecheck/builtin.go b/src/cmd/compile/internal/typecheck/builtin.go index 535f0fb7e8..6b8c6d7bad 100644 --- a/src/cmd/compile/internal/typecheck/builtin.go +++ b/src/cmd/compile/internal/typecheck/builtin.go @@ -232,9 +232,10 @@ var runtimeDecls = [...]struct { {"libfuzzerHookStrCmp", funcTag, 155}, {"libfuzzerHookEqualFold", funcTag, 155}, {"addCovMeta", funcTag, 157}, + {"x86HasAVX", varTag, 6}, + {"x86HasFMA", varTag, 6}, {"x86HasPOPCNT", varTag, 6}, {"x86HasSSE41", varTag, 6}, - {"x86HasFMA", varTag, 6}, {"armHasVFPv4", varTag, 6}, {"arm64HasATOMICS", varTag, 6}, {"loong64HasLAMCAS", varTag, 6}, diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index cf1d49a4ad..f8ebd030b6 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1015,6 +1015,9 @@ needm: // there's no need to handle that. Clear R14 so that there's // a bad value in there, in case needm tries to use it. 
XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 XORQ R14, R14 MOVQ $runtime·needAndBindM(SB), AX CALL AX @@ -1712,6 +1715,9 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0 get_tls(R14) MOVQ g(R14), R14 XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 JMP ·sigpanic(SB) // gcWriteBarrier informs the GC about heap pointer writes. diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go index 6452364b68..67ed081ef6 100644 --- a/src/runtime/cpuflags.go +++ b/src/runtime/cpuflags.go @@ -28,9 +28,10 @@ const ( var ( // Set in runtime.cpuinit. // TODO: deprecate these; use internal/cpu directly. + x86HasAVX bool + x86HasFMA bool x86HasPOPCNT bool x86HasSSE41 bool - x86HasFMA bool armHasVFPv4 bool diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 68647d771f..1d597d59c2 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -766,9 +766,10 @@ func cpuinit(env string) { // to guard execution of instructions that can not be assumed to be always supported. switch GOARCH { case "386", "amd64": + x86HasAVX = cpu.X86.HasAVX + x86HasFMA = cpu.X86.HasFMA x86HasPOPCNT = cpu.X86.HasPOPCNT x86HasSSE41 = cpu.X86.HasSSE41 - x86HasFMA = cpu.X86.HasFMA case "arm": armHasVFPv4 = cpu.ARM.HasVFPv4 diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s index e19118bd54..23f2e59e3d 100644 --- a/src/runtime/race_amd64.s +++ b/src/runtime/race_amd64.s @@ -456,6 +456,9 @@ call: // Back to Go world, set special registers. // The g register (R14) is preserved in C. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 RET // C->Go callback thunk that allows to call runtime·racesymbolize from C code. 
diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index cc4e52d305..0091546f20 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -177,6 +177,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s index a223c2cf76..84bf326aad 100644 --- a/src/runtime/sys_dragonfly_amd64.s +++ b/src/runtime/sys_dragonfly_amd64.s @@ -228,6 +228,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s index 977ea093d2..a1fa3a6fa2 100644 --- a/src/runtime/sys_freebsd_amd64.s +++ b/src/runtime/sys_freebsd_amd64.s @@ -265,6 +265,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -290,6 +293,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. 
NOP SP // disable vet stack checking diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 941f70b0e8..02505c2fb0 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -340,6 +340,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -365,6 +368,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s index 2f1ddcdc89..edc7f3d6ee 100644 --- a/src/runtime/sys_netbsd_amd64.s +++ b/src/runtime/sys_netbsd_amd64.s @@ -310,6 +310,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s index ff0bc2416a..734dfe6478 100644 --- a/src/runtime/sys_openbsd_amd64.s +++ b/src/runtime/sys_openbsd_amd64.s @@ -64,6 +64,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. 
NOP SP // disable vet stack checking diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index e438599910..b0b4d3cce6 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -32,6 +32,9 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0 // R14 is cleared in case there's a non-zero value in there // if called from a non-go thread. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 XORQ R14, R14 get_tls(AX) -- 2.52.0