On AMD64, we reserve the X15 register as the zero register.
Currently we zero it with an SSE instruction and use it only in
SSE contexts. When the machine supports AVX, the high bits of
the register are not necessarily zero.

Now that the compiler generates AVX code for SIMD, it is useful
to have a zero register in AVX contexts as well. This CL zeroes
the whole X15 register when AVX is supported.
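
For reference, the check-and-zero pattern used at the hand-written
assembly sites looks like this (sketched from the hunks below, with
comments added here for illustration):

    XORPS   X15, X15                  // clear the low 128 bits (SSE)
    CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
    JNE     2(PC)                     // skip VXORPS if AVX is unavailable
    VXORPS  X15, X15, X15             // clear the whole register (AVX)

The compiler-generated version (zeroX15 below) emits the same check
against the runtime's x86HasAVX variable, or emits VXORPS
unconditionally when GOAMD64 is v3 or higher.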
Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7
Reviewed-on: https://go-review.googlesource.com/c/go/+/698237
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: David Chase <drchase@google.com>
"cmd/internal/obj"
"cmd/internal/obj/x86"
"internal/abi"
+ "internal/buildcfg"
)
// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
// zeroing X15 when entering ABIInternal from ABI0
- opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+ zeroX15(s)
// set G register from TLS
getgFromTLS(s, x86.REG_R14)
}
s.Call(v)
if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
// zeroing X15 when returning from ABI0 to ABIInternal
- opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+ zeroX15(s)
// set G register from TLS
getgFromTLS(s, x86.REG_R14)
}
}
}
+// zeroX15 zeroes the X15 register. If AVX is available, the whole
+// register is cleared with VXORPS; otherwise only the low 128 bits
+// are cleared with XORPS.
+func zeroX15(s *ssagen.State) {
+ vxorps := func(s *ssagen.State) {
+ p := s.Prog(x86.AVXORPS)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = x86.REG_X15
+ p.AddRestSourceReg(x86.REG_X15)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = x86.REG_X15
+ }
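+ // GOAMD64 v3 and above (x86-64-v3) guarantee AVX, so no runtime check is needed.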
+ if buildcfg.GOAMD64 >= 3 {
+ vxorps(s)
+ return
+ }
+ // AVX may not be available; check at run time before zeroing the high bits.
+ p := s.Prog(x86.ACMPB)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Name = obj.NAME_EXTERN
+ p.From.Sym = ir.Syms.X86HasAVX
+ p.To.Type = obj.TYPE_CONST
+ p.To.Offset = 1
+ jmp := s.Prog(x86.AJNE)
+ jmp.To.Type = obj.TYPE_BRANCH
+ vxorps(s)
+ sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+ jmp.To.SetTarget(sse)
+}
+
// Example instruction: VRSQRTPS X1, X1
func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
Loong64HasLAM_BH *obj.LSym
Loong64HasLSX *obj.LSym
RISCV64HasZbb *obj.LSym
+ X86HasAVX *obj.LSym
X86HasFMA *obj.LSym
X86HasPOPCNT *obj.LSym
X86HasSSE41 *obj.LSym
ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert")
ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero")
ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove")
+ ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool
+ ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool
ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool
- ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool
ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool
ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
}
var BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym
-
func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32
// architecture variants
+var x86HasAVX bool
+var x86HasFMA bool
var x86HasPOPCNT bool
var x86HasSSE41 bool
-var x86HasFMA bool
var armHasVFPv4 bool
var arm64HasATOMICS bool
var loong64HasLAMCAS bool
{"libfuzzerHookStrCmp", funcTag, 155},
{"libfuzzerHookEqualFold", funcTag, 155},
{"addCovMeta", funcTag, 157},
+ {"x86HasAVX", varTag, 6},
+ {"x86HasFMA", varTag, 6},
{"x86HasPOPCNT", varTag, 6},
{"x86HasSSE41", varTag, 6},
- {"x86HasFMA", varTag, 6},
{"armHasVFPv4", varTag, 6},
{"arm64HasATOMICS", varTag, 6},
{"loong64HasLAMCAS", varTag, 6},
// there's no need to handle that. Clear R14 so that there's
// a bad value in there, in case needm tries to use it.
XORPS X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC) // skip VXORPS if AVX is unavailable
+ VXORPS X15, X15, X15 // also clear the high bits of X15
XORQ R14, R14
MOVQ $runtime·needAndBindM<ABIInternal>(SB), AX
CALL AX
get_tls(R14)
MOVQ g(R14), R14
XORPS X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
JMP ·sigpanic<ABIInternal>(SB)
// gcWriteBarrier informs the GC about heap pointer writes.
var (
// Set in runtime.cpuinit.
// TODO: deprecate these; use internal/cpu directly.
+ x86HasAVX bool
+ x86HasFMA bool
x86HasPOPCNT bool
x86HasSSE41 bool
- x86HasFMA bool
armHasVFPv4 bool
// to guard execution of instructions that can not be assumed to be always supported.
switch GOARCH {
case "386", "amd64":
+ x86HasAVX = cpu.X86.HasAVX
+ x86HasFMA = cpu.X86.HasFMA
x86HasPOPCNT = cpu.X86.HasPOPCNT
x86HasSSE41 = cpu.X86.HasSSE41
- x86HasFMA = cpu.X86.HasFMA
case "arm":
armHasVFPv4 = cpu.ARM.HasVFPv4
// Back to Go world, set special registers.
// The g register (R14) is preserved in C.
XORPS X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
RET
// C->Go callback thunk that allows calling runtime·racesymbolize from C code.
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
// R14 is cleared in case there's a non-zero value in there
// if called from a non-go thread.
XORPS X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
XORQ R14, R14
get_tls(AX)