Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] cmd/compile: ensure the whole X15 register is zeroed
authorCherry Mui <cherryyz@google.com>
Thu, 21 Aug 2025 18:37:18 +0000 (14:37 -0400)
committerCherry Mui <cherryyz@google.com>
Fri, 22 Aug 2025 21:57:00 +0000 (14:57 -0700)
On AMD64, we reserve the X15 register as the zero register.
Currently we use an SSE instruction to zero it, and we only use
it in SSE contexts. When the machine supports AVX, the high bits
of the register are not necessarily zeroed.

Now that the compiler generates AVX code for SIMD, it would be
great to have a zero register in the AVX context. This CL zeroes
the whole X15 register if AVX is supported.

Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7
Reviewed-on: https://go-review.googlesource.com/c/go/+/698237
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: David Chase <drchase@google.com>
16 files changed:
src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/ir/symtab.go
src/cmd/compile/internal/ssagen/ssa.go
src/cmd/compile/internal/typecheck/_builtin/runtime.go
src/cmd/compile/internal/typecheck/builtin.go
src/runtime/asm_amd64.s
src/runtime/cpuflags.go
src/runtime/proc.go
src/runtime/race_amd64.s
src/runtime/sys_darwin_amd64.s
src/runtime/sys_dragonfly_amd64.s
src/runtime/sys_freebsd_amd64.s
src/runtime/sys_linux_amd64.s
src/runtime/sys_netbsd_amd64.s
src/runtime/sys_openbsd_amd64.s
src/runtime/sys_windows_amd64.s

index 3ae3c6176460fd566edfa6b060f2a7057ceb8e02..f511e75e972715716dde9293307005467e39ac1a 100644 (file)
@@ -18,6 +18,7 @@ import (
        "cmd/internal/obj"
        "cmd/internal/obj/x86"
        "internal/abi"
+       "internal/buildcfg"
 )
 
 // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
@@ -1290,7 +1291,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
        case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
                if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
                        // zeroing X15 when entering ABIInternal from ABI0
-                       opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+                       zeroX15(s)
                        // set G register from TLS
                        getgFromTLS(s, x86.REG_R14)
                }
@@ -1301,7 +1302,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                s.Call(v)
                if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
                        // zeroing X15 when entering ABIInternal from ABI0
-                       opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+                       zeroX15(s)
                        // set G register from TLS
                        getgFromTLS(s, x86.REG_R14)
                }
@@ -1829,6 +1830,34 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
        }
 }
 
+// zeroX15 zeroes the X15 register.
+func zeroX15(s *ssagen.State) {
+       vxorps := func(s *ssagen.State) {
+               p := s.Prog(x86.AVXORPS)
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = x86.REG_X15
+               p.AddRestSourceReg(x86.REG_X15)
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = x86.REG_X15
+       }
+       if buildcfg.GOAMD64 >= 3 {
+               vxorps(s)
+               return
+       }
+       // AVX may not be available, check before zeroing the high bits.
+       p := s.Prog(x86.ACMPB)
+       p.From.Type = obj.TYPE_MEM
+       p.From.Name = obj.NAME_EXTERN
+       p.From.Sym = ir.Syms.X86HasAVX
+       p.To.Type = obj.TYPE_CONST
+       p.To.Offset = 1
+       jmp := s.Prog(x86.AJNE)
+       jmp.To.Type = obj.TYPE_BRANCH
+       vxorps(s)
+       sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+       jmp.To.SetTarget(sse)
+}
+
 // Example instruction: VRSQRTPS X1, X1
 func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
        p := s.Prog(v.Op.Asm())
index ee0f52fbf3f3b8578df4da8610e3f748f70f7a99..2222a5444aa12f314e16b15dbeea1c39250b3221 100644 (file)
@@ -68,6 +68,7 @@ type symsStruct struct {
        Loong64HasLAM_BH *obj.LSym
        Loong64HasLSX    *obj.LSym
        RISCV64HasZbb    *obj.LSym
+       X86HasAVX        *obj.LSym
        X86HasFMA        *obj.LSym
        X86HasPOPCNT     *obj.LSym
        X86HasSSE41      *obj.LSym
index abb6370a15f743f7df0c51fb0643cfb135375931..57129817f6c878a5cd2c49343d9aca67177061fb 100644 (file)
@@ -150,9 +150,10 @@ func InitConfig() {
        ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert")
        ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero")
        ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove")
+       ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX")               // bool
+       ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA")               // bool
        ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT")         // bool
        ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41")           // bool
-       ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA")               // bool
        ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4")           // bool
        ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS")   // bool
        ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
@@ -7714,4 +7715,3 @@ func isStructNotSIMD(t *types.Type) bool {
 }
 
 var BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym
-
index 296bfdc281d4d5d83c9a81bc9e49fd7ee31b717b..1e4d0b7db6ec50aac21e4bb85bca6beef6e1a0ad 100644 (file)
@@ -284,9 +284,10 @@ func libfuzzerHookEqualFold(string, string, uint)
 func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32
 
 // architecture variants
+var x86HasAVX bool
+var x86HasFMA bool
 var x86HasPOPCNT bool
 var x86HasSSE41 bool
-var x86HasFMA bool
 var armHasVFPv4 bool
 var arm64HasATOMICS bool
 var loong64HasLAMCAS bool
index 535f0fb7e88851e764f53f81c62d7ea52b16d3eb..6b8c6d7bad5d0fd5b8f373eb4d950f02266a212e 100644 (file)
@@ -232,9 +232,10 @@ var runtimeDecls = [...]struct {
        {"libfuzzerHookStrCmp", funcTag, 155},
        {"libfuzzerHookEqualFold", funcTag, 155},
        {"addCovMeta", funcTag, 157},
+       {"x86HasAVX", varTag, 6},
+       {"x86HasFMA", varTag, 6},
        {"x86HasPOPCNT", varTag, 6},
        {"x86HasSSE41", varTag, 6},
-       {"x86HasFMA", varTag, 6},
        {"armHasVFPv4", varTag, 6},
        {"arm64HasATOMICS", varTag, 6},
        {"loong64HasLAMCAS", varTag, 6},
index cf1d49a4ad82d4b4c1e491c8d7fd663e3b35f5a3..f8ebd030b614a9855bc03097744f8ac4897db7dd 100644 (file)
@@ -1015,6 +1015,9 @@ needm:
        // there's no need to handle that. Clear R14 so that there's
        // a bad value in there, in case needm tries to use it.
        XORPS   X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
        XORQ    R14, R14
        MOVQ    $runtime·needAndBindM<ABIInternal>(SB), AX
        CALL    AX
@@ -1712,6 +1715,9 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
        get_tls(R14)
        MOVQ    g(R14), R14
        XORPS   X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
        JMP     ·sigpanic<ABIInternal>(SB)
 
 // gcWriteBarrier informs the GC about heap pointer writes.
index 6452364b68ec3281dbed47c280865f2cbc9c40bb..67ed081ef6d180adf39f82927a5fd94c95e70fa6 100644 (file)
@@ -28,9 +28,10 @@ const (
 var (
        // Set in runtime.cpuinit.
        // TODO: deprecate these; use internal/cpu directly.
+       x86HasAVX    bool
+       x86HasFMA    bool
        x86HasPOPCNT bool
        x86HasSSE41  bool
-       x86HasFMA    bool
 
        armHasVFPv4 bool
 
index 68647d771fe9528f87261c50ff0686f2a12900e0..1d597d59c2f0ecee45aad42827627a2c529aca2e 100644 (file)
@@ -766,9 +766,10 @@ func cpuinit(env string) {
        // to guard execution of instructions that can not be assumed to be always supported.
        switch GOARCH {
        case "386", "amd64":
+               x86HasAVX = cpu.X86.HasAVX
+               x86HasFMA = cpu.X86.HasFMA
                x86HasPOPCNT = cpu.X86.HasPOPCNT
                x86HasSSE41 = cpu.X86.HasSSE41
-               x86HasFMA = cpu.X86.HasFMA
 
        case "arm":
                armHasVFPv4 = cpu.ARM.HasVFPv4
index e19118bd54e6eebf906f548605a975b16b9cafdd..23f2e59e3d4f8b1a31f6cb136e749d5f5e1d0581 100644 (file)
@@ -456,6 +456,9 @@ call:
        // Back to Go world, set special registers.
        // The g register (R14) is preserved in C.
        XORPS   X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
        RET
 
 // C->Go callback thunk that allows to call runtime·racesymbolize from C code.
index cc4e52d305ab5908e57172a6ecd7143b202c7acd..0091546f20452c4672a53fd284de142a3491f4bd 100644 (file)
@@ -177,6 +177,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index a223c2cf76bd1def59c8282c179a58b51c679963..84bf326aad32cbe07a108649755971a0fa2bf17f 100644 (file)
@@ -228,6 +228,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index 977ea093d247ac367b3239ba3f2af492359bae1a..a1fa3a6fa29c96d0b3f1bdb351a24ada0674143f 100644 (file)
@@ -265,6 +265,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
@@ -290,6 +293,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index 941f70b0e8e652659206397743154203372cb248..02505c2fb0aac29cab934db344f79b6ada38358f 100644 (file)
@@ -340,6 +340,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
@@ -365,6 +368,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index 2f1ddcdc89755cd00f144d0b6dcd1d22a723ea8b..edc7f3d6ee062a906f3d4308b8ab02a03c8b1ca8 100644 (file)
@@ -310,6 +310,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index ff0bc2416aa4bcc225c72573f32bee1c62e87a98..734dfe6478e852a901271d8a6fc4a5b7b4e5c7dd 100644 (file)
@@ -64,6 +64,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index e438599910f4ee06de9714f4b41d6581ffcacd80..b0b4d3cce65688a805de8e100365a480fd752780 100644 (file)
@@ -32,6 +32,9 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0
        // R14 is cleared in case there's a non-zero value in there
        // if called from a non-go thread.
        XORPS   X15, X15
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+       JNE     2(PC)
+       VXORPS  X15, X15, X15
        XORQ    R14, R14
 
        get_tls(AX)