Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] cmd/compile: zero only low 128-bit of X15
author: Cherry Mui <cherryyz@google.com>
Mon, 8 Dec 2025 17:14:24 +0000 (12:14 -0500)
committer: Cherry Mui <cherryyz@google.com>
Mon, 8 Dec 2025 22:10:09 +0000 (14:10 -0800)
Zeroing the upper part of X15 may make the CPU think it is
"dirty" and slow down SSE operations. For now, just don't zero
the upper part, and construct a zero value on the fly if we need
a 256- or 512-bit zero value. Maybe VZEROUPPER works better than
explicitly zeroing X15, but we need to evaluate that.

Long term, we probably want to move more things from SSE to AVX.

This essentially undoes CL 698237 and CL 698238, except that X15
is still used for 128-bit zeroing for SIMD.

Change-Id: I1564e6332c4c57f9721397c92c7c734c5497534c
Reviewed-on: https://go-review.googlesource.com/c/go/+/728240
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
12 files changed:
src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/runtime/asm_amd64.s
src/runtime/race_amd64.s
src/runtime/sys_darwin_amd64.s
src/runtime/sys_dragonfly_amd64.s
src/runtime/sys_freebsd_amd64.s
src/runtime/sys_linux_amd64.s
src/runtime/sys_netbsd_amd64.s
src/runtime/sys_openbsd_amd64.s
src/runtime/sys_windows_amd64.s

index 9a0fa27470a44a30af4bd52281e7a677c5d38c8c..5ddcb84c59560aba722e12bf736e9a7f59f5dbd0 100644 (file)
@@ -18,7 +18,6 @@ import (
        "cmd/internal/obj"
        "cmd/internal/obj/x86"
        "internal/abi"
-       "internal/buildcfg"
 )
 
 // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
@@ -1718,7 +1717,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
        case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL:
                s.Prog(v.Op.Asm())
 
-       case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: // no code emitted
+       case ssa.OpAMD64Zero128: // no code emitted
+
+       case ssa.OpAMD64Zero256, ssa.OpAMD64Zero512:
+               p := s.Prog(v.Op.Asm())
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = simdReg(v)
+               p.AddRestSourceReg(simdReg(v))
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = simdReg(v)
 
        case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v:
                // These are for initializing the least 32/64 bits of a SIMD register from a "float".
@@ -1871,34 +1878,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 
 // zeroX15 zeroes the X15 register.
 func zeroX15(s *ssagen.State) {
-       if !buildcfg.Experiment.SIMD {
-               opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
-               return
-       }
-       vxorps := func(s *ssagen.State) {
-               p := s.Prog(x86.AVXORPS)
-               p.From.Type = obj.TYPE_REG
-               p.From.Reg = x86.REG_X15
-               p.AddRestSourceReg(x86.REG_X15)
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = x86.REG_X15
-       }
-       if buildcfg.GOAMD64 >= 3 {
-               vxorps(s)
-               return
-       }
-       // AVX may not be available, check before zeroing the high bits.
-       p := s.Prog(x86.ACMPB)
-       p.From.Type = obj.TYPE_MEM
-       p.From.Name = obj.NAME_EXTERN
-       p.From.Sym = ir.Syms.X86HasAVX
-       p.To.Type = obj.TYPE_CONST
-       p.To.Offset = 1
-       jmp := s.Prog(x86.AJNE)
-       jmp.To.Type = obj.TYPE_BRANCH
-       vxorps(s)
-       sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
-       jmp.To.SetTarget(sse)
+       opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
 }
 
 // Example instruction: VRSQRTPS X1, X1
index e77f55ab5e58aa63210f66c5579d3cef5cb90f35..2fb4fdfc966291d91e2fda7477ae7ea4838dfdc2 100644 (file)
@@ -214,6 +214,7 @@ func init() {
                vloadk  = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly}
                vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}
 
+               v01     = regInfo{inputs: nil, outputs: vonly}
                v11     = regInfo{inputs: vonly, outputs: vonly}            // used in resultInArg0 ops, arg0 must not be x15
                v21     = regInfo{inputs: []regMask{v, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
                vk      = regInfo{inputs: vzonly, outputs: maskonly}
@@ -232,6 +233,7 @@ func init() {
                gpv     = regInfo{inputs: []regMask{gp}, outputs: vonly}
                v2flags = regInfo{inputs: []regMask{vz, vz}}
 
+               w01   = regInfo{inputs: nil, outputs: wonly}
                w11   = regInfo{inputs: wonly, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
                w21   = regInfo{inputs: []regMask{wz, wz}, outputs: wonly}
                wk    = regInfo{inputs: wzonly, outputs: maskonly}
@@ -1398,12 +1400,15 @@ func init() {
                {name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"},
                {name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"},
 
+               // X15 is the zero register up to 128-bit. For larger values, we zero it on the fly.
                {name: "Zero128", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
-               {name: "Zero256", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
-               {name: "Zero512", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
+               {name: "Zero256", argLength: 0, reg: v01, asm: "VPXOR"},
+               {name: "Zero512", argLength: 0, reg: w01, asm: "VPXORQ"},
 
+               // Move a 32/64 bit float to a 128-bit SIMD register.
                {name: "VMOVSDf2v", argLength: 1, reg: fpv, asm: "VMOVSD"},
                {name: "VMOVSSf2v", argLength: 1, reg: fpv, asm: "VMOVSS"},
+
                {name: "VMOVQ", argLength: 1, reg: gpv, asm: "VMOVQ"},
                {name: "VMOVD", argLength: 1, reg: gpv, asm: "VMOVD"},
 
index 83e7959218f673c1711738329847826e6ebc4453..00d581ec9af41dd8fb55db8d1de892c90231b25d 100644 (file)
@@ -20365,24 +20365,22 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:      "Zero256",
-               argLen:    0,
-               zeroWidth: true,
-               fixedReg:  true,
+               name:   "Zero256",
+               argLen: 0,
+               asm:    x86.AVPXOR,
                reg: regInfo{
                        outputs: []outputInfo{
-                               {0, 2147483648}, // X15
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
                        },
                },
        },
        {
-               name:      "Zero512",
-               argLen:    0,
-               zeroWidth: true,
-               fixedReg:  true,
+               name:   "Zero512",
+               argLen: 0,
+               asm:    x86.AVPXORQ,
                reg: regInfo{
                        outputs: []outputInfo{
-                               {0, 2147483648}, // X15
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
                        },
                },
        },
index bf208a4d2914a0d67616f77aaa6b378d44bbd2d8..391d9bcd22ded1f21030b8532e108551ddd4248f 100644 (file)
@@ -1093,11 +1093,6 @@ needm:
        // there's no need to handle that. Clear R14 so that there's
        // a bad value in there, in case needm tries to use it.
        XORPS   X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
        XORQ    R14, R14
        MOVQ    $runtime·needAndBindM<ABIInternal>(SB), AX
        CALL    AX
@@ -1795,11 +1790,6 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
        get_tls(R14)
        MOVQ    g(R14), R14
        XORPS   X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
        JMP     ·sigpanic<ABIInternal>(SB)
 
 // gcWriteBarrier informs the GC about heap pointer writes.
index ade29bc5f1f8fdd56320204e2759e54c122a3b9d..e19118bd54e6eebf906f548605a975b16b9cafdd 100644 (file)
@@ -456,11 +456,6 @@ call:
        // Back to Go world, set special registers.
        // The g register (R14) is preserved in C.
        XORPS   X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
        RET
 
 // C->Go callback thunk that allows to call runtime·racesymbolize from C code.
index e033e8b7021e047acdb9f7b7866f323a9d26c7ab..99d67a9cfd2d6e3a8aa5dac36c50e95a68b64ec2 100644 (file)
@@ -177,11 +177,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index e417d4b8a814f1c9d586ebabe4d8dcf6d8247b12..a223c2cf76bd1def59c8282c179a58b51c679963 100644 (file)
@@ -228,11 +228,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index bab275cc7262dd23faca149243d1096d213e56fd..977ea093d247ac367b3239ba3f2af492359bae1a 100644 (file)
@@ -265,11 +265,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
@@ -295,11 +290,6 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index 618553b1969fadde7a62f9a7f799aad746fe1b43..878f834748e943071df264cea0db3f62c7387108 100644 (file)
@@ -352,11 +352,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
@@ -382,11 +377,6 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index 946b1fbe22cc76f893cacaec5cc53cf446b9e9da..2f1ddcdc89755cd00f144d0b6dcd1d22a723ea8b 100644 (file)
@@ -310,11 +310,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index 7766fa5194ec47e579204179c48ae0f3e7272fb2..ff0bc2416aa4bcc225c72573f32bee1c62e87a98 100644 (file)
@@ -64,11 +64,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
        get_tls(R12)
        MOVQ    g(R12), R14
        PXOR    X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
 
        // Reserve space for spill slots.
        NOP     SP              // disable vet stack checking
index 52a21ba89bbdc92154d769a9216daae85c09d05d..e438599910f4ee06de9714f4b41d6581ffcacd80 100644 (file)
@@ -32,11 +32,6 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0
        // R14 is cleared in case there's a non-zero value in there
        // if called from a non-go thread.
        XORPS   X15, X15
-#ifdef GOEXPERIMENT_simd
-       CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
-       JNE     2(PC)
-       VXORPS  X15, X15, X15
-#endif
        XORQ    R14, R14
 
        get_tls(AX)