On AMD64, we reserve the X15 register as the zero register.
Currently we zero it with an SSE instruction and use it only in
SSE contexts. When the machine supports AVX, the high bits of
the register are not necessarily zero.

Now that the compiler generates AVX code for SIMD, it is useful
to have a zero register in AVX contexts as well. This CL zeroes
the whole X15 register when AVX is supported.
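
For reference, the check-and-zero pattern used at the hand-written
assembly sites looks like this (sketched from the hunks below, with
comments added here for illustration):

    XORPS   X15, X15                  // clear the low 128 bits (SSE)
    CMPB    internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
    JNE     2(PC)                     // skip VXORPS if AVX is unavailable
    VXORPS  X15, X15, X15             // clear the whole register (AVX)

The compiler-generated version (zeroX15 below) emits the same check
against the runtime's x86HasAVX variable, or emits VXORPS
unconditionally when GOAMD64 is v3 or higher.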
Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7
Reviewed-on: https://go-review.googlesource.com/c/go/+/698237
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: David Chase <drchase@google.com>
"cmd/internal/obj"
"cmd/internal/obj/x86"
"internal/abi"
+ "internal/buildcfg"
)
// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
// zeroing X15 when entering ABIInternal from ABI0
- opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+ zeroX15(s)
// set G register from TLS
getgFromTLS(s, x86.REG_R14)
}
s.Call(v)
if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
// zeroing X15 when returning from ABI0 to ABIInternal
- opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+ zeroX15(s)
// set G register from TLS
getgFromTLS(s, x86.REG_R14)
}
}
}
+// zeroX15 zeroes the X15 register. If AVX is available, the whole
+// register is cleared with VXORPS; otherwise only the low 128 bits
+// are cleared with XORPS.
+func zeroX15(s *ssagen.State) {
+ vxorps := func(s *ssagen.State) {
+ p := s.Prog(x86.AVXORPS)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = x86.REG_X15
+ p.AddRestSourceReg(x86.REG_X15)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = x86.REG_X15
+ }
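+ // GOAMD64 v3 and above (x86-64-v3) guarantee AVX, so no runtime check is needed.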
+ if buildcfg.GOAMD64 >= 3 {
+ vxorps(s)
+ return
+ }
+ // AVX may not be available; check at run time before zeroing the high bits.
+ p := s.Prog(x86.ACMPB)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Name = obj.NAME_EXTERN
+ p.From.Sym = ir.Syms.X86HasAVX
+ p.To.Type = obj.TYPE_CONST
+ p.To.Offset = 1
+ jmp := s.Prog(x86.AJNE)
+ jmp.To.Type = obj.TYPE_BRANCH
+ vxorps(s)
+ sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+ jmp.To.SetTarget(sse)
+}
+
// Example instruction: VRSQRTPS X1, X1
func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
Loong64HasLAM_BH *obj.LSym
Loong64HasLSX *obj.LSym
RISCV64HasZbb *obj.LSym
+ X86HasAVX *obj.LSym
X86HasFMA *obj.LSym
X86HasPOPCNT *obj.LSym
X86HasSSE41 *obj.LSym
ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert")
ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero")
ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove")
+ ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool
+ ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool
ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool
- ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool
ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool
ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
}
var BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym
-
func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32
// architecture variants
+var x86HasAVX bool
+var x86HasFMA bool
var x86HasPOPCNT bool
var x86HasSSE41 bool
-var x86HasFMA bool
var armHasVFPv4 bool
var arm64HasATOMICS bool
var loong64HasLAMCAS bool
{"libfuzzerHookStrCmp", funcTag, 155},
{"libfuzzerHookEqualFold", funcTag, 155},
{"addCovMeta", funcTag, 157},
+ {"x86HasAVX", varTag, 6},
+ {"x86HasFMA", varTag, 6},
{"x86HasPOPCNT", varTag, 6},
{"x86HasSSE41", varTag, 6},
- {"x86HasFMA", varTag, 6},
{"armHasVFPv4", varTag, 6},
{"arm64HasATOMICS", varTag, 6},
{"loong64HasLAMCAS", varTag, 6},
// there's no need to handle that. Clear R14 so that there's
// a bad value in there, in case needm tries to use it.
XORPS X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC) // skip VXORPS if AVX is unavailable
+ VXORPS X15, X15, X15 // also clear the high bits of X15
XORQ R14, R14
MOVQ $runtime·needAndBindM<ABIInternal>(SB), AX
CALL AX
get_tls(R14)
MOVQ g(R14), R14
XORPS X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
JMP ·sigpanic<ABIInternal>(SB)
// gcWriteBarrier informs the GC about heap pointer writes.
var (
// Set in runtime.cpuinit.
// TODO: deprecate these; use internal/cpu directly.
+ x86HasAVX bool
+ x86HasFMA bool
x86HasPOPCNT bool
x86HasSSE41 bool
- x86HasFMA bool
armHasVFPv4 bool
// to guard execution of instructions that can not be assumed to be always supported.
switch GOARCH {
case "386", "amd64":
+ x86HasAVX = cpu.X86.HasAVX
+ x86HasFMA = cpu.X86.HasFMA
x86HasPOPCNT = cpu.X86.HasPOPCNT
x86HasSSE41 = cpu.X86.HasSSE41
- x86HasFMA = cpu.X86.HasFMA
case "arm":
armHasVFPv4 = cpu.ARM.HasVFPv4
// Back to Go world, set special registers.
// The g register (R14) is preserved in C.
XORPS X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
RET
// C->Go callback thunk that allows calling runtime·racesymbolize from C code.
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
// R14 is cleared in case there's a non-zero value in there
// if called from a non-go thread.
XORPS X15, X15
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+ JNE 2(PC)
+ VXORPS X15, X15, X15
XORQ R14, R14
get_tls(AX)