if reg == "SP" || reg == "BP" {
continue
}
- if strings.HasPrefix(reg, "X") {
- l.add("MOVUPS", reg, 16)
- } else {
+ if !strings.HasPrefix(reg, "X") {
l.add("MOVQ", reg, 8)
}
}
+ lSSE := layout{stack: l.stack, sp: "SP"}
+ for _, reg := range regNamesAMD64 {
+ if strings.HasPrefix(reg, "X") {
+ lSSE.add("MOVUPS", reg, 16)
+ }
+ }
// TODO: MXCSR register?
p("// Save flags before clobbering them")
p("PUSHFQ")
p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
- p("ADJSP $%d", l.stack)
+ p("ADJSP $%d", lSSE.stack)
p("// But vet doesn't know ADJSP, so suppress vet stack checking")
p("NOP SP")
+ l.save()
+
// Apparently, the signal handling code path in the darwin kernel leaves
// the upper bits of Y registers in a dirty state, which causes
// many SSE operations (128-bit and narrower) to become much slower.
p("VZEROUPPER")
p("#endif")
- l.save()
+ lSSE.save()
p("CALL ·asyncPreempt2(SB)")
+ lSSE.restore()
l.restore()
- p("ADJSP $%d", -l.stack)
+ p("ADJSP $%d", -lSSE.stack)
p("POPFQ")
p("POPQ BP")
p("RET")
ADJSP $368
// But vet doesn't know ADJSP, so suppress vet stack checking
NOP SP
- #ifdef GOOS_darwin
- CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
- JE 2(PC)
- VZEROUPPER
- #endif
MOVQ AX, 0(SP)
MOVQ CX, 8(SP)
MOVQ DX, 16(SP)
MOVQ R13, 88(SP)
MOVQ R14, 96(SP)
MOVQ R15, 104(SP)
+ #ifdef GOOS_darwin
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
+ JE 2(PC)
+ VZEROUPPER
+ #endif
MOVUPS X0, 112(SP)
MOVUPS X1, 128(SP)
MOVUPS X2, 144(SP)