This CL redesign how we get the TLS pointer on windows/i386.
It applies the same changes as done in CL 431775 for windows/amd64.
We were previously reading it from the [TEB] arbitrary data slot,
located at 0x14(FS), which can only hold 1 TLS pointer.
With this CL, we will read the TLS pointer from the TEB TLS slot array,
located at 0xE10(GS). The TLS slot array can hold multiple
TLS pointers, up to 64, so multiple Go runtimes running on the
same thread can coexists with different TLS.
Each new TLS slot has to be allocated via [TlsAlloc],
which returns the slot index. This index can then be used to get the
slot offset from GS with the following formula: 0xE10 + index*4.
The slot index is fixed per Go runtime, so we can store it
in runtime.tls_g and use it latter on to read/update the TLS pointer.
Loading the TLS pointer requires the following asm instructions:
MOVQ runtime.tls_g, AX
MOVQ AX(FS), AX
Notice that this approach will now be implemented in all the supported
windows arches.
[TEB]: https://en.wikipedia.org/wiki/Win32_Thread_Information_Block
[TlsAlloc]: https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-tlsalloc
Change-Id: If4550b0d44694ee6480d4093b851f4991a088b32
Reviewed-on: https://go-review.googlesource.com/c/go/+/454675
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Quim Muntal <quimmuntal@gmail.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
}
}
- if ctxt.Arch.Family == sys.I386 {
- if a.Index == REG_TLS && ctxt.Flag_shared {
- // When building for inclusion into a shared library, an instruction of the form
- // MOVL off(CX)(TLS*1), AX
- // becomes
- // mov %gs:off(%ecx), %eax
- // which assumes that the correct TLS offset has been loaded into %ecx (today
- // there is only one TLS variable -- g -- so this is OK). When not building for
- // a shared library the instruction it becomes
- // mov 0x0(%ecx), %eax
- // and a R_TLS_LE relocation, and so does not require a prefix.
- return 0x65 // GS
- }
- return 0
- }
-
switch a.Index {
case REG_CS:
return 0x2e
// When building for inclusion into a shared library, an instruction of the form
// MOV off(CX)(TLS*1), AX
// becomes
- // mov %fs:off(%rcx), %rax
- // which assumes that the correct TLS offset has been loaded into %rcx (today
+ // mov %gs:off(%ecx), %eax // on i386
+ // mov %fs:off(%rcx), %rax // on amd64
+ // which assumes that the correct TLS offset has been loaded into CX (today
// there is only one TLS variable -- g -- so this is OK). When not building for
- // a shared library the instruction does not require a prefix.
- return 0x64
+ // a shared library the instruction it becomes
+ // mov 0x0(%ecx), %eax // on i386
+ // mov 0x0(%rcx), %rax // on amd64
+ // and a R_TLS_LE relocation, and so does not require a prefix.
+ if ctxt.Arch.Family == sys.I386 {
+ return 0x65 // GS
+ }
+ return 0x64 // FS
}
case REG_FS:
if REG_AX <= base && base <= REG_R15 {
if a.Index == REG_TLS && !ctxt.Flag_shared && !isAndroid &&
- !(ctxt.Headtype == objabi.Hwindows && ctxt.Arch.Family == sys.AMD64) {
+ ctxt.Headtype != objabi.Hwindows {
rel = obj.Reloc{}
rel.Type = objabi.R_TLS_LE
rel.Siz = 4
pp.From.Index = REG_NONE
ab.Put1(0x8B)
ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
-
- case objabi.Hwindows:
- // Windows TLS base is always 0x14(FS).
- pp.From = p.From
-
- pp.From.Type = obj.TYPE_MEM
- pp.From.Reg = REG_FS
- pp.From.Offset = 0x14
- pp.From.Index = REG_NONE
- pp.From.Scale = 0
- ab.Put2(0x64, // FS
- 0x8B)
- ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
}
break
}
}
}
- // Android and Win64 use a tls offset determined at runtime. Rewrite
+ // Android and Windows use a tls offset determined at runtime. Rewrite
// MOVQ TLS, BX
// to
// MOVQ runtime.tls_g(SB), BX
- if (isAndroid || (ctxt.Headtype == objabi.Hwindows && ctxt.Arch.Family == sys.AMD64)) &&
+ if (isAndroid || ctxt.Headtype == objabi.Hwindows) &&
(p.As == AMOVQ || p.As == AMOVL) && p.From.Type == obj.TYPE_REG && p.From.Reg == REG_TLS && p.To.Type == obj.TYPE_REG && REG_AX <= p.To.Reg && p.To.Reg <= REG_R15 {
p.From.Type = obj.TYPE_MEM
p.From.Name = obj.NAME_EXTERN
p.From.Sym = ctxt.Lookup("runtime.tls_g")
p.From.Index = REG_NONE
if ctxt.Headtype == objabi.Hwindows {
- // Win64 requires an additional indirection
+ // Windows requires an additional indirection
// to retrieve the TLS pointer,
- // as runtime.tls_g contains the TLS offset from GS.
- // add
+ // as runtime.tls_g contains the TLS offset from GS or FS.
+ // on AMD64 add
// MOVQ 0(BX)(GS*1), BX
+ // on 386 add
+ // MOVQ 0(BX)(FS*1), BX4
q := obj.Appendp(p, newprog)
q.As = p.As
q.From = obj.Addr{}
q.From.Type = obj.TYPE_MEM
q.From.Reg = p.To.Reg
- q.From.Index = REG_GS
+ if ctxt.Arch.Family == sys.AMD64 {
+ q.From.Index = REG_GS
+ } else {
+ q.From.Index = REG_FS
+ }
q.From.Scale = 1
q.From.Offset = 0
q.To = p.To
MOVL $runtime·tls_g(SB), 8(SP) // arg 3: &tls_g
#else
MOVL $0, BX
- MOVL BX, 12(SP) // arg 3,4: not used when using platform's TLS
- MOVL BX, 8(SP)
+ MOVL BX, 12(SP) // arg 4: not used when using platform's TLS
+#ifdef GOOS_windows
+ MOVL $runtime·tls_g(SB), 8(SP) // arg 3: &tls_g
+#else
+ MOVL BX, 8(SP) // arg 3: not used when using platform's TLS
+#endif
#endif
MOVL $setg_gcc<>(SB), BX
MOVL BX, 4(SP) // arg 2: setg_gcc
TEXT runtime·setg(SB), NOSPLIT, $0-4
MOVL gg+0(FP), BX
#ifdef GOOS_windows
+ MOVL runtime·tls_g(SB), CX
CMPL BX, $0
JNE settls
- MOVL $0, 0x14(FS)
+ MOVL $0, 0(CX)(FS)
RET
settls:
MOVL g_m(BX), AX
LEAL m_tls(AX), AX
- MOVL AX, 0x14(FS)
+ MOVL AX, 0(CX)(FS)
#endif
get_tls(CX)
MOVL BX, g(CX)
JMP done
TEXT ldt0setup<>(SB),NOSPLIT,$16-0
+#ifdef GOOS_windows
+ CALL runtime·wintls(SB)
+#endif
// set up ldt 7 to point at m0.tls
// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
// the entry number is just a hint. setldt will set up GS with what it used.
DATA runtime·tls_g+0(SB)/4, $8
GLOBL runtime·tls_g+0(SB), NOPTR, $4
#endif
+#ifdef GOOS_windows
+GLOBL runtime·tls_g+0(SB), NOPTR, $4
+#endif
#include "libcgo_windows.h"
static void threadentry(void*);
+static DWORD *tls_g;
void
-x_cgo_init(G *g)
+x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase)
{
+ tls_g = (DWORD *)tlsg;
}
* Set specific keys in thread local storage.
*/
asm volatile (
- "movl %0, %%fs:0x14\n" // MOVL tls0, 0x14(FS)
- "movl %%fs:0x14, %%eax\n" // MOVL 0x14(FS), tmp
- "movl %1, 0(%%eax)\n" // MOVL g, 0(FS)
- :: "r"(ts.tls), "r"(ts.g) : "%eax"
+ "movl %0, %%fs:0(%1)\n" // MOVL tls0, 0(tls_g)(FS)
+ "movl %%fs:0(%1), %%eax\n" // MOVL 0(tls_g)(FS), tmp
+ "movl %2, 0(%%eax)\n" // MOVL g, 0(AX)
+ :: "r"(ts.tls), "r"(*tls_g), "r"(ts.g) : "%eax"
);
crosscall_386(ts.fn);
#include "textflag.h"
#include "time_windows.h"
+// Offsets into Thread Environment Block (pointer in FS)
+#define TEB_TlsSlots 0xE10
+
// void runtime·asmstdcall(void *c);
TEXT runtime·asmstdcall(SB),NOSPLIT,$0
MOVL fn+0(FP), BX
RET
// void tstart(M *newm);
-TEXT tstart<>(SB),NOSPLIT,$0
+TEXT tstart<>(SB),NOSPLIT,$8-4
MOVL newm+0(FP), CX // m
MOVL m_g0(CX), DX // g
MOVL AX, g_stackguard1(DX)
// Set up tls.
- LEAL m_tls(CX), SI
- MOVL SI, 0x14(FS)
+ LEAL m_tls(CX), DI
MOVL CX, g_m(DX)
- MOVL DX, g(SI)
+ MOVL DX, g(DI)
+ MOVL DI, 4(SP)
+ CALL runtime·setldt(SB) // clobbers CX and DX
// Someday the convention will be D is always cleared.
CLD
RET
-// setldt(int entry, int address, int limit)
-TEXT runtime·setldt(SB),NOSPLIT,$0
- MOVL base+4(FP), CX
- MOVL CX, 0x14(FS)
+// setldt(int slot, int base, int size)
+TEXT runtime·setldt(SB),NOSPLIT,$0-12
+ MOVL base+4(FP), DX
+ MOVL runtime·tls_g(SB), CX
+ MOVL DX, 0(CX)(FS)
RET
// Runs on OS stack.
useQPC:
JMP runtime·nanotimeQPC(SB)
RET
+
+// This is called from rt0_go, which runs on the system stack
+// using the initial stack allocated by the OS.
+TEXT runtime·wintls(SB),NOSPLIT|NOFRAME,$0
+ // Allocate a TLS slot to hold g across calls to external code
+ MOVL SP, BP
+ MOVL runtime·_TlsAlloc(SB), AX
+ CALL AX
+ MOVL BP, SP
+
+ MOVL AX, CX // TLS index
+
+ // Assert that slot is less than 64 so we can use _TEB->TlsSlots
+ CMPL CX, $64
+ JB ok
+ CALL runtime·abort(SB)
+ok:
+ // Convert the TLS index at CX into
+ // an offset from TEB_TlsSlots.
+ SHLL $2, CX
+
+ // Save offset from TLS into tls_g.
+ ADDL $TEB_TlsSlots, CX
+ MOVL CX, runtime·tls_g(SB)
+ RET