]> Cypherpunks repositories - gostls13.git/commitdiff
runtime,cmd/internal/obj/x86: use TEB TLS slots on windows/amd64
authorqmuntal <quimmuntal@gmail.com>
Mon, 19 Sep 2022 10:19:38 +0000 (12:19 +0200)
committerAlex Brainman <alex.brainman@gmail.com>
Mon, 14 Nov 2022 20:43:12 +0000 (20:43 +0000)
This CL redesign how we get the TLS pointer on windows/amd64.

We were previously reading it from the [TEB] arbitrary data slot,
located at 0x28(GS), which can only hold 1 TLS pointer.

With this CL, we will read the TLS pointer from the TEB TLS slot array,
located at 0x1480(GS). The TLS slot array can hold multiple
TLS pointers, up to 64, so multiple Go runtimes running on the
same thread can coexists with different TLS.

Each new TLS slot has to be allocated via [TlsAlloc],
which returns the slot index. This index can then be used to get the
slot offset from GS with the following formula: 0x1480 + index*8

The slot index is fixed per Go runtime, so we can store it
in runtime.tls_g and use it latter on to read/update the TLS pointer.

Loading the TLS pointer requires the following asm instructions:

  MOVQ runtime.tls_g, AX
  MOVQ AX(GS), AX

Notice that this approach is also implemented on windows/arm64.

[TEB]: https://en.wikipedia.org/wiki/Win32_Thread_Information_Block
[TlsAlloc]: https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-tlsalloc

Updates #22192

Change-Id: Idea7119fd76a3cd083979a4d57ed64b552fa101b
Reviewed-on: https://go-review.googlesource.com/c/go/+/431775
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Quim Muntal <quimmuntal@gmail.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Alex Brainman <alex.brainman@gmail.com>
src/cmd/internal/obj/x86/asm6.go
src/cmd/internal/obj/x86/obj6.go
src/runtime/asm_amd64.s
src/runtime/cgo/gcc_windows_amd64.c
src/runtime/sys_windows_amd64.s

index 9faaba3759d7886e1d080df42dfff5b139b420eb..953eedc0d0622c0599bee8305328d8815c6ea995 100644 (file)
@@ -3612,7 +3612,7 @@ func (ab *AsmBuf) asmandsz(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj
                goto bad
        }
 
-       if a.Index != REG_NONE && a.Index != REG_TLS {
+       if a.Index != REG_NONE && a.Index != REG_TLS && !(REG_CS <= a.Index && a.Index <= REG_GS) {
                base := int(a.Reg)
                switch a.Name {
                case obj.NAME_EXTERN,
@@ -3724,7 +3724,8 @@ func (ab *AsmBuf) asmandsz(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj
        }
 
        if REG_AX <= base && base <= REG_R15 {
-               if a.Index == REG_TLS && !ctxt.Flag_shared && !isAndroid {
+               if a.Index == REG_TLS && !ctxt.Flag_shared && !isAndroid &&
+                       !(ctxt.Headtype == objabi.Hwindows && ctxt.Arch.Family == sys.AMD64) {
                        rel = obj.Reloc{}
                        rel.Type = objabi.R_TLS_LE
                        rel.Siz = 4
@@ -5205,21 +5206,6 @@ func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
                                                ab.Put2(0x64, // FS
                                                        0x8B)
                                                ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
-
-                                       case objabi.Hwindows:
-                                               // Windows TLS base is always 0x28(GS).
-                                               pp.From = p.From
-
-                                               pp.From.Type = obj.TYPE_MEM
-                                               pp.From.Name = obj.NAME_NONE
-                                               pp.From.Reg = REG_GS
-                                               pp.From.Offset = 0x28
-                                               pp.From.Index = REG_NONE
-                                               pp.From.Scale = 0
-                                               ab.rexflag |= Pw
-                                               ab.Put2(0x65, // GS
-                                                       0x8B)
-                                               ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
                                        }
                                }
                                return
index a82285a0d319448717072cc2d1f9447e8ca82340..85a426045304e652e46d49a30348b71baa26e052 100644 (file)
@@ -158,16 +158,33 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
                }
        }
 
-       // Android uses a tls offset determined at runtime. Rewrite
+       // Android and Win64 use a tls offset determined at runtime. Rewrite
        //      MOVQ TLS, BX
        // to
        //      MOVQ runtime.tls_g(SB), BX
-       if isAndroid && (p.As == AMOVQ || p.As == AMOVL) && p.From.Type == obj.TYPE_REG && p.From.Reg == REG_TLS && p.To.Type == obj.TYPE_REG && REG_AX <= p.To.Reg && p.To.Reg <= REG_R15 {
+       if (isAndroid || (ctxt.Headtype == objabi.Hwindows && ctxt.Arch.Family == sys.AMD64)) &&
+               (p.As == AMOVQ || p.As == AMOVL) && p.From.Type == obj.TYPE_REG && p.From.Reg == REG_TLS && p.To.Type == obj.TYPE_REG && REG_AX <= p.To.Reg && p.To.Reg <= REG_R15 {
                p.From.Type = obj.TYPE_MEM
                p.From.Name = obj.NAME_EXTERN
                p.From.Reg = REG_NONE
                p.From.Sym = ctxt.Lookup("runtime.tls_g")
                p.From.Index = REG_NONE
+               if ctxt.Headtype == objabi.Hwindows {
+                       // Win64 requires an additional indirection
+                       // to retrieve the TLS pointer,
+                       // as runtime.tls_g contains the TLS offset from GS.
+                       // add
+                       //      MOVQ 0(BX)(GS*1), BX
+                       q := obj.Appendp(p, newprog)
+                       q.As = p.As
+                       q.From = obj.Addr{}
+                       q.From.Type = obj.TYPE_MEM
+                       q.From.Reg = p.To.Reg
+                       q.From.Index = REG_GS
+                       q.From.Scale = 1
+                       q.From.Offset = 0
+                       q.To = p.To
+               }
        }
 
        // TODO: Remove.
index d2f798417898bc91578ecb793b577607539f7d5e..13c8de499e7d62a5a0501c5b6e99758735cdec97 100644 (file)
@@ -201,16 +201,16 @@ nocpuinfo:
        JZ      needtls
        // arg 1: g0, already in DI
        MOVQ    $setg_gcc<>(SB), SI // arg 2: setg_gcc
+       MOVQ    $0, DX  // arg 3, 4: not used when using platform's TLS
+       MOVQ    $0, CX
 #ifdef GOOS_android
        MOVQ    $runtime·tls_g(SB), DX         // arg 3: &tls_g
        // arg 4: TLS base, stored in slot 0 (Android's TLS_SLOT_SELF).
        // Compensate for tls_g (+16).
        MOVQ    -16(TLS), CX
-#else
-       MOVQ    $0, DX  // arg 3, 4: not used when using platform's TLS
-       MOVQ    $0, CX
 #endif
 #ifdef GOOS_windows
+       MOVQ    $runtime·tls_g(SB), DX         // arg 3: &tls_g
        // Adjust for the Win64 calling convention.
        MOVQ    CX, R9 // arg 4
        MOVQ    DX, R8 // arg 3
@@ -251,6 +251,10 @@ needtls:
        JMP ok
 #endif
 
+#ifdef GOOS_windows
+       CALL    runtime·wintls(SB)
+#endif
+
        LEAQ    runtime·m0+m_tls(SB), DI
        CALL    runtime·settls(SB)
 
@@ -2026,6 +2030,9 @@ TEXT runtime·panicSliceConvert<ABIInternal>(SB),NOSPLIT,$0-16
 DATA runtime·tls_g+0(SB)/8, $16
 GLOBL runtime·tls_g+0(SB), NOPTR, $8
 #endif
+#ifdef GOOS_windows
+GLOBL runtime·tls_g+0(SB), NOPTR, $8
+#endif
 
 // The compiler and assembler's -spectre=ret mode rewrites
 // all indirect CALL AX / JMP AX instructions to be
index 996947eccf4baa23e29fb1481859b575d514e119..3ff3c64565c31d087e616b8da9549663962f9efd 100644 (file)
 
 static void threadentry(void*);
 static void (*setg_gcc)(void*);
+static DWORD *tls_g;
 
 void
 x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase)
 {
        setg_gcc = setg;
+       tls_g = (DWORD *)tlsg;
 }
 
 
@@ -41,8 +43,8 @@ threadentry(void *v)
         * Set specific keys in thread local storage.
         */
        asm volatile (
-         "movq %0, %%gs:0x28\n"        // MOVL tls0, 0x28(GS)
-         :: "r"(ts.tls)
+         "movq %0, %%gs:0(%1)\n"       // MOVL tls0, 0(tls_g)(GS)
+         :: "r"(ts.tls), "r"(*tls_g)
        );
 
        crosscall_amd64(ts.fn, setg_gcc, (void*)ts.g);
index 4e00f64fae27048da58c6ba2b95a1c39fe05a038..777726f7c11fc57b5a726fff7341ad72127f699c 100644 (file)
@@ -8,6 +8,9 @@
 #include "time_windows.h"
 #include "cgo/abi_amd64.h"
 
+// Offsets into Thread Environment Block (pointer in GS)
+#define TEB_TlsSlots 0x1480
+
 // void runtime·asmstdcall(void *c);
 TEXT runtime·asmstdcall(SB),NOSPLIT|NOFRAME,$0
        // asmcgocall will put first argument into CX.
@@ -303,10 +306,10 @@ TEXT runtime·tstart_stdcall(SB),NOSPLIT,$0
        MOVQ    AX, g_stackguard1(DX)
 
        // Set up tls.
-       LEAQ    m_tls(CX), SI
-       MOVQ    SI, 0x28(GS)
+       LEAQ    m_tls(CX), DI
        MOVQ    CX, g_m(DX)
-       MOVQ    DX, g(SI)
+       MOVQ    DX, g(DI)
+       CALL    runtime·settls(SB) // clobbers CX
 
        CALL    runtime·stackcheck(SB) // clobbers AX,CX
        CALL    runtime·mstart(SB)
@@ -318,7 +321,8 @@ TEXT runtime·tstart_stdcall(SB),NOSPLIT,$0
 
 // set tls base to DI
 TEXT runtime·settls(SB),NOSPLIT,$0
-       MOVQ    DI, 0x28(GS)
+       MOVQ    runtime·tls_g(SB), CX
+       MOVQ    DI, 0(CX)(GS)
        RET
 
 // Runs on OS stack.
@@ -404,3 +408,32 @@ TEXT runtime·osSetupTLS(SB),NOSPLIT,$0-8
        LEAQ    m_tls(AX), DI
        CALL    runtime·settls(SB)
        RET
+
+// This is called from rt0_go, which runs on the system stack
+// using the initial stack allocated by the OS.
+TEXT runtime·wintls(SB),NOSPLIT|NOFRAME,$0
+       // Allocate a TLS slot to hold g across calls to external code
+       MOVQ    SP, AX
+       ANDQ    $~15, SP        // alignment as per Windows requirement
+       SUBQ    $48, SP // room for SP and 4 args as per Windows requirement
+                       // plus one extra word to keep stack 16 bytes aligned
+       MOVQ    AX, 32(SP)
+       MOVQ    runtime·_TlsAlloc(SB), AX
+       CALL    AX
+       MOVQ    32(SP), SP
+
+       MOVQ    AX, CX  // TLS index
+
+       // Assert that slot is less than 64 so we can use _TEB->TlsSlots
+       CMPQ    CX, $64
+       JB      ok
+       CALL    runtime·abort(SB)
+ok:
+       // Convert the TLS index at CX into
+       // an offset from TEB_TlsSlots.
+       SHLQ    $3, CX
+
+       // Save offset from TLS into tls_g.
+       ADDQ    $TEB_TlsSlots, CX
+       MOVQ    CX, runtime·tls_g(SB)
+       RET