From dba623b1c7663016c79edbec517f8c8e7feb1437 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 23 Jul 2013 18:40:02 -0400 Subject: [PATCH] runtime: reduce frame size for runtime.cgocallback_gofunc Tying preemption to stack splits means that we have to able to complete the call to exitsyscall (inside cgocallbackg at least for now) without any stack split checks, meaning that the whole sequence has to work within 128 bytes of stack, unless we increase the size of the red zone. This CL frees up 24 bytes along that critical path on amd64. (The 32-bit systems have plenty of space because all their words are smaller.) R=dvyukov CC=golang-dev https://golang.org/cl/11676043 --- src/pkg/runtime/asm_386.s | 61 +++++++++++++++---------------------- src/pkg/runtime/asm_amd64.s | 59 ++++++++++++++--------------------- src/pkg/runtime/asm_arm.s | 48 +++++++++++------------------ src/pkg/runtime/cgocall.c | 23 +++++++++++--- src/pkg/runtime/proc.c | 8 ++--- 5 files changed, 88 insertions(+), 111 deletions(-) diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s index 5238e59437..6b0739b2ec 100644 --- a/src/pkg/runtime/asm_386.s +++ b/src/pkg/runtime/asm_386.s @@ -524,7 +524,7 @@ TEXT runtime·cgocallback(SB),7,$12-12 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) // See cgocall.c for more details. -TEXT runtime·cgocallback_gofunc(SB),7,$12-12 +TEXT runtime·cgocallback_gofunc(SB),7,$8-12 // If m is nil, Go did not create the current thread. // Call needm to obtain one for temporary use. // In this case, we're running on the thread stack, so there's @@ -532,13 +532,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$12-12 // the linker analysis by using an indirect call through AX. get_tls(CX) #ifdef GOOS_windows + MOVL $0, BP CMPL CX, $0 - JNE 3(PC) - PUSHL $0 - JMP needm + JNE 2(PC) #endif MOVL m(CX), BP - PUSHL BP + MOVL BP, 4(SP) CMPL BP, $0 JNE havem needm: @@ -552,55 +551,42 @@ havem: // Save current m->g0->sched.sp on stack and then set it to SP. // Save current sp in m->g0->sched.sp in preparation for // switch back to m->curg stack. + // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP). MOVL m_g0(BP), SI - PUSHL (g_sched+gobuf_sp)(SI) + MOVL (g_sched+gobuf_sp)(SI), AX + MOVL AX, 0(SP) MOVL SP, (g_sched+gobuf_sp)(SI) - // Switch to m->curg stack and call runtime.cgocallbackg - // with the three arguments. Because we are taking over - // the execution of m->curg but *not* resuming what had - // been running, we need to save that information (m->curg->sched) - // so that we can restore it when we're done. + // Switch to m->curg stack and call runtime.cgocallbackg. + // Because we are taking over the execution of m->curg + // but *not* resuming what had been running, we need to + // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. // To save m->curg->sched.pc, we push it onto the stack. // This has the added benefit that it looks to the traceback // routine like cgocallbackg is going to return to that - // PC (because we defined cgocallbackg to have - // a frame size of 12, the same amount that we use below), + // PC (because the frame we allocate below has the same + // size as cgocallback_gofunc's frame declared above) // so that the traceback will seamlessly trace back into // the earlier calls. - MOVL fn+0(FP), AX - MOVL frame+4(FP), BX - MOVL framesize+8(FP), DX - + // + // In the new goroutine, 0(SP) and 4(SP) are unused except + // on Windows, where they are the SEH block. MOVL m_curg(BP), SI MOVL SI, g(CX) - MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI - - // Push gobuf.pc + MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI MOVL (g_sched+gobuf_pc)(SI), BP - SUBL $4, DI - MOVL BP, 0(DI) - - // Push arguments to cgocallbackg. - // Frame size here must match the frame size above plus the pushes - // to trick traceback routines into doing the right thing. - SUBL $20, DI - MOVL AX, 0(DI) - MOVL BX, 4(DI) - MOVL DX, 8(DI) - - // Switch stack and make the call. - MOVL DI, SP + MOVL BP, -4(DI) + LEAL -(4+8)(DI), SP CALL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. get_tls(CX) MOVL g(CX), SI - MOVL 20(SP), BP + MOVL 8(SP), BP MOVL BP, (g_sched+gobuf_pc)(SI) - LEAL (20+4)(SP), DI + LEAL (8+4)(SP), DI MOVL DI, (g_sched+gobuf_sp)(SI) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -610,11 +596,12 @@ havem: MOVL m_g0(BP), SI MOVL SI, g(CX) MOVL (g_sched+gobuf_sp)(SI), SP - POPL (g_sched+gobuf_sp)(SI) + MOVL 0(SP), AX + MOVL AX, (g_sched+gobuf_sp)(SI) // If the m on entry was nil, we called needm above to borrow an m // for the duration of the call. Since the call is over, return it with dropm. - POPL BP + MOVL 8(SP), BP CMPL BP, $0 JNE 3(PC) MOVL $runtime·dropm(SB), AX diff --git a/src/pkg/runtime/asm_amd64.s b/src/pkg/runtime/asm_amd64.s index f8f77124d9..1ec635516d 100644 --- a/src/pkg/runtime/asm_amd64.s +++ b/src/pkg/runtime/asm_amd64.s @@ -563,7 +563,7 @@ TEXT runtime·cgocallback(SB),7,$24-24 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) // See cgocall.c for more details. -TEXT runtime·cgocallback_gofunc(SB),7,$24-24 +TEXT runtime·cgocallback_gofunc(SB),7,$16-24 // If m is nil, Go did not create the current thread. // Call needm to obtain one for temporary use. // In this case, we're running on the thread stack, so there's @@ -571,13 +571,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$24-24 // the linker analysis by using an indirect call through AX. get_tls(CX) #ifdef GOOS_windows + MOVL $0, BP CMPQ CX, $0 - JNE 3(PC) - PUSHQ $0 - JMP needm + JNE 2(PC) #endif MOVQ m(CX), BP - PUSHQ BP + MOVQ BP, 8(SP) CMPQ BP, $0 JNE havem needm: @@ -591,55 +590,42 @@ havem: // Save current m->g0->sched.sp on stack and then set it to SP. // Save current sp in m->g0->sched.sp in preparation for // switch back to m->curg stack. + // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP). MOVQ m_g0(BP), SI - PUSHQ (g_sched+gobuf_sp)(SI) + MOVQ (g_sched+gobuf_sp)(SI), AX + MOVQ AX, 0(SP) MOVQ SP, (g_sched+gobuf_sp)(SI) - // Switch to m->curg stack and call runtime.cgocallbackg - // with the three arguments. Because we are taking over - // the execution of m->curg but *not* resuming what had - // been running, we need to save that information (m->curg->sched) - // so that we can restore it when we're done. + // Switch to m->curg stack and call runtime.cgocallbackg. + // Because we are taking over the execution of m->curg + // but *not* resuming what had been running, we need to + // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. // To save m->curg->sched.pc, we push it onto the stack. // This has the added benefit that it looks to the traceback // routine like cgocallbackg is going to return to that - // PC (because we defined cgocallbackg to have - // a frame size of 24, the same amount that we use below), + // PC (because the frame we allocate below has the same + // size as cgocallback_gofunc's frame declared above) // so that the traceback will seamlessly trace back into // the earlier calls. - MOVQ fn+0(FP), AX - MOVQ frame+8(FP), BX - MOVQ framesize+16(FP), DX - + // + // In the new goroutine, 0(SP) and 8(SP) are unused except + // on Windows, where they are the SEH block. MOVQ m_curg(BP), SI MOVQ SI, g(CX) MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI - - // Push gobuf.pc MOVQ (g_sched+gobuf_pc)(SI), BP - SUBQ $8, DI - MOVQ BP, 0(DI) - - // Push arguments to cgocallbackg. - // Frame size here must match the frame size above plus the pushes - // to trick traceback routines into doing the right thing. - SUBQ $40, DI - MOVQ AX, 0(DI) - MOVQ BX, 8(DI) - MOVQ DX, 16(DI) - - // Switch stack and make the call. - MOVQ DI, SP + MOVQ BP, -8(DI) + LEAQ -(8+16)(DI), SP CALL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. get_tls(CX) MOVQ g(CX), SI - MOVQ 40(SP), BP + MOVQ 16(SP), BP MOVQ BP, (g_sched+gobuf_pc)(SI) - LEAQ (40+8)(SP), DI + LEAQ (16+8)(SP), DI MOVQ DI, (g_sched+gobuf_sp)(SI) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -649,11 +635,12 @@ havem: MOVQ m_g0(BP), SI MOVQ SI, g(CX) MOVQ (g_sched+gobuf_sp)(SI), SP - POPQ (g_sched+gobuf_sp)(SI) + MOVQ 0(SP), AX + MOVQ AX, (g_sched+gobuf_sp)(SI) // If the m on entry was nil, we called needm above to borrow an m // for the duration of the call. Since the call is over, return it with dropm. - POPQ BP + MOVQ 8(SP), BP CMPQ BP, $0 JNE 3(PC) MOVQ $runtime·dropm(SB), AX diff --git a/src/pkg/runtime/asm_arm.s b/src/pkg/runtime/asm_arm.s index be6d29b567..863d9a5064 100644 --- a/src/pkg/runtime/asm_arm.s +++ b/src/pkg/runtime/asm_arm.s @@ -331,7 +331,7 @@ TEXT runtime·cgocallback(SB),7,$12-12 // cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize) // See cgocall.c for more details. -TEXT runtime·cgocallback_gofunc(SB),7,$12-12 +TEXT runtime·cgocallback_gofunc(SB),7,$8-12 // Load m and g from thread-local storage. MOVW _cgo_load_gm(SB), R0 CMP $0, R0 @@ -342,7 +342,7 @@ TEXT runtime·cgocallback_gofunc(SB),7,$12-12 // In this case, we're running on the thread stack, so there's // lots of space, but the linker doesn't know. Hide the call from // the linker analysis by using an indirect call. - MOVW m, savedm-12(SP) + MOVW m, savedm-4(SP) CMP $0, m B.NE havem MOVW $runtime·needm(SB), R0 @@ -353,51 +353,41 @@ havem: // Save current m->g0->sched.sp on stack and then set it to SP. // Save current sp in m->g0->sched.sp in preparation for // switch back to m->curg stack. + // NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-8(SP). MOVW m_g0(m), R3 MOVW (g_sched+gobuf_sp)(R3), R4 - MOVW.W R4, -4(R13) + MOVW R4, savedsp-8(SP) MOVW R13, (g_sched+gobuf_sp)(R3) - // Switch to m->curg stack and call runtime.cgocallbackg - // with the three arguments. Because we are taking over - // the execution of m->curg but *not* resuming what had - // been running, we need to save that information (m->curg->sched) - // so that we can restore it when we're done. + // Switch to m->curg stack and call runtime.cgocallbackg. + // Because we are taking over the execution of m->curg + // but *not* resuming what had been running, we need to + // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. // To save m->curg->sched.pc, we push it onto the stack. // This has the added benefit that it looks to the traceback // routine like cgocallbackg is going to return to that - // PC (because we defined cgocallbackg to have - // a frame size of 12, the same amount that we use below), + // PC (because the frame we allocate below has the same + // size as cgocallback_gofunc's frame declared above) // so that the traceback will seamlessly trace back into // the earlier calls. + // + // In the new goroutine, -8(SP) and -4(SP) are unused. MOVW fn+4(FP), R0 MOVW frame+8(FP), R1 MOVW framesize+12(FP), R2 - MOVW m_curg(m), g MOVW (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 - - // Push gobuf.pc - // Frame size here must match the frame size above plus the push - // to trick traceback routines into doing the right thing. MOVW (g_sched+gobuf_pc)(g), R5 - MOVW.W R5, -20(R4) - - // Push arguments to cgocallbackg. - MOVW R0, 4(R4) - MOVW R1, 8(R4) - MOVW R2, 12(R4) - - // Switch stack and make the call. - MOVW R4, R13 + MOVW R5, -12(R4) + MOVW $-12(R4), R13 BL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVW 0(R13), R5 MOVW R5, (g_sched+gobuf_pc)(g) - ADD $(16+4), R13, R4 + MOVW $12(R13), R4 MOVW R4, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -405,14 +395,12 @@ havem: // so we do not have to restore it.) MOVW m_g0(m), g MOVW (g_sched+gobuf_sp)(g), R13 - // POP R6 - MOVW 0(R13), R6 - ADD $4, R13 - MOVW R6, (g_sched+gobuf_sp)(g) + MOVW savedsp-8(SP), R4 + MOVW R4, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m // for the duration of the call. Since the call is over, return it with dropm. - MOVW savedm-12(SP), R6 + MOVW savedm-4(SP), R6 CMP $0, R6 B.NE 3(PC) MOVW $runtime·dropm(SB), R0 diff --git a/src/pkg/runtime/cgocall.c b/src/pkg/runtime/cgocall.c index 16bc765549..a624509cfe 100644 --- a/src/pkg/runtime/cgocall.c +++ b/src/pkg/runtime/cgocall.c @@ -228,13 +228,25 @@ runtime·cfree(void *p) static FuncVal unwindmf = {unwindm}; +typedef struct CallbackArgs CallbackArgs; +struct CallbackArgs +{ + FuncVal *fn; + void *arg; + uintptr argsize; +}; + +#define CBARGS (CallbackArgs*)((byte*)m->g0->sched.sp+(3+(thechar=='5'))*sizeof(void*)) + void -runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize) +runtime·cgocallbackg(void) { Defer d; + CallbackArgs *cb; if(m->racecall) { - reflect·call(fn, arg, argsize); + cb = CBARGS; + reflect·call(cb->fn, cb->arg, cb->argsize); return; } @@ -261,7 +273,8 @@ runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize) runtime·raceacquire(&cgosync); // Invoke callback. - reflect·call(fn, arg, argsize); + cb = CBARGS; + reflect·call(cb->fn, cb->arg, cb->argsize); if(raceenabled) runtime·racereleasemerge(&cgosync); @@ -286,9 +299,11 @@ unwindm(void) runtime·throw("runtime: unwindm not implemented"); case '8': case '6': - case '5': m->g0->sched.sp = *(uintptr*)m->g0->sched.sp; break; + case '5': + m->g0->sched.sp = *(uintptr*)((byte*)m->g0->sched.sp + 4); + break; } } diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c index 9530b9984a..3ae9fe7273 100644 --- a/src/pkg/runtime/proc.c +++ b/src/pkg/runtime/proc.c @@ -651,10 +651,10 @@ runtime·needm(byte x) g->stackguard0 = g->stackguard; // On windows/386, we need to put an SEH frame (two words) - // somewhere on the current stack. We are called - // from needm, and we know there is some available - // space one word into the argument frame. Use that. - m->seh = (SEH*)((uintptr*)&x + 1); + // somewhere on the current stack. We are called from cgocallback_gofunc + // and we know that it will leave two unused words below m->curg->sched.sp. + // Use those. + m->seh = (SEH*)((uintptr*)m->curg->sched.sp - 3); // Initialize this thread to use the m. runtime·asminit(); -- 2.48.1