// functions that do not grow the stack.
//go:norace
func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
+ // Set up and fork. This returns immediately in the parent or
+ // if there's an error.
+ r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
+ if locked {
+ runtime_AfterFork()
+ }
+ if err1 != 0 {
+ return 0, err1
+ }
+
+ // parent; return PID
+ pid = int(r1)
+
+ if sys.UidMappings != nil || sys.GidMappings != nil {
+ Close(p[0])
+ err := writeUidGidMappings(pid, sys)
+ var err2 Errno
+ if err != nil {
+ err2 = err.(Errno)
+ }
+ RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
+ Close(p[1])
+ }
+
+ return pid, 0
+}
+
+//go:norace
+func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
+ // vfork requires that the child not touch any of the parent's
+ // active stack frames. Hence, the child does all post-fork
+ // processing in this stack frame and never returns, while the
+ // parent returns immediately from this frame and does all
+ // post-fork processing in the outer frame.
// Declare all variables at top in case any
// declarations require heap allocation (e.g., err1).
var (
- r1 uintptr
- err1 Errno
err2 Errno
nextfd int
i int
- p [2]int
)
// Record parent PID so child can test if it has died.
// synchronizing writing of User ID/Group ID mappings.
if sys.UidMappings != nil || sys.GidMappings != nil {
if err := forkExecPipe(p[:]); err != nil {
- return 0, err.(Errno)
+ err1 = err.(Errno)
+ return
}
}
// About to call fork.
// No more allocation or calls of non-assembly functions.
runtime_BeforeFork()
+ locked = true
switch {
case runtime.GOARCH == "amd64" && sys.Cloneflags&CLONE_NEWUSER == 0:
r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
default:
r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
}
- if err1 != 0 {
- runtime_AfterFork()
- return 0, err1
- }
-
- if r1 != 0 {
- // parent; return PID
- runtime_AfterFork()
- pid = int(r1)
-
- if sys.UidMappings != nil || sys.GidMappings != nil {
- Close(p[0])
- err := writeUidGidMappings(pid, sys)
- if err != nil {
- err2 = err.(Errno)
- }
- RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
- Close(p[1])
- }
-
- return pid, 0
+ if err1 != 0 || r1 != 0 {
+ // If we're in the parent, we must return immediately
+ // so we're not in the same stack frame as the child.
+ // This can at most use the return PC, which the child
+ // will not modify, and the results of
+ // rawVforkSyscall, which must have been written after
+ // the child was replaced.
+ return
}
// Fork succeeded, now in child.