os: add clone(CLONE_PIDFD) check to pidfd feature check

author Michael Pratt <mpratt@google.com>

Tue, 11 Jun 2024 20:34:38 +0000 (16:34 -0400)

committer Michael Pratt <mpratt@google.com>

Thu, 12 Sep 2024 15:45:38 +0000 (15:45 +0000)
author Michael Pratt <mpratt@google.com>
Tue, 11 Jun 2024 20:34:38 +0000 (16:34 -0400)
committer Michael Pratt <mpratt@google.com>
Thu, 12 Sep 2024 15:45:38 +0000 (15:45 +0000)
diff --git a/src/os/pidfd_linux.go b/src/os/pidfd_linux.go

index 545cfe9613b8b4e5710c16efc863a8d2d6c72953..459d88cb445b427dc8197778759c7df4fe40b409 100644 (file)
--- a/src/os/pidfd_linux.go
+++ b/src/os/pidfd_linux.go
@@ -8,6 +8,10 @@
  //  v5.3: pidfd_open syscall, clone3 syscall;
  //  v5.4: P_PIDFD idtype support for waitid syscall;
  //  v5.6: pidfd_getfd syscall.
+//
+// N.B. Alternative Linux implementations may not follow this ordering. For
+// example, QEMU user mode 7.2 added pidfd_open, but CLONE_PIDFD was not added
+// until 8.0.
  
  package os
  
@@ -139,9 +143,9 @@ func pidfdWorks() bool {
  
  var checkPidfdOnce = sync.OnceValue(checkPidfd)
  
-// checkPidfd checks whether all required pidfd-related syscalls work.
-// This consists of pidfd_open and pidfd_send_signal syscalls, and waitid
-// syscall with idtype of P_PIDFD.
+// checkPidfd checks whether all required pidfd-related syscalls work. This
+// consists of pidfd_open and pidfd_send_signal syscalls, waitid syscall with
+// idtype of P_PIDFD, and clone(CLONE_PIDFD).
  //
  // Reasons for non-working pidfd syscalls include an older kernel and an
  // execution environment in which the above system calls are restricted by
@@ -172,5 +176,19 @@ func checkPidfd() error {
                 return NewSyscallError("pidfd_send_signal", err)
         }
  
+       // Verify that clone(CLONE_PIDFD) works.
+       //
+       // This shouldn't be necessary since pidfd_open was added in Linux 5.3,
+       // after CLONE_PIDFD in Linux 5.2, but some alternative Linux
+       // implementations may not adhere to this ordering.
+       if err := checkClonePidfd(); err != nil {
+               return err
+       }
+
         return nil
  }
+
+// checkClonePidfd verifies that clone(CLONE_PIDFD) works. Provided by syscall.
+//
+//go:linkname checkClonePidfd
+func checkClonePidfd() error
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go

index 1859a58294d5b2dcc0f401115a7bd0fec4ba3c04..429a84635ae789184513fa83359288fcf683e644 100644 (file)
--- a/src/syscall/exec_linux.go
+++ b/src/syscall/exec_linux.go
@@ -7,6 +7,7 @@
  package syscall
  
  import (
+       errpkg "errors"
         "internal/itoa"
         "runtime"
         "unsafe"
@@ -330,6 +331,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
         if clone3 != nil {
                 pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
         } else {
+               // N.B. Keep in sync with doCheckClonePidfd.
                 flags |= uintptr(SIGCHLD)
                 if runtime.GOARCH == "s390x" {
                         // On Linux/s390, the first two arguments of clone(2) are swapped.
@@ -758,3 +760,82 @@ func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
                 *sys.PidFD = -1
         }
  }
+
+// checkClonePidfd verifies that clone(CLONE_PIDFD) works by actually doing a
+// clone. It returns nil only if a pidfd was provided and used to reap the child.
+//
+//go:linkname os_checkClonePidfd os.checkClonePidfd
+func os_checkClonePidfd() error {
+       pidfd := int32(-1) // sentinel: still -1 after the clone means no pidfd was provided
+       pid, errno := doCheckClonePidfd(&pidfd)
+       if errno != 0 {
+               return errno // the clone itself failed
+       }
+
+       if pidfd == -1 {
+               // Bad: CLONE_PIDFD failed to provide a pidfd. Reap the process
+               // before returning.
+
+               var err error
+               for {
+                       var status WaitStatus
+                       _, err = Wait4(int(pid), &status, 0, nil) // no pidfd, so wait by pid
+                       if err != EINTR { // retry only when interrupted
+                               break
+                       }
+               }
+               if err != nil {
+                       return err // report the failed wait instead of the missing pidfd
+               }
+
+               return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd")
+       }
+
+       // Good: CLONE_PIDFD provided a pidfd. Reap the process and close the
+       // pidfd.
+       defer Close(int(pidfd))
+
+       for {
+               const _P_PIDFD = 3 // waitid idtype for waiting on a pidfd
+               _, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), 0, WEXITED, 0, 0)
+               if errno != EINTR { // retry only when interrupted
+                       break
+               }
+       }
+       if errno != 0 {
+               return errno
+       }
+
+       return nil
+}
+
+// doCheckClonePidfd implements the actual clone call of os_checkClonePidfd and
+// child execution. This is a separate function so we can separate the child's
+// and parent's stack frames if we're using vfork.
+//
+// This is go:noinline because the point is to keep the stack frames of this
+// and os_checkClonePidfd separate.
+//
+//go:noinline
+func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) {
+       flags := uintptr(CLONE_VFORK|CLONE_VM|CLONE_PIDFD|SIGCHLD) // N.B. Keep in sync with forkAndExecInChild1.
+       if runtime.GOARCH == "s390x" {
+               // On Linux/s390, the first two arguments of clone(2) are swapped.
+               pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd)))
+       } else {
+               pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd)))
+       }
+       if errno != 0 || pid != 0 { // the clone failed, or we are the parent
+               // If we're in the parent, we must return immediately
+               // so we're not in the same stack frame as the child.
+               // This can at most use the return PC, which the child
+               // will not modify, and the results of
+               // rawVforkSyscall, which must have been written after
+               // the child was replaced.
+               return
+       }
+
+       for { // child: exit with status 0 (the loop presumably guards against exit returning)
+               RawSyscall(SYS_EXIT, 0, 0, 0)
+       }
+}
author	Michael Pratt <mpratt@google.com>
	Tue, 11 Jun 2024 20:34:38 +0000 (16:34 -0400)
committer	Michael Pratt <mpratt@google.com>
	Thu, 12 Sep 2024 15:45:38 +0000 (15:45 +0000)
src/os/pidfd_linux.go		patch \| blob \| history
src/syscall/exec_linux.go		patch \| blob \| history