From: Bryan C. Mills Date: Thu, 31 Aug 2023 19:22:14 +0000 (-0400) Subject: net: deflake TestDialTimeout on windows X-Git-Tag: go1.22rc1~1013 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=e3ef8d18102d923a1dbd499ce5ae21ee70e13638;p=gostls13.git net: deflake TestDialTimeout on windows The time granularity on windows is large enough that setting even an implausibly small timeout still gives ConnectEx enough time to succeed before the timeout expires. That causes TestDialTimeout to sometimes flake, because it expects to be able to provoke a timeout using some nonzero duration. This change takes a two-pronged approach to address the problem: 1. We can set a deadline on the FD more aggressively. (If the Context has already expired, or the deadline is already known, we can go ahead and set it on the fd without waiting for a background goroutine to get around to it.) 2. We can reintroduce a test hook to ensure that Dial takes a measurable amount of time before it completes, so that setting an implausibly short deadline sets that deadline in the past instead of the future. Together, these reduce the flake rate on a windows-amd64-longtest gomote from around 1-in-10 to less than 1-in-2000. For #62377. Change-Id: I03975c32f61fffa9f6f84efb3c474a01ac5a0d1e Reviewed-on: https://go-review.googlesource.com/c/go/+/524936 Run-TryBot: Bryan Mills Reviewed-by: Ian Lance Taylor Reviewed-by: Damien Neil Auto-Submit: Bryan Mills TryBot-Result: Gopher Robot LUCI-TryBot-Result: Go LUCI --- diff --git a/src/net/dial.go b/src/net/dial.go index 79bc4958bb..dd34b6cef2 100644 --- a/src/net/dial.go +++ b/src/net/dial.go @@ -457,6 +457,7 @@ func (d *Dialer) DialContext(ctx context.Context, network, address string) (Conn } deadline := d.deadline(ctx, time.Now()) if !deadline.IsZero() { + testHookStepTime() if d, ok := ctx.Deadline(); !ok || deadline.Before(d) { subCtx, cancel := context.WithDeadline(ctx, deadline) defer cancel() diff --git a/src/net/fd_windows.go b/src/net/fd_windows.go index eeb994dfd9..45a10cf1eb 100644 --- a/src/net/fd_windows.go +++ b/src/net/fd_windows.go @@ -64,10 +64,38 @@ func (fd *netFD) connect(ctx context.Context, la, ra syscall.Sockaddr) (syscall. if err := fd.init(); err != nil { return nil, err } - if deadline, ok := ctx.Deadline(); ok && !deadline.IsZero() { - fd.pfd.SetWriteDeadline(deadline) + + if ctx.Done() != nil { + // Propagate the Context's deadline and cancellation. + // If the context is already done, or if it has a nonzero deadline, + // ensure that that is applied before the call to ConnectEx begins + // so that we don't return spurious connections. defer fd.pfd.SetWriteDeadline(noDeadline) + + if ctx.Err() != nil { + fd.pfd.SetWriteDeadline(aLongTimeAgo) + } else { + if deadline, ok := ctx.Deadline(); ok && !deadline.IsZero() { + fd.pfd.SetWriteDeadline(deadline) + } + + done := make(chan struct{}) + stop := context.AfterFunc(ctx, func() { + // Force the runtime's poller to immediately give + // up waiting for writability. + fd.pfd.SetWriteDeadline(aLongTimeAgo) + close(done) + }) + defer func() { + if !stop() { + // Wait for the call to SetWriteDeadline to complete so that we can + // reset the deadline if everything else succeeded. + <-done + } + }() + } } + if !canUseConnectEx(fd.net) { err := connectFunc(fd.pfd.Sysfd, ra) return nil, os.NewSyscallError("connect", err) @@ -113,22 +141,6 @@ func (fd *netFD) connect(ctx context.Context, la, ra syscall.Sockaddr) (syscall. _ = fd.pfd.WSAIoctl(windows.SIO_TCP_INITIAL_RTO, (*byte)(unsafe.Pointer(¶ms)), uint32(unsafe.Sizeof(params)), nil, 0, &out, nil, 0) } - // Wait for the goroutine converting context.Done into a write timeout - // to exist, otherwise our caller might cancel the context and - // cause fd.setWriteDeadline(aLongTimeAgo) to cancel a successful dial. - done := make(chan bool) // must be unbuffered - defer func() { done <- true }() - go func() { - select { - case <-ctx.Done(): - // Force the runtime's poller to immediately give - // up waiting for writability. - fd.pfd.SetWriteDeadline(aLongTimeAgo) - <-done - case <-done: - } - }() - // Call ConnectEx API. if err := fd.pfd.ConnectEx(ra); err != nil { select { diff --git a/src/net/hook.go b/src/net/hook.go index ea71803e22..35c660b4a3 100644 --- a/src/net/hook.go +++ b/src/net/hook.go @@ -23,4 +23,10 @@ var ( return fn(ctx, network, host) } testHookSetKeepAlive = func(time.Duration) {} + + // testHookStepTime sleeps until time has moved forward by a nonzero amount. + // This helps to avoid flakes in timeout tests by ensuring that an implausibly + // short deadline (such as 1ns in the future) is always expired by the time + // a relevant system call occurs. + testHookStepTime = func() {} ) diff --git a/src/net/timeout_test.go b/src/net/timeout_test.go index 581e1148c0..2e23b2f5df 100644 --- a/src/net/timeout_test.go +++ b/src/net/timeout_test.go @@ -18,6 +18,21 @@ import ( "time" ) +func init() { + // Install a hook to ensure that a 1ns timeout will always + // be exceeded by the time Dial gets to the relevant system call. + // + // Without this, systems with a very large timer granularity — such as + // Windows — may be able to accept connections without measurably exceeding + // even an implausibly short deadline. + testHookStepTime = func() { + now := time.Now() + for time.Since(now) == 0 { + time.Sleep(1 * time.Nanosecond) + } + } +} + var dialTimeoutTests = []struct { initialTimeout time.Duration initialDelta time.Duration // for deadline