]> Cypherpunks repositories - gostls13.git/commitdiff
net: avoid internal hooks in TestDialTimeout
authorBryan C. Mills <bcmills@google.com>
Tue, 29 Aug 2023 15:26:45 +0000 (11:26 -0400)
committerGopher Robot <gobot@golang.org>
Tue, 29 Aug 2023 20:38:53 +0000 (20:38 +0000)
TestDialTimeout has historically been very flaky
(#11872, #13144, #22896, and now #56876),
apparently in part due to implementation details of the socktest
package it relies on.

In reviewing CL 467335, I noticed that TestDialTimeout is the last
remaining use of testHookDialChannel (added for #5349), and that that
hook no longer has any effect for Unix and Windows.

As an experiment, I tried removing both that hook and the call to
time.Sleep in the socktest filter, and to my surprise the test
continued to pass. That greatly undermined my confidence in the test,
since it appears that the “timeout” behavior it observes is caused by
the socktest filter injecting an error rather than anything in the net
package proper actually timing out.

To restore confidence in the test, I think it should be written
against only the public API of the net package, and should test the
publicly-documented behaviors. This change implements that approach.

Notably, when a timeout is set on a Dial call, that does not guarantee
that the listener will actually call Accept on the connection before
the timeout occurs: the kernel's network stack may preemptively accept
and buffer the connection on behalf of the listener. To avoid test
flakiness, the test must tolerate (and leave open) those spurious
connections: when the kernel has accepted enough of them, it will
start to block new connections until the buffered connections have
been accepted, and the expected timeout behavior will occur.

This also allows the test to run much more quickly and in parallel:
since we are relying on real timeouts instead of injected calls to
time.Sleep, we can set the timeouts to be much shorter and run
concurrently with other public-API tests without introducing races.

Fixes #56876.

Change-Id: I90dcb2ed70976e70857ca29c253ed760cb078a4c
Reviewed-on: https://go-review.googlesource.com/c/go/+/524055
TryBot-Bypass: Bryan Mills <bcmills@google.com>
Reviewed-by: Damien Neil <dneil@google.com>
Auto-Submit: Bryan Mills <bcmills@google.com>
Run-TryBot: Bryan Mills <bcmills@google.com>

src/net/hook_plan9.go [deleted file]
src/net/hook_unix.go
src/net/hook_windows.go
src/net/ipsock_plan9.go
src/net/timeout_test.go

diff --git a/src/net/hook_plan9.go b/src/net/hook_plan9.go
deleted file mode 100644 (file)
index e053348..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package net
-
-import "time"
-
-var testHookDialChannel = func() { time.Sleep(time.Millisecond) } // see golang.org/issue/5349
index 4e20f592180cac1afbb249914dd66579dfcd4fe5..89cb404c835a8fe5ddb557943890562c38a59da8 100644 (file)
@@ -9,7 +9,6 @@ package net
 import "syscall"
 
 var (
-       testHookDialChannel  = func() {} // for golang.org/issue/5349
        testHookCanceledDial = func() {} // for golang.org/issue/16523
 
        // Placeholders for socket system calls.
index 28c49cc6de7e7bcbaba710c9446759f8ce2677b9..946e1efa8df45bd47f98ed4ff2111768c7fe70fa 100644 (file)
@@ -7,12 +7,9 @@ package net
 import (
        "internal/syscall/windows"
        "syscall"
-       "time"
 )
 
 var (
-       testHookDialChannel = func() { time.Sleep(time.Millisecond) } // see golang.org/issue/5349
-
        // Placeholders for socket system calls.
        wsaSocketFunc func(int32, int32, int32, *syscall.WSAProtocolInfo, uint32, uint32) (syscall.Handle, error) = windows.WSASocket
        connectFunc   func(syscall.Handle, syscall.Sockaddr) error                                                = syscall.Connect
index 43287431c88c89ad581cff755b19ac5b96b97b59..c8d01804361177c4248de405ba0a3cce87682685 100644 (file)
@@ -181,7 +181,6 @@ func dialPlan9(ctx context.Context, net string, laddr, raddr Addr) (fd *netFD, e
        }
        resc := make(chan res)
        go func() {
-               testHookDialChannel()
                fd, err := dialPlan9Blocking(ctx, net, laddr, raddr)
                select {
                case resc <- res{fd, err}:
index c0bce57b949f6025977ab07b4415c8a04314bd86..4218025fc09e2987516dd8b364a21a39c93ecef0 100644 (file)
@@ -11,9 +11,9 @@ import (
        "fmt"
        "internal/testenv"
        "io"
-       "net/internal/socktest"
        "os"
        "runtime"
+       "strings"
        "sync"
        "testing"
        "time"
@@ -22,62 +22,113 @@ import (
 var dialTimeoutTests = []struct {
        timeout time.Duration
        delta   time.Duration // for deadline
-
-       guard time.Duration
 }{
        // Tests that dial timeouts, deadlines in the past work.
-       {-5 * time.Second, 0, -5 * time.Second},
-       {0, -5 * time.Second, -5 * time.Second},
-       {-5 * time.Second, 5 * time.Second, -5 * time.Second}, // timeout over deadline
-       {-1 << 63, 0, time.Second},
-       {0, -1 << 63, time.Second},
-
-       {50 * time.Millisecond, 0, 100 * time.Millisecond},
-       {0, 50 * time.Millisecond, 100 * time.Millisecond},
-       {50 * time.Millisecond, 5 * time.Second, 100 * time.Millisecond}, // timeout over deadline
+       {-5 * time.Second, 0},
+       {0, -5 * time.Second},
+       {-5 * time.Second, 5 * time.Second}, // timeout over deadline
+       {-1 << 63, 0},
+       {0, -1 << 63},
+
+       {1 * time.Millisecond, 0},
+       {0, 1 * time.Millisecond},
+       {1 * time.Millisecond, 5 * time.Second}, // timeout over deadline
 }
 
 func TestDialTimeout(t *testing.T) {
-       // Cannot use t.Parallel - modifies global hooks.
-       origTestHookDialChannel := testHookDialChannel
-       defer func() { testHookDialChannel = origTestHookDialChannel }()
-       defer sw.Set(socktest.FilterConnect, nil)
-
-       for i, tt := range dialTimeoutTests {
-               switch runtime.GOOS {
-               case "plan9", "windows":
-                       testHookDialChannel = func() { time.Sleep(tt.guard) }
-                       if runtime.GOOS == "plan9" {
-                               break
-                       }
-                       fallthrough
-               default:
-                       sw.Set(socktest.FilterConnect, func(so *socktest.Status) (socktest.AfterFilter, error) {
-                               time.Sleep(tt.guard)
-                               return nil, errTimedout
-                       })
-               }
+       switch runtime.GOOS {
+       case "plan9":
+               t.Skipf("not supported on %s", runtime.GOOS)
+       }
+
+       t.Parallel()
 
-               d := Dialer{Timeout: tt.timeout}
-               if tt.delta != 0 {
-                       d.Deadline = time.Now().Add(tt.delta)
+       ln := newLocalListener(t, "tcp")
+       defer func() {
+               if err := ln.Close(); err != nil {
+                       t.Error(err)
                }
+       }()
 
-               // This dial never starts to send any TCP SYN
-               // segment because of above socket filter and
-               // test hook.
-               c, err := d.Dial("tcp", "127.0.0.1:0")
-               if err == nil {
-                       err = fmt.Errorf("unexpectedly established: tcp:%s->%s", c.LocalAddr(), c.RemoteAddr())
+       // We expect the kernel to spuriously accept some number of connections on
+       // behalf of the listener, even when it hasn't called Accept yet.
+       var bufferedConns []Conn
+       t.Cleanup(func() {
+               t.Logf("ignored %d spurious connections", len(bufferedConns))
+               for _, c := range bufferedConns {
                        c.Close()
                }
+       })
 
-               if perr := parseDialError(err); perr != nil {
-                       t.Errorf("#%d: %v", i, perr)
-               }
-               if nerr, ok := err.(Error); !ok || !nerr.Timeout() {
-                       t.Fatalf("#%d: %v", i, err)
-               }
+       for _, tt := range dialTimeoutTests {
+               t.Run(fmt.Sprintf("%v/%v", tt.timeout, tt.delta), func(t *testing.T) {
+                       // We don't run these subtests in parallel because (at least on Linux)
+                       // that empirically causes many of the Dial calls to fail with
+                       // ECONNREFUSED instead of a timeout error.
+                       d := Dialer{Timeout: tt.timeout}
+                       if tt.delta != 0 {
+                               d.Deadline = time.Now().Add(tt.delta)
+                       }
+
+                       var (
+                               beforeDial time.Time
+                               afterDial  time.Time
+                               err        error
+                       )
+                       for err == nil {
+                               beforeDial = time.Now()
+                               var c Conn
+                               c, err = d.Dial(ln.Addr().Network(), ln.Addr().String())
+                               afterDial = time.Now()
+                               if err == nil {
+                                       // The connection was accepted before the timeout took effect; leave
+                                       // the connection open and try again. Eventually we will have so many
+                                       // open connections that the kernel stops buffering new ones, in which
+                                       // case the Dial calls should start to time out and return errors.
+                                       bufferedConns = append(bufferedConns, c)
+                               }
+                       }
+
+                       if strings.Contains(err.Error(), "connection reset by peer") && (testenv.Builder() == "" || runtime.GOOS == "freebsd") {
+                               // After we set up the connection on Unix, we make a call to
+                               // getsockopt to retrieve its status. Empirically, on some platforms
+                               // (notably FreeBSD 13), we may see ECONNRESET from that call instead
+                               // of a timeout when the listener's accept queue is full.
+                               //
+                               // We don't retry ECONNRESET errors in the saturation loop above,
+                               // because there is no upper bound on how often they will occur.
+                               // Empirically, with a 1ms timeout a single run of the test could
+                               // provoke upward of 100k ECONNRESETS, running for over 15s before
+                               // it finally trigged a timeout.
+                               //
+                               // We record this as a skipped subtest rather than a passing test so
+                               // that we can (potentially, one day) analyze it as such: this test
+                               // didn't fail, but it also didn't successfully provoke the intended
+                               // timeout behavior.
+                               //
+                               // We don't allow this on Go builders other than the freebsd builder
+                               // because we're not aware of any other platforms with this behavior,
+                               // and if the test suddenly starts skipping on other platforms we want
+                               // to know about it so that we can fix either the test or our Dial
+                               // implementation.
+                               t.Logf("Dial: %v", err)
+                               t.Skipf("skipping due to ECONNRESET with full accept queue")
+                       }
+
+                       if d.Deadline.IsZero() || afterDial.Before(d.Deadline) {
+                               delay := afterDial.Sub(beforeDial)
+                               if delay < tt.timeout {
+                                       t.Errorf("Dial returned after %v; want ≥%v", delay, tt.timeout)
+                               }
+                       }
+
+                       if perr := parseDialError(err); perr != nil {
+                               t.Errorf("unexpected error from Dial: %v", perr)
+                       }
+                       if nerr, ok := err.(Error); !ok || !nerr.Timeout() {
+                               t.Errorf("Dial: %v, want timeout", err)
+                       }
+               })
        }
 }