From a9d4b2dbe47008677b6abb04a7b9c8327eb4d21b Mon Sep 17 00:00:00 2001 From: "Bryan C. Mills" Date: Tue, 29 Aug 2023 11:26:45 -0400 Subject: [PATCH] net: avoid internal hooks in TestDialTimeout MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit TestDialTimeout has historically been very flaky (#11872, #13144, #22896, and now #56876), apparently in part due to implementation details of the socktest package it relies on. In reviewing CL 467335, I noticed that TestDialTimeout is the last remaining use of testHookDialChannel (added for #5349), and that that hook no longer has any effect for Unix and Windows. As an experiment, I tried removing both that hook and the call to time.Sleep in the socktest filter, and to my surprise the test continued to pass. That greatly undermined my confidence in the test, since it appears that the “timeout” behavior it observes is caused by the socktest filter injecting an error rather than anything in the net package proper actually timing out. To restore confidence in the test, I think it should be written against only the public API of the net package, and should test the publicly-documented behaviors. This change implements that approach. Notably, when a timeout is set on a Dial call, that does not guarantee that the listener will actually call Accept on the connection before the timeout occurs: the kernel's network stack may preemptively accept and buffer the connection on behalf of the listener. To avoid test flakiness, the test must tolerate (and leave open) those spurious connections: when the kernel has accepted enough of them, it will start to block new connections until the buffered connections have been accepted, and the expected timeout behavior will occur. This also allows the test to run much more quickly and in parallel: since we are relying on real timeouts instead of injected calls to time.Sleep, we can set the timeouts to be much shorter and run concurrently with other public-API tests without introducing races. Fixes #56876. Change-Id: I90dcb2ed70976e70857ca29c253ed760cb078a4c Reviewed-on: https://go-review.googlesource.com/c/go/+/524055 TryBot-Bypass: Bryan Mills Reviewed-by: Damien Neil Auto-Submit: Bryan Mills Run-TryBot: Bryan Mills --- src/net/hook_plan9.go | 9 --- src/net/hook_unix.go | 1 - src/net/hook_windows.go | 3 - src/net/ipsock_plan9.go | 1 - src/net/timeout_test.go | 143 +++++++++++++++++++++++++++------------- 5 files changed, 97 insertions(+), 60 deletions(-) delete mode 100644 src/net/hook_plan9.go diff --git a/src/net/hook_plan9.go b/src/net/hook_plan9.go deleted file mode 100644 index e053348505..0000000000 --- a/src/net/hook_plan9.go +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package net - -import "time" - -var testHookDialChannel = func() { time.Sleep(time.Millisecond) } // see golang.org/issue/5349 diff --git a/src/net/hook_unix.go b/src/net/hook_unix.go index 4e20f59218..89cb404c83 100644 --- a/src/net/hook_unix.go +++ b/src/net/hook_unix.go @@ -9,7 +9,6 @@ package net import "syscall" var ( - testHookDialChannel = func() {} // for golang.org/issue/5349 testHookCanceledDial = func() {} // for golang.org/issue/16523 // Placeholders for socket system calls. diff --git a/src/net/hook_windows.go b/src/net/hook_windows.go index 28c49cc6de..946e1efa8d 100644 --- a/src/net/hook_windows.go +++ b/src/net/hook_windows.go @@ -7,12 +7,9 @@ package net import ( "internal/syscall/windows" "syscall" - "time" ) var ( - testHookDialChannel = func() { time.Sleep(time.Millisecond) } // see golang.org/issue/5349 - // Placeholders for socket system calls. wsaSocketFunc func(int32, int32, int32, *syscall.WSAProtocolInfo, uint32, uint32) (syscall.Handle, error) = windows.WSASocket connectFunc func(syscall.Handle, syscall.Sockaddr) error = syscall.Connect diff --git a/src/net/ipsock_plan9.go b/src/net/ipsock_plan9.go index 43287431c8..c8d0180436 100644 --- a/src/net/ipsock_plan9.go +++ b/src/net/ipsock_plan9.go @@ -181,7 +181,6 @@ func dialPlan9(ctx context.Context, net string, laddr, raddr Addr) (fd *netFD, e } resc := make(chan res) go func() { - testHookDialChannel() fd, err := dialPlan9Blocking(ctx, net, laddr, raddr) select { case resc <- res{fd, err}: diff --git a/src/net/timeout_test.go b/src/net/timeout_test.go index c0bce57b94..4218025fc0 100644 --- a/src/net/timeout_test.go +++ b/src/net/timeout_test.go @@ -11,9 +11,9 @@ import ( "fmt" "internal/testenv" "io" - "net/internal/socktest" "os" "runtime" + "strings" "sync" "testing" "time" @@ -22,62 +22,113 @@ import ( var dialTimeoutTests = []struct { timeout time.Duration delta time.Duration // for deadline - - guard time.Duration }{ // Tests that dial timeouts, deadlines in the past work. - {-5 * time.Second, 0, -5 * time.Second}, - {0, -5 * time.Second, -5 * time.Second}, - {-5 * time.Second, 5 * time.Second, -5 * time.Second}, // timeout over deadline - {-1 << 63, 0, time.Second}, - {0, -1 << 63, time.Second}, - - {50 * time.Millisecond, 0, 100 * time.Millisecond}, - {0, 50 * time.Millisecond, 100 * time.Millisecond}, - {50 * time.Millisecond, 5 * time.Second, 100 * time.Millisecond}, // timeout over deadline + {-5 * time.Second, 0}, + {0, -5 * time.Second}, + {-5 * time.Second, 5 * time.Second}, // timeout over deadline + {-1 << 63, 0}, + {0, -1 << 63}, + + {1 * time.Millisecond, 0}, + {0, 1 * time.Millisecond}, + {1 * time.Millisecond, 5 * time.Second}, // timeout over deadline } func TestDialTimeout(t *testing.T) { - // Cannot use t.Parallel - modifies global hooks. - origTestHookDialChannel := testHookDialChannel - defer func() { testHookDialChannel = origTestHookDialChannel }() - defer sw.Set(socktest.FilterConnect, nil) - - for i, tt := range dialTimeoutTests { - switch runtime.GOOS { - case "plan9", "windows": - testHookDialChannel = func() { time.Sleep(tt.guard) } - if runtime.GOOS == "plan9" { - break - } - fallthrough - default: - sw.Set(socktest.FilterConnect, func(so *socktest.Status) (socktest.AfterFilter, error) { - time.Sleep(tt.guard) - return nil, errTimedout - }) - } + switch runtime.GOOS { + case "plan9": + t.Skipf("not supported on %s", runtime.GOOS) + } + + t.Parallel() - d := Dialer{Timeout: tt.timeout} - if tt.delta != 0 { - d.Deadline = time.Now().Add(tt.delta) + ln := newLocalListener(t, "tcp") + defer func() { + if err := ln.Close(); err != nil { + t.Error(err) } + }() - // This dial never starts to send any TCP SYN - // segment because of above socket filter and - // test hook. - c, err := d.Dial("tcp", "127.0.0.1:0") - if err == nil { - err = fmt.Errorf("unexpectedly established: tcp:%s->%s", c.LocalAddr(), c.RemoteAddr()) + // We expect the kernel to spuriously accept some number of connections on + // behalf of the listener, even when it hasn't called Accept yet. + var bufferedConns []Conn + t.Cleanup(func() { + t.Logf("ignored %d spurious connections", len(bufferedConns)) + for _, c := range bufferedConns { c.Close() } + }) - if perr := parseDialError(err); perr != nil { - t.Errorf("#%d: %v", i, perr) - } - if nerr, ok := err.(Error); !ok || !nerr.Timeout() { - t.Fatalf("#%d: %v", i, err) - } + for _, tt := range dialTimeoutTests { + t.Run(fmt.Sprintf("%v/%v", tt.timeout, tt.delta), func(t *testing.T) { + // We don't run these subtests in parallel because (at least on Linux) + // that empirically causes many of the Dial calls to fail with + // ECONNREFUSED instead of a timeout error. + d := Dialer{Timeout: tt.timeout} + if tt.delta != 0 { + d.Deadline = time.Now().Add(tt.delta) + } + + var ( + beforeDial time.Time + afterDial time.Time + err error + ) + for err == nil { + beforeDial = time.Now() + var c Conn + c, err = d.Dial(ln.Addr().Network(), ln.Addr().String()) + afterDial = time.Now() + if err == nil { + // The connection was accepted before the timeout took effect; leave + // the connection open and try again. Eventually we will have so many + // open connections that the kernel stops buffering new ones, in which + // case the Dial calls should start to time out and return errors. + bufferedConns = append(bufferedConns, c) + } + } + + if strings.Contains(err.Error(), "connection reset by peer") && (testenv.Builder() == "" || runtime.GOOS == "freebsd") { + // After we set up the connection on Unix, we make a call to + // getsockopt to retrieve its status. Empirically, on some platforms + // (notably FreeBSD 13), we may see ECONNRESET from that call instead + // of a timeout when the listener's accept queue is full. + // + // We don't retry ECONNRESET errors in the saturation loop above, + // because there is no upper bound on how often they will occur. + // Empirically, with a 1ms timeout a single run of the test could + // provoke upward of 100k ECONNRESETS, running for over 15s before + // it finally trigged a timeout. + // + // We record this as a skipped subtest rather than a passing test so + // that we can (potentially, one day) analyze it as such: this test + // didn't fail, but it also didn't successfully provoke the intended + // timeout behavior. + // + // We don't allow this on Go builders other than the freebsd builder + // because we're not aware of any other platforms with this behavior, + // and if the test suddenly starts skipping on other platforms we want + // to know about it so that we can fix either the test or our Dial + // implementation. + t.Logf("Dial: %v", err) + t.Skipf("skipping due to ECONNRESET with full accept queue") + } + + if d.Deadline.IsZero() || afterDial.Before(d.Deadline) { + delay := afterDial.Sub(beforeDial) + if delay < tt.timeout { + t.Errorf("Dial returned after %v; want ≥%v", delay, tt.timeout) + } + } + + if perr := parseDialError(err); perr != nil { + t.Errorf("unexpected error from Dial: %v", perr) + } + if nerr, ok := err.(Error); !ok || !nerr.Timeout() { + t.Errorf("Dial: %v, want timeout", err) + } + }) } } -- 2.48.1