From: David Crawshaw Date: Mon, 30 Mar 2015 12:36:37 +0000 (-0400) Subject: misc/ios: retry loop to handle builder flakiness X-Git-Tag: go1.5beta1~1381 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=00e0fe4b95d0ebaf17bab86795337015e476b3fc;p=gostls13.git misc/ios: retry loop to handle builder flakiness After moving the darwin/arm builder to new hardware several new flaky error messages appeared. This provided enough information to Google to make it clear that iOS build systems have been flaky for many years, and that is unlikely to change any time soon. However, all of the pain of lldb and using a breakpoint early in program initialization gives us an advantage: all install and initialization flakiness appears to happen before the Go program ever gets going. So if we see an error or we timeout before we reach our breakpoint (before any test code has executed), we can assume it is the fault of the builder and restart without risking hiding a flaky Go test. This code has successfully processed the last 8 builds. I am hopeful. Change-Id: Ide24aaae4fa7bdab9d8f4432bb85d8f2256c7606 Reviewed-on: https://go-review.googlesource.com/8241 Reviewed-by: Hyang-Ah Hana Kim --- diff --git a/misc/ios/go_darwin_arm_exec.go b/misc/ios/go_darwin_arm_exec.go index 431ddcc70b..4495f52c90 100644 --- a/misc/ios/go_darwin_arm_exec.go +++ b/misc/ios/go_darwin_arm_exec.go @@ -26,6 +26,10 @@ import ( const debug = false +var errRetry = errors.New("failed to start test harness (retry attempted)") + +var tmpdir string + func main() { log.SetFlags(0) log.SetPrefix("go_darwin_arm_exec: ") @@ -36,39 +40,39 @@ func main() { log.Fatal("usage: go_darwin_arm_exec a.out") } - if err := run(os.Args[1], os.Args[2:]); err != nil { - fmt.Fprintf(os.Stderr, "go_darwin_arm_exec: %v\n", err) - os.Exit(1) + var err error + tmpdir, err = ioutil.TempDir("", "go_darwin_arm_exec_") + if err != nil { + log.Fatal(err) } -} -func run(bin string, args []string) (err error) { - type waitPanic struct { - err error - } - defer func() { - if r := recover(); r != nil { - if w, ok := r.(waitPanic); ok { - err = w.err - return - } - panic(r) + // Approximately 1 in a 100 binaries fail to start. If it happens, + // try again. These failures happen for several reasons beyond + // our control, but all of them are safe to retry as they happen + // before lldb encounters the initial getwd breakpoint. As we + // know the tests haven't started, we are not hiding flaky tests + // with this retry. + for i := 0; i < 5; i++ { + if i > 0 { + fmt.Fprintln(os.Stderr, "start timeout, trying again") + } + err = run(os.Args[1], os.Args[2:]) + if err == nil || err != errRetry { + break } - }() - - defer exec.Command("killall", "ios-deploy").Run() // cleanup - - exec.Command("killall", "ios-deploy").Run() - - tmpdir, err := ioutil.TempDir("", "go_darwin_arm_exec_") - if err != nil { - log.Fatal(err) } if !debug { - defer os.RemoveAll(tmpdir) + os.RemoveAll(tmpdir) } + if err != nil { + fmt.Fprintf(os.Stderr, "go_darwin_arm_exec: %v\n", err) + os.Exit(1) + } +} +func run(bin string, args []string) (err error) { appdir := filepath.Join(tmpdir, "gotest.app") + os.RemoveAll(appdir) if err := os.MkdirAll(appdir, 0755); err != nil { return err } @@ -109,9 +113,31 @@ func run(bin string, args []string) (err error) { return fmt.Errorf("codesign: %v", err) } - if err := os.Chdir(tmpdir); err != nil { + oldwd, err := os.Getwd() + if err != nil { + return err + } + if err := os.Chdir(filepath.Join(appdir, "..")); err != nil { return err } + defer os.Chdir(oldwd) + + type waitPanic struct { + err error + } + defer func() { + if r := recover(); r != nil { + if w, ok := r.(waitPanic); ok { + err = w.err + return + } + panic(r) + } + }() + + defer exec.Command("killall", "ios-deploy").Run() // cleanup + + exec.Command("killall", "ios-deploy").Run() // ios-deploy invokes lldb to give us a shell session with the app. cmd = exec.Command( @@ -175,11 +201,11 @@ func run(bin string, args []string) (err error) { w.printBuf() return fmt.Errorf("failed (stage %s): %v", stage, err) case i := <-w.find(str, timeout): - if i >= 0 { - w.clearTo(i + len(str)) - } else { - log.Printf("timed out on stage %s, continuing", stage) + if i < 0 { + log.Printf("timed out on stage %q, retrying", stage) + return errRetry } + w.clearTo(i + len(str)) return nil } } @@ -192,7 +218,11 @@ func run(bin string, args []string) (err error) { // Wait for installation and connection. if err := waitFor("ios-deploy before run", "(lldb) connect\r\nProcess 0 connected\r\n", 0); err != nil { - return err + // Retry if we see a rare and longstanding ios-deploy bug. + // https://github.com/phonegap/ios-deploy/issues/11 + // Assertion failed: (AMDeviceStartService(device, CFSTR("com.apple.debugserver"), &gdbfd, NULL) == 0) + log.Printf("%v, retrying", err) + return errRetry } // Script LLDB. Oh dear. @@ -205,9 +235,21 @@ func run(bin string, args []string) (err error) { do(`breakpoint set -n getwd`) // in runtime/cgo/gcc_darwin_arm.go fmt.Fprintln(lldb, `run`) - // Sometimes we don't see "reason = breakpoint", so we time out - // and try to continue. - if err := waitFor("br getwd", "stop reason = breakpoint", 10*time.Second); err != nil { + if err := waitFor("br getwd", "stop reason = breakpoint", 20*time.Second); err != nil { + // At this point we see several flaky errors from the iOS + // build infrastructure. The most common is never reaching + // the breakpoint, which we catch with a timeout. Very + // occasionally lldb can produce errors like: + // + // Breakpoint 1: no locations (pending). + // WARNING: Unable to resolve breakpoint to any actual locations. + // + // As no actual test code has been executed by this point, + // we treat all errors as recoverable. + if err != errRetry { + log.Printf("%v, retrying", err) + err = errRetry + } return err } if err := waitFor("br getwd prompt", "(lldb)", 0); err != nil {