misc/ios: retry loop to handle builder flakiness

author David Crawshaw <crawshaw@golang.org>

Mon, 30 Mar 2015 12:36:37 +0000 (08:36 -0400)

committer David Crawshaw <crawshaw@golang.org>

Mon, 30 Mar 2015 18:11:07 +0000 (18:11 +0000)
author David Crawshaw <crawshaw@golang.org>
Mon, 30 Mar 2015 12:36:37 +0000 (08:36 -0400)
committer David Crawshaw <crawshaw@golang.org>
Mon, 30 Mar 2015 18:11:07 +0000 (18:11 +0000)
diff --git a/misc/ios/go_darwin_arm_exec.go b/misc/ios/go_darwin_arm_exec.go

index 431ddcc70bf7a5b79e82e6bcdc6afceacb978e56..4495f52c90d344f75d1286bb5963c3bf58b28446 100644 (file)
--- a/misc/ios/go_darwin_arm_exec.go
+++ b/misc/ios/go_darwin_arm_exec.go
@@ -26,6 +26,10 @@ import (
  
  const debug = false
  
+var errRetry = errors.New("failed to start test harness (retry attempted)")
+
+var tmpdir string
+
  func main() {
         log.SetFlags(0)
         log.SetPrefix("go_darwin_arm_exec: ")
@@ -36,39 +40,39 @@ func main() {
                 log.Fatal("usage: go_darwin_arm_exec a.out")
         }
  
-       if err := run(os.Args[1], os.Args[2:]); err != nil {
-               fmt.Fprintf(os.Stderr, "go_darwin_arm_exec: %v\n", err)
-               os.Exit(1)
+       var err error
+       tmpdir, err = ioutil.TempDir("", "go_darwin_arm_exec_")
+       if err != nil {
+               log.Fatal(err)
         }
-}
  
-func run(bin string, args []string) (err error) {
-       type waitPanic struct {
-               err error
-       }
-       defer func() {
-               if r := recover(); r != nil {
-                       if w, ok := r.(waitPanic); ok {
-                               err = w.err
-                               return
-                       }
-                       panic(r)
+       // Approximately 1 in 100 binaries fail to start. If it happens,
+       // try again. These failures happen for several reasons beyond
+       // our control, but all of them are safe to retry as they happen
+       // before lldb encounters the initial getwd breakpoint. As we
+       // know the tests haven't started, we are not hiding flaky tests
+       // with this retry.
+       for i := 0; i < 5; i++ {
+               if i > 0 {
+                       fmt.Fprintln(os.Stderr, "start timeout, trying again")
+               }
+               err = run(os.Args[1], os.Args[2:])
+               if err == nil || err != errRetry {
+                       break
                 }
-       }()
-
-       defer exec.Command("killall", "ios-deploy").Run() // cleanup
-
-       exec.Command("killall", "ios-deploy").Run()
-
-       tmpdir, err := ioutil.TempDir("", "go_darwin_arm_exec_")
-       if err != nil {
-               log.Fatal(err)
         }
         if !debug {
-               defer os.RemoveAll(tmpdir)
+               os.RemoveAll(tmpdir)
         }
+       if err != nil {
+               fmt.Fprintf(os.Stderr, "go_darwin_arm_exec: %v\n", err)
+               os.Exit(1)
+       }
+}
  
+func run(bin string, args []string) (err error) {
         appdir := filepath.Join(tmpdir, "gotest.app")
+       os.RemoveAll(appdir)
         if err := os.MkdirAll(appdir, 0755); err != nil {
                 return err
         }
@@ -109,9 +113,31 @@ func run(bin string, args []string) (err error) {
                 return fmt.Errorf("codesign: %v", err)
         }
  
-       if err := os.Chdir(tmpdir); err != nil {
+       oldwd, err := os.Getwd()
+       if err != nil {
+               return err
+       }
+       if err := os.Chdir(filepath.Join(appdir, "..")); err != nil {
                 return err
         }
+       defer os.Chdir(oldwd)
+
+       type waitPanic struct {
+               err error
+       }
+       defer func() {
+               if r := recover(); r != nil {
+                       if w, ok := r.(waitPanic); ok {
+                               err = w.err
+                               return
+                       }
+                       panic(r)
+               }
+       }()
+
+       defer exec.Command("killall", "ios-deploy").Run() // cleanup
+
+       exec.Command("killall", "ios-deploy").Run()
  
         // ios-deploy invokes lldb to give us a shell session with the app.
         cmd = exec.Command(
@@ -175,11 +201,11 @@ func run(bin string, args []string) (err error) {
                         w.printBuf()
                         return fmt.Errorf("failed (stage %s): %v", stage, err)
                 case i := <-w.find(str, timeout):
-                       if i >= 0 {
-                               w.clearTo(i + len(str))
-                       } else {
-                               log.Printf("timed out on stage %s, continuing", stage)
+                       if i < 0 {
+                               log.Printf("timed out on stage %q, retrying", stage)
+                               return errRetry
                         }
+                       w.clearTo(i + len(str))
                         return nil
                 }
         }
@@ -192,7 +218,11 @@ func run(bin string, args []string) (err error) {
  
         // Wait for installation and connection.
         if err := waitFor("ios-deploy before run", "(lldb)     connect\r\nProcess 0 connected\r\n", 0); err != nil {
-               return err
+               // Retry if we see a rare and longstanding ios-deploy bug.
+               // https://github.com/phonegap/ios-deploy/issues/11
+               //      Assertion failed: (AMDeviceStartService(device, CFSTR("com.apple.debugserver"), &gdbfd, NULL) == 0)
+               log.Printf("%v, retrying", err)
+               return errRetry
         }
  
         // Script LLDB. Oh dear.
@@ -205,9 +235,21 @@ func run(bin string, args []string) (err error) {
         do(`breakpoint set -n getwd`) // in runtime/cgo/gcc_darwin_arm.go
  
         fmt.Fprintln(lldb, `run`)
-       // Sometimes we don't see "reason = breakpoint", so we time out
-       // and try to continue.
-       if err := waitFor("br getwd", "stop reason = breakpoint", 10*time.Second); err != nil {
+       if err := waitFor("br getwd", "stop reason = breakpoint", 20*time.Second); err != nil {
+               // At this point we see several flaky errors from the iOS
+               // build infrastructure. The most common is never reaching
+               // the breakpoint, which we catch with a timeout. Very
+               // occasionally lldb can produce errors like:
+               //
+               //      Breakpoint 1: no locations (pending).
+               //      WARNING:  Unable to resolve breakpoint to any actual locations.
+               //
+               // As no actual test code has been executed by this point,
+               // we treat all errors as recoverable.
+               if err != errRetry {
+                       log.Printf("%v, retrying", err)
+                       err = errRetry
+               }
                 return err
         }
         if err := waitFor("br getwd prompt", "(lldb)", 0); err != nil {
author	David Crawshaw <crawshaw@golang.org>
	Mon, 30 Mar 2015 12:36:37 +0000 (08:36 -0400)
committer	David Crawshaw <crawshaw@golang.org>
	Mon, 30 Mar 2015 18:11:07 +0000 (18:11 +0000)