]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: redo writebarrier pass
authorCherry Zhang <cherryyz@google.com>
Wed, 1 Feb 2017 19:27:40 +0000 (14:27 -0500)
committerCherry Zhang <cherryyz@google.com>
Fri, 17 Feb 2017 19:20:25 +0000 (19:20 +0000)
SSA's writebarrier pass requires that WB store ops always be at the
end of a block. If we move write barrier insertion into SSA and
emit normal Store ops when building SSA, this requirement becomes
impractical -- it would create too many blocks for all the Store
ops.

Redo SSA's writebarrier pass to explicitly order values in store
order, so that it no longer needs this requirement.

Updates #17583.
Fixes #19067.

Change-Id: I66e817e526affb7e13517d4245905300a90b7170
Reviewed-on: https://go-review.googlesource.com/36834
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
src/cmd/compile/internal/gc/ssa.go
src/cmd/compile/internal/ssa/nilcheck.go
src/cmd/compile/internal/ssa/schedule.go
src/cmd/compile/internal/ssa/writebarrier.go

index b9b3b80b52785aa5b96544b05aaf7d08b349a123..78a1f6b48c4c003f6dc6d423bee55ffe33b505f9 100644 (file)
@@ -3438,14 +3438,6 @@ func (s *state) insertWBmove(t *Type, left, right *ssa.Value, rightIsVolatile bo
        }
        val.Aux = &ssa.ExternSymbol{Typ: Types[TUINTPTR], Sym: Linksym(typenamesym(t))}
        s.vars[&memVar] = val
-
-       // WB ops will be expanded to branches at writebarrier phase.
-       // To make it easy, we put WB ops at the end of a block, so
-       // that it does not need to split a block into two parts when
-       // expanding WB ops.
-       b := s.f.NewBlock(ssa.BlockPlain)
-       s.endBlock().AddEdgeTo(b)
-       s.startBlock(b)
 }
 
 // insertWBstore inserts the assignment *left = right including a write barrier.
@@ -3466,14 +3458,6 @@ func (s *state) insertWBstore(t *Type, left, right *ssa.Value, skip skipMask) {
        }
        s.storeTypeScalars(t, left, right, skip)
        s.storeTypePtrsWB(t, left, right)
-
-       // WB ops will be expanded to branches at writebarrier phase.
-       // To make it easy, we put WB ops at the end of a block, so
-       // that it does not need to split a block into two parts when
-       // expanding WB ops.
-       b := s.f.NewBlock(ssa.BlockPlain)
-       s.endBlock().AddEdgeTo(b)
-       s.startBlock(b)
 }
 
 // do *left = right for all scalar (non-pointer) parts of t.
index aa6424fe41118917886a77a20d5c3440d37c0595..ea6523d24c47ba207f2c8a39bfb345e56a04cb99 100644 (file)
@@ -227,163 +227,3 @@ func nilcheckelim2(f *Func) {
                // more unnecessary nil checks.  Would fix test/nilptr3_ssa.go:157.
        }
 }
-
-// storeOrder orders values with respect to stores. That is,
-// if v transitively depends on store s, v is ordered after s,
-// otherwise v is ordered before s.
-// Specifically, values are ordered like
-//   store1
-//   NilCheck that depends on store1
-//   other values that depends on store1
-//   store2
-//   NilCheck that depends on store2
-//   other values that depends on store2
-//   ...
-// The order of non-store and non-NilCheck values are undefined
-// (not necessarily dependency order). This should be cheaper
-// than a full scheduling as done in schedule.go.
-// Note that simple dependency order won't work: there is no
-// dependency between NilChecks and values like IsNonNil.
-// Auxiliary data structures are passed in as arguments, so
-// that they can be allocated in the caller and be reused.
-// This function takes care of reset them.
-func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value {
-       // find all stores
-       var stores []*Value // members of values that are store values
-       hasNilCheck := false
-       sset.clear() // sset is the set of stores that are used in other values
-       for _, v := range values {
-               if v.Type.IsMemory() {
-                       stores = append(stores, v)
-                       if v.Op == OpInitMem || v.Op == OpPhi {
-                               continue
-                       }
-                       a := v.Args[len(v.Args)-1]
-                       if v.Op == OpSelect1 {
-                               a = a.Args[len(a.Args)-1]
-                       }
-                       sset.add(a.ID) // record that a is used
-               }
-               if v.Op == OpNilCheck {
-                       hasNilCheck = true
-               }
-       }
-       if len(stores) == 0 || !hasNilCheck {
-               // there is no store or nilcheck, the order does not matter
-               return values
-       }
-
-       f := stores[0].Block.Func
-
-       // find last store, which is the one that is not used by other stores
-       var last *Value
-       for _, v := range stores {
-               if !sset.contains(v.ID) {
-                       if last != nil {
-                               f.Fatalf("two stores live simutaneously: %v and %v", v, last)
-                       }
-                       last = v
-               }
-       }
-
-       // We assign a store number to each value. Store number is the
-       // index of the latest store that this value transitively depends.
-       // The i-th store in the current block gets store number 3*i. A nil
-       // check that depends on the i-th store gets store number 3*i+1.
-       // Other values that depends on the i-th store gets store number 3*i+2.
-       // Special case: 0 -- unassigned, 1 or 2 -- the latest store it depends
-       // is in the previous block (or no store at all, e.g. value is Const).
-       // First we assign the number to all stores by walking back the store chain,
-       // then assign the number to other values in DFS order.
-       count := make([]int32, 3*(len(stores)+1))
-       sset.clear() // reuse sparse set to ensure that a value is pushed to stack only once
-       for n, w := len(stores), last; n > 0; n-- {
-               storeNumber[w.ID] = int32(3 * n)
-               count[3*n]++
-               sset.add(w.ID)
-               if w.Op == OpInitMem || w.Op == OpPhi {
-                       if n != 1 {
-                               f.Fatalf("store order is wrong: there are stores before %v", w)
-                       }
-                       break
-               }
-               if w.Op == OpSelect1 {
-                       w = w.Args[0]
-               }
-               w = w.Args[len(w.Args)-1]
-       }
-       var stack []*Value
-       for _, v := range values {
-               if sset.contains(v.ID) {
-                       // in sset means v is a store, or already pushed to stack, or already assigned a store number
-                       continue
-               }
-               stack = append(stack, v)
-               sset.add(v.ID)
-
-               for len(stack) > 0 {
-                       w := stack[len(stack)-1]
-                       if storeNumber[w.ID] != 0 {
-                               stack = stack[:len(stack)-1]
-                               continue
-                       }
-                       if w.Op == OpPhi {
-                               // Phi value doesn't depend on store in the current block.
-                               // Do this early to avoid dependency cycle.
-                               storeNumber[w.ID] = 2
-                               count[2]++
-                               stack = stack[:len(stack)-1]
-                               continue
-                       }
-
-                       max := int32(0) // latest store dependency
-                       argsdone := true
-                       for _, a := range w.Args {
-                               if a.Block != w.Block {
-                                       continue
-                               }
-                               if !sset.contains(a.ID) {
-                                       stack = append(stack, a)
-                                       sset.add(a.ID)
-                                       argsdone = false
-                                       continue
-                               }
-                               if storeNumber[a.ID]/3 > max {
-                                       max = storeNumber[a.ID] / 3
-                               }
-                       }
-                       if !argsdone {
-                               continue
-                       }
-
-                       n := 3*max + 2
-                       if w.Op == OpNilCheck {
-                               n = 3*max + 1
-                       }
-                       storeNumber[w.ID] = n
-                       count[n]++
-                       stack = stack[:len(stack)-1]
-               }
-       }
-
-       // convert count to prefix sum of counts: count'[i] = sum_{j<=i} count[i]
-       for i := range count {
-               if i == 0 {
-                       continue
-               }
-               count[i] += count[i-1]
-       }
-       if count[len(count)-1] != int32(len(values)) {
-               f.Fatalf("storeOrder: value is missing, total count = %d, values = %v", count[len(count)-1], values)
-       }
-
-       // place values in count-indexed bins, which are in the desired store order
-       order := make([]*Value, len(values))
-       for _, v := range values {
-               s := storeNumber[v.ID]
-               order[count[s-1]] = v
-               count[s-1]++
-       }
-
-       return order
-}
index bd4d3299f28c2c6eefee2bebd2db482e57e77fb0..35edd77b8dd9520503eb51546cff2cd718006714 100644 (file)
@@ -277,3 +277,167 @@ func schedule(f *Func) {
 
        f.scheduled = true
 }
+
+// storeOrder orders values with respect to stores. That is,
+// if v transitively depends on store s, v is ordered after s,
+// otherwise v is ordered before s.
+// Specifically, values are ordered like
+//   store1
+//   NilCheck that depends on store1
+//   other values that depend on store1
+//   store2
+//   NilCheck that depends on store2
+//   other values that depend on store2
+//   ...
+// The order of non-store and non-NilCheck values is undefined
+// (not necessarily dependency order). This should be cheaper
+// than a full scheduling as done above.
+// Note that simple dependency order won't work: there is no
+// dependency between NilChecks and values like IsNonNil.
+// Auxiliary data structures are passed in as arguments, so
+// that they can be allocated in the caller and be reused.
+// This function takes care of resetting them.
+func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value {
+       if len(values) == 0 {
+               return values
+       }
+
+       f := values[0].Block.Func
+
+       // find all stores
+       var stores []*Value // members of values that are store values
+       hasNilCheck := false
+       sset.clear() // sset is the set of stores that are used in other values
+       for _, v := range values {
+               if v.Type.IsMemory() {
+                       stores = append(stores, v)
+                       if v.Op == OpInitMem || v.Op == OpPhi {
+                               continue
+                       }
+                       a := v.Args[len(v.Args)-1]
+                       if v.Op == OpSelect1 {
+                               a = a.Args[len(a.Args)-1]
+                       }
+                       sset.add(a.ID) // record that a is used
+               }
+               if v.Op == OpNilCheck {
+                       hasNilCheck = true
+               }
+       }
+       if len(stores) == 0 || !hasNilCheck && f.pass.name == "nilcheckelim" {
+               // no store (or, in the nilcheckelim pass, no nil check): values are returned in their input order
+               return values
+       }
+
+       // find last store, which is the one that is not used by other stores
+       var last *Value
+       for _, v := range stores {
+               if !sset.contains(v.ID) {
+                       if last != nil {
+                               f.Fatalf("two stores live simutaneously: %v and %v", v, last)
+                       }
+                       last = v
+               }
+       }
+
+       // We assign a store number to each value. Store number is the
+       // index of the latest store that this value transitively depends on.
+       // The i-th store in the current block gets store number 3*i. A nil
+       // check that depends on the i-th store gets store number 3*i+1.
+       // Other values that depend on the i-th store get store number 3*i+2.
+       // Special case: 0 -- unassigned, 1 or 2 -- the latest store it depends
+       // on is in the previous block (or no store at all, e.g. value is Const).
+       // First we assign the number to all stores by walking back the store chain,
+       // then assign the number to other values in DFS order.
+       count := make([]int32, 3*(len(stores)+1))
+       sset.clear() // reuse sparse set to ensure that a value is pushed to stack only once
+       for n, w := len(stores), last; n > 0; n-- {
+               storeNumber[w.ID] = int32(3 * n)
+               count[3*n]++
+               sset.add(w.ID)
+               if w.Op == OpInitMem || w.Op == OpPhi {
+                       if n != 1 {
+                               f.Fatalf("store order is wrong: there are stores before %v", w)
+                       }
+                       break
+               }
+               if w.Op == OpSelect1 {
+                       w = w.Args[0]
+               }
+               w = w.Args[len(w.Args)-1]
+       }
+       var stack []*Value
+       for _, v := range values {
+               if sset.contains(v.ID) {
+                       // in sset means v is a store, or already pushed to stack, or already assigned a store number
+                       continue
+               }
+               stack = append(stack, v)
+               sset.add(v.ID)
+
+               for len(stack) > 0 {
+                       w := stack[len(stack)-1]
+                       if storeNumber[w.ID] != 0 {
+                               stack = stack[:len(stack)-1]
+                               continue
+                       }
+                       if w.Op == OpPhi {
+                               // Phi value doesn't depend on store in the current block.
+                               // Do this early to avoid dependency cycle.
+                               storeNumber[w.ID] = 2
+                               count[2]++
+                               stack = stack[:len(stack)-1]
+                               continue
+                       }
+
+                       max := int32(0) // latest store dependency
+                       argsdone := true
+                       for _, a := range w.Args {
+                               if a.Block != w.Block {
+                                       continue
+                               }
+                               if !sset.contains(a.ID) {
+                                       stack = append(stack, a)
+                                       sset.add(a.ID)
+                                       argsdone = false
+                                       continue
+                               }
+                               if storeNumber[a.ID]/3 > max {
+                                       max = storeNumber[a.ID] / 3
+                               }
+                       }
+                       if !argsdone {
+                               continue
+                       }
+
+                       n := 3*max + 2
+                       if w.Op == OpNilCheck {
+                               n = 3*max + 1
+                       }
+                       storeNumber[w.ID] = n
+                       count[n]++
+                       stack = stack[:len(stack)-1]
+               }
+       }
+
+       // convert count to prefix sum of counts: count'[i] = sum_{j<=i} count[j]
+       for i := range count {
+               if i == 0 {
+                       continue
+               }
+               count[i] += count[i-1]
+       }
+       if count[len(count)-1] != int32(len(values)) {
+               f.Fatalf("storeOrder: value is missing, total count = %d, values = %v", count[len(count)-1], values)
+       }
+
+       // place values in count-indexed bins, which are in the desired store order
+       order := make([]*Value, len(values))
+       for _, v := range values {
+               s := storeNumber[v.ID]
+               order[count[s-1]] = v
+               count[s-1]++
+       }
+
+       return order
+}
index 899e4fadede6d2f09821b7f9b5cf52b40b9d2176..d2539bd3b06d321b12a507652c3f8e0660a91a2c 100644 (file)
@@ -22,21 +22,18 @@ import (
 // and a normal store will be used.
 // A sequence of WB stores for many pointer fields of a single type will
 // be emitted together, with a single branch.
-//
-// Expanding WB ops introduces new control flows, and we would need to
-// split a block into two if there were values after WB ops, which would
-// require scheduling the values. To avoid this complexity, when building
-// SSA, we make sure that WB ops are always at the end of a block. We do
-// this before fuse as it may merge blocks. It also helps to reduce
-// number of blocks as fuse merges blocks introduced in this phase.
 func writebarrier(f *Func) {
-       var sb, sp, wbaddr *Value
+       var sb, sp, wbaddr, const0 *Value
        var writebarrierptr, typedmemmove, typedmemclr *obj.LSym
-       var storeWBs, others []*Value
-       var wbs *sparseSet
-       for _, b := range f.Blocks { // range loop is safe since the blocks we added contain no WB stores
-       valueLoop:
-               for i, v := range b.Values {
+       var stores, after []*Value
+       var sset *sparseSet
+       var storeNumber []int32
+
+       for _, b := range f.Blocks { // range loop is safe since the blocks we added contain no stores to expand
+               // rewrite write barrier for stack writes to ordinary Store/Move/Zero,
+               // record presence of non-stack WB ops.
+               hasStore := false
+               for _, v := range b.Values {
                        switch v.Op {
                        case OpStoreWB, OpMoveWB, OpMoveWBVolatile, OpZeroWB:
                                if IsStackAddr(v.Args[0]) {
@@ -52,187 +49,182 @@ func writebarrier(f *Func) {
                                        }
                                        continue
                                }
+                               hasStore = true
+                               break
+                       }
+               }
+               if !hasStore {
+                       continue
+               }
 
-                               if wbaddr == nil {
-                                       // initalize global values for write barrier test and calls
-                                       // find SB and SP values in entry block
-                                       initln := f.Entry.Pos
-                                       for _, v := range f.Entry.Values {
-                                               if v.Op == OpSB {
-                                                       sb = v
-                                               }
-                                               if v.Op == OpSP {
-                                                       sp = v
-                                               }
-                                       }
-                                       if sb == nil {
-                                               sb = f.Entry.NewValue0(initln, OpSB, f.Config.fe.TypeUintptr())
-                                       }
-                                       if sp == nil {
-                                               sp = f.Entry.NewValue0(initln, OpSP, f.Config.fe.TypeUintptr())
-                                       }
-                                       wbsym := &ExternSymbol{Typ: f.Config.fe.TypeBool(), Sym: f.Config.fe.Syslook("writeBarrier")}
-                                       wbaddr = f.Entry.NewValue1A(initln, OpAddr, f.Config.fe.TypeUInt32().PtrTo(), wbsym, sb)
-                                       writebarrierptr = f.Config.fe.Syslook("writebarrierptr")
-                                       typedmemmove = f.Config.fe.Syslook("typedmemmove")
-                                       typedmemclr = f.Config.fe.Syslook("typedmemclr")
-
-                                       wbs = f.newSparseSet(f.NumValues())
-                                       defer f.retSparseSet(wbs)
+               if wbaddr == nil {
+                       // lazily initialize global values for write barrier test and calls
+                       // find SB and SP values in entry block
+                       initpos := f.Entry.Pos
+                       for _, v := range f.Entry.Values {
+                               if v.Op == OpSB {
+                                       sb = v
+                               }
+                               if v.Op == OpSP {
+                                       sp = v
                                }
+                               if sb != nil && sp != nil {
+                                       break
+                               }
+                       }
+                       if sb == nil {
+                               sb = f.Entry.NewValue0(initpos, OpSB, f.Config.fe.TypeUintptr())
+                       }
+                       if sp == nil {
+                               sp = f.Entry.NewValue0(initpos, OpSP, f.Config.fe.TypeUintptr())
+                       }
+                       wbsym := &ExternSymbol{Typ: f.Config.fe.TypeBool(), Sym: f.Config.fe.Syslook("writeBarrier")}
+                       wbaddr = f.Entry.NewValue1A(initpos, OpAddr, f.Config.fe.TypeUInt32().PtrTo(), wbsym, sb)
+                       writebarrierptr = f.Config.fe.Syslook("writebarrierptr")
+                       typedmemmove = f.Config.fe.Syslook("typedmemmove")
+                       typedmemclr = f.Config.fe.Syslook("typedmemclr")
+                       const0 = f.ConstInt32(initpos, f.Config.fe.TypeUInt32(), 0)
 
-                               pos := v.Pos
+                       // allocate auxiliary data structures for computing store order
+                       sset = f.newSparseSet(f.NumValues())
+                       defer f.retSparseSet(sset)
+                       storeNumber = make([]int32, f.NumValues())
+               }
 
-                               // there may be a sequence of WB stores in the current block. find them.
-                               storeWBs = storeWBs[:0]
-                               others = others[:0]
-                               wbs.clear()
-                               for _, w := range b.Values[i:] {
-                                       if w.Op == OpStoreWB || w.Op == OpMoveWB || w.Op == OpMoveWBVolatile || w.Op == OpZeroWB {
-                                               storeWBs = append(storeWBs, w)
-                                               wbs.add(w.ID)
-                                       } else {
-                                               others = append(others, w)
-                                       }
-                               }
+               // order values in store order
+               b.Values = storeOrder(b.Values, sset, storeNumber)
 
-                               // make sure that no value in this block depends on WB stores
-                               for _, w := range b.Values {
-                                       if w.Op == OpStoreWB || w.Op == OpMoveWB || w.Op == OpMoveWBVolatile || w.Op == OpZeroWB {
-                                               continue
-                                       }
-                                       for _, a := range w.Args {
-                                               if wbs.contains(a.ID) {
-                                                       f.Fatalf("value %v depends on WB store %v in the same block %v", w, a, b)
-                                               }
-                                       }
+       again:
+               // find the start and end of the last contiguous WB store sequence.
+               // a branch will be inserted there. values after it will be moved
+               // to a new block.
+               var last *Value
+               var start, end int
+               values := b.Values
+               for i := len(values) - 1; i >= 0; i-- {
+                       w := values[i]
+                       if w.Op == OpStoreWB || w.Op == OpMoveWB || w.Op == OpMoveWBVolatile || w.Op == OpZeroWB {
+                               if last == nil {
+                                       last = w
+                                       end = i + 1
                                }
-
-                               // find the memory before the WB stores
-                               // this memory is not a WB store but it is used in a WB store.
-                               var mem *Value
-                               for _, w := range storeWBs {
-                                       a := w.Args[len(w.Args)-1]
-                                       if wbs.contains(a.ID) {
-                                               continue
-                                       }
-                                       if mem != nil {
-                                               b.Fatalf("two stores live simultaneously: %s, %s", mem, a)
-                                       }
-                                       mem = a
+                       } else {
+                               if last != nil {
+                                       start = i + 1
+                                       break
                                }
+                       }
+               }
+               stores = append(stores[:0], b.Values[start:end]...) // copy to avoid aliasing
+               after = append(after[:0], b.Values[end:]...)
+               b.Values = b.Values[:start]
 
-                               b.Values = append(b.Values[:i], others...) // move WB ops out of this block
+               // find the memory before the WB stores
+               mem := stores[0].Args[len(stores[0].Args)-1]
+               pos := stores[0].Pos
+               bThen := f.NewBlock(BlockPlain)
+               bElse := f.NewBlock(BlockPlain)
+               bEnd := f.NewBlock(b.Kind)
+               bThen.Pos = pos
+               bElse.Pos = pos
+               bEnd.Pos = b.Pos
+               b.Pos = pos
 
-                               bThen := f.NewBlock(BlockPlain)
-                               bElse := f.NewBlock(BlockPlain)
-                               bEnd := f.NewBlock(b.Kind)
-                               bThen.Pos = pos
-                               bElse.Pos = pos
-                               bEnd.Pos = pos
+               // set up control flow for end block
+               bEnd.SetControl(b.Control)
+               bEnd.Likely = b.Likely
+               for _, e := range b.Succs {
+                       bEnd.Succs = append(bEnd.Succs, e)
+                       e.b.Preds[e.i].b = bEnd
+               }
 
-                               // set up control flow for end block
-                               bEnd.SetControl(b.Control)
-                               bEnd.Likely = b.Likely
-                               for _, e := range b.Succs {
-                                       bEnd.Succs = append(bEnd.Succs, e)
-                                       e.b.Preds[e.i].b = bEnd
-                               }
+               // set up control flow for write barrier test
+               // load word, test word, avoiding partial register write from load byte.
+               flag := b.NewValue2(pos, OpLoad, f.Config.fe.TypeUInt32(), wbaddr, mem)
+               flag = b.NewValue2(pos, OpNeq32, f.Config.fe.TypeBool(), flag, const0)
+               b.Kind = BlockIf
+               b.SetControl(flag)
+               b.Likely = BranchUnlikely
+               b.Succs = b.Succs[:0]
+               b.AddEdgeTo(bThen)
+               b.AddEdgeTo(bElse)
+               bThen.AddEdgeTo(bEnd)
+               bElse.AddEdgeTo(bEnd)
 
-                               // set up control flow for write barrier test
-                               // load word, test word, avoiding partial register write from load byte.
-                               flag := b.NewValue2(pos, OpLoad, f.Config.fe.TypeUInt32(), wbaddr, mem)
-                               const0 := f.ConstInt32(pos, f.Config.fe.TypeUInt32(), 0)
-                               flag = b.NewValue2(pos, OpNeq32, f.Config.fe.TypeBool(), flag, const0)
-                               b.Kind = BlockIf
-                               b.SetControl(flag)
-                               b.Likely = BranchUnlikely
-                               b.Succs = b.Succs[:0]
-                               b.AddEdgeTo(bThen)
-                               b.AddEdgeTo(bElse)
-                               bThen.AddEdgeTo(bEnd)
-                               bElse.AddEdgeTo(bEnd)
+               // for each write barrier store, append write barrier version to bThen
+               // and simple store version to bElse
+               memThen := mem
+               memElse := mem
+               for _, w := range stores {
+                       var val *Value
+                       ptr := w.Args[0]
+                       siz := w.AuxInt
+                       typ := w.Aux // only non-nil for MoveWB, MoveWBVolatile, ZeroWB
+                       pos = w.Pos
 
-                               memThen := mem
-                               memElse := mem
-                               for _, w := range storeWBs {
-                                       var val *Value
-                                       ptr := w.Args[0]
-                                       siz := w.AuxInt
-                                       typ := w.Aux // only non-nil for MoveWB, MoveWBVolatile, ZeroWB
+                       var op Op
+                       var fn *obj.LSym
+                       switch w.Op {
+                       case OpStoreWB:
+                               op = OpStore
+                               fn = writebarrierptr
+                               val = w.Args[1]
+                       case OpMoveWB, OpMoveWBVolatile:
+                               op = OpMove
+                               fn = typedmemmove
+                               val = w.Args[1]
+                       case OpZeroWB:
+                               op = OpZero
+                               fn = typedmemclr
+                       }
 
-                                       var op Op
-                                       var fn *obj.LSym
-                                       switch w.Op {
-                                       case OpStoreWB:
-                                               op = OpStore
-                                               fn = writebarrierptr
-                                               val = w.Args[1]
-                                       case OpMoveWB, OpMoveWBVolatile:
-                                               op = OpMove
-                                               fn = typedmemmove
-                                               val = w.Args[1]
-                                       case OpZeroWB:
-                                               op = OpZero
-                                               fn = typedmemclr
-                                       }
+                       // then block: emit write barrier call
+                       memThen = wbcall(pos, bThen, fn, typ, ptr, val, memThen, sp, sb, w.Op == OpMoveWBVolatile)
 
-                                       // then block: emit write barrier call
-                                       memThen = wbcall(pos, bThen, fn, typ, ptr, val, memThen, sp, sb, w.Op == OpMoveWBVolatile)
+                       // else block: normal store
+                       if op == OpZero {
+                               memElse = bElse.NewValue2I(pos, op, TypeMem, siz, ptr, memElse)
+                       } else {
+                               memElse = bElse.NewValue3I(pos, op, TypeMem, siz, ptr, val, memElse)
+                       }
 
-                                       // else block: normal store
-                                       if op == OpZero {
-                                               memElse = bElse.NewValue2I(pos, op, TypeMem, siz, ptr, memElse)
-                                       } else {
-                                               memElse = bElse.NewValue3I(pos, op, TypeMem, siz, ptr, val, memElse)
-                                       }
-                               }
+                       if f.Config.fe.Debug_wb() {
+                               f.Config.Warnl(pos, "write barrier")
+                       }
+               }
 
-                               // merge memory
-                               // Splice memory Phi into the last memory of the original sequence,
-                               // which may be used in subsequent blocks. Other memories in the
-                               // sequence must be dead after this block since there can be only
-                               // one memory live.
-                               last := storeWBs[0]
-                               if len(storeWBs) > 1 {
-                                       // find the last store
-                                       last = nil
-                                       wbs.clear() // we reuse wbs to record WB stores that is used in another WB store
-                                       for _, w := range storeWBs {
-                                               wbs.add(w.Args[len(w.Args)-1].ID)
-                                       }
-                                       for _, w := range storeWBs {
-                                               if wbs.contains(w.ID) {
-                                                       continue
-                                               }
-                                               if last != nil {
-                                                       b.Fatalf("two stores live simultaneously: %s, %s", last, w)
-                                               }
-                                               last = w
-                                       }
-                               }
-                               bEnd.Values = append(bEnd.Values, last)
-                               last.Block = bEnd
-                               last.reset(OpPhi)
-                               last.Type = TypeMem
-                               last.AddArg(memThen)
-                               last.AddArg(memElse)
-                               for _, w := range storeWBs {
-                                       if w != last {
-                                               w.resetArgs()
-                                       }
-                               }
-                               for _, w := range storeWBs {
-                                       if w != last {
-                                               f.freeValue(w)
-                                       }
-                               }
+               // merge memory
+               // Splice memory Phi into the last memory of the original sequence,
+               // which may be used in subsequent blocks. Other memories in the
+               // sequence must be dead after this block since there can be only
+               // one memory live.
+               bEnd.Values = append(bEnd.Values, last)
+               last.Block = bEnd
+               last.reset(OpPhi)
+               last.Type = TypeMem
+               last.AddArg(memThen)
+               last.AddArg(memElse)
+               for _, w := range stores {
+                       if w != last {
+                               w.resetArgs()
+                       }
+               }
+               for _, w := range stores {
+                       if w != last {
+                               f.freeValue(w)
+                       }
+               }
 
-                               if f.Config.fe.Debug_wb() {
-                                       f.Config.Warnl(pos, "write barrier")
-                               }
+               // put values after the store sequence into the end block
+               bEnd.Values = append(bEnd.Values, after...)
+               for _, w := range after {
+                       w.Block = bEnd
+               }
 
-                               break valueLoop
+               // if we have more stores in this block, do this block again
+               for _, w := range b.Values {
+                       if w.Op == OpStoreWB || w.Op == OpMoveWB || w.Op == OpMoveWBVolatile || w.Op == OpZeroWB {
+                               goto again
                        }
                }
        }