]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: enable carry chain scheduling for arm64
authoreric fang <eric.fang@arm.com>
Thu, 18 Aug 2022 09:59:35 +0000 (09:59 +0000)
committerEric Fang <eric.fang@arm.com>
Sat, 8 Oct 2022 01:46:00 +0000 (01:46 +0000)
This is a follow up of CL 393656 on arm64.

This CL puts ScoreCarryChainTail before ScoreMemory and after
ScoreReadFlags, so that the scheduling of the carry chain will not
break the scheduling of ScoreVarDef.

Benchmarks:
name                                  old time/op    new time/op    delta
ScalarMult/P256-8                       42.0µs ± 0%    42.0µs ± 0%   -0.13%  (p=0.032 n=5+5)
ScalarMult/P224-8                        135µs ± 0%      96µs ± 0%  -29.04%  (p=0.008 n=5+5)
ScalarMult/P384-8                        573µs ± 1%     355µs ± 0%  -38.05%  (p=0.008 n=5+5)
ScalarMult/P521-8                       1.50ms ± 4%    0.77ms ± 0%  -48.78%  (p=0.008 n=5+5)
MarshalUnmarshal/P256/Uncompressed-8     505ns ± 1%     506ns ± 0%     ~     (p=0.460 n=5+5)
MarshalUnmarshal/P256/Compressed-8      6.75µs ± 0%    6.73µs ± 0%   -0.27%  (p=0.016 n=5+5)
MarshalUnmarshal/P224/Uncompressed-8     927ns ± 0%     818ns ± 0%  -11.76%  (p=0.008 n=5+5)
MarshalUnmarshal/P224/Compressed-8       136µs ± 0%      96µs ± 0%  -29.58%  (p=0.008 n=5+5)
MarshalUnmarshal/P384/Uncompressed-8    1.77µs ± 0%    1.36µs ± 1%  -23.14%  (p=0.008 n=5+5)
MarshalUnmarshal/P384/Compressed-8      56.5µs ± 0%    31.9µs ± 0%  -43.59%  (p=0.016 n=5+4)
MarshalUnmarshal/P521/Uncompressed-8    2.91µs ± 0%    2.03µs ± 1%  -30.32%  (p=0.008 n=5+5)
MarshalUnmarshal/P521/Compressed-8       148µs ± 0%      68µs ± 1%  -54.28%  (p=0.008 n=5+5)

Change-Id: I4bf4e3265d7e1ee85765ff2bf006ca5a794d4979
Reviewed-on: https://go-review.googlesource.com/c/go/+/432275
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Eric Fang <eric.fang@arm.com>

src/cmd/compile/internal/ssa/schedule.go
src/cmd/compile/internal/ssa/schedule_test.go

index ebf84d59b38ec6935444a7e5441a09e7de1f186c..62eaa2ed4590b358a1cba8f669a2f5c93a63d3d1 100644 (file)
@@ -14,10 +14,10 @@ const (
        ScorePhi = iota // towards top of block
        ScoreArg
        ScoreNilCheck
-       ScoreCarryChainTail
        ScoreReadTuple
        ScoreVarDef
        ScoreMemory
+       ScoreCarryChainTail
        ScoreReadFlags
        ScoreDefault
        ScoreFlags
@@ -155,7 +155,7 @@ func schedule(f *Func) {
                                // VARDEF ops are scheduled before the corresponding LEA.
                                score[v.ID] = ScoreMemory
                        case v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN:
-                               if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].Op.isCarry() || v.Type.IsFlags()) {
+                               if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].isCarry() || v.Type.IsFlags()) {
                                        // When the Select pseudo op is being used for a carry or flag from
                                        // a tuple then score it as ScoreFlags so it happens later. This
                                        // prevents the bit from being clobbered before it is used.
@@ -163,8 +163,8 @@ func schedule(f *Func) {
                                } else {
                                        score[v.ID] = ScoreReadTuple
                                }
-                       case v.Op.isCarry():
-                               if w := v.getCarryProducer(); w != nil {
+                       case v.isCarry():
+                               if w := v.getCarryInput(); w != nil && w.Block == b {
                                        // The producing op is not the final user of the carry bit. Its
                                        // current score is one of unscored, Flags, or CarryChainTail.
                                        // These occur if the producer has not been scored, another user
@@ -183,7 +183,7 @@ func schedule(f *Func) {
                                        // one chain to be scheduled, if possible.
                                        score[v.ID] = ScoreCarryChainTail
                                }
-                       case v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags():
+                       case v.isFlagOp():
                                // Schedule flag register generation as late as possible.
                                // This makes sure that we only have one live flags
                                // value at a time.
@@ -192,7 +192,7 @@ func schedule(f *Func) {
                                score[v.ID] = ScoreDefault
                                // If we're reading flags, schedule earlier to keep flag lifetime short.
                                for _, a := range v.Args {
-                                       if a.Type.IsFlags() {
+                                       if a.isFlagOp() {
                                                score[v.ID] = ScoreReadFlags
                                        }
                                }
@@ -263,7 +263,6 @@ func schedule(f *Func) {
                                        }
                                }
                        }
-
                }
 
                // To put things into a priority queue
@@ -287,7 +286,7 @@ func schedule(f *Func) {
 
                        v := heap.Pop(priq).(*Value)
 
-                       if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.Op.isCarry() {
+                       if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.isCarry() {
                                // Add some debugging noise if the chain of carrying ops will not
                                // likely be scheduled without potential carry flag clobbers.
                                if !isCarryChainReady(v, uses) {
@@ -551,42 +550,74 @@ func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value
        return order
 }
 
-// Return whether all dependent carry ops can be scheduled after this.
+// isFlagOp reports if v is an OP with the flag type.
+func (v *Value) isFlagOp() bool {
+       return v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags()
+}
+
+// isCarryChainReady reports whether all dependent carry ops can be scheduled after this.
 func isCarryChainReady(v *Value, uses []int32) bool {
        // A chain can be scheduled in it's entirety if
        // the use count of each dependent op is 1. If none,
        // schedule the first.
        j := 1 // The first op uses[k.ID] == 0. Dependent ops are always >= 1.
-       for k := v; k != nil; k = k.getCarryProducer() {
+       for k := v; k != nil; k = k.getCarryInput() {
                j += int(uses[k.ID]) - 1
        }
        return j == 0
 }
 
-// Return whether op is an operation which produces a carry bit value, but does not consume it.
-func (op Op) isCarryCreator() bool {
-       switch op {
-       case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst:
-               return true
-       }
-       return false
+// isCarryInput reports whether v accepts a carry value as input.
+func (v *Value) isCarryInput() bool {
+       return v.getCarryInput() != nil
 }
 
-// Return whether op consumes or creates a carry a bit value.
-func (op Op) isCarry() bool {
-       switch op {
-       case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero:
-               return true
+// isCarryOutput reports whether v generates a carry as output.
+func (v *Value) isCarryOutput() bool {
+       // special cases for PPC64 which put their carry values in XER instead of flags
+       switch v.Block.Func.Config.arch {
+       case "ppc64", "ppc64le":
+               switch v.Op {
+               case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst:
+                       return true
+               }
+               return false
        }
-       return op.isCarryCreator()
+       return v.isFlagOp() && v.Op != OpSelect1
+}
+
+// isCarryCreator reports whether op is an operation which produces a carry bit value,
+// but does not consume it.
+func (v *Value) isCarryCreator() bool {
+       return v.isCarryOutput() && !v.isCarryInput()
+}
+
+// isCarry reports whether op consumes or creates a carry a bit value.
+func (v *Value) isCarry() bool {
+       return v.isCarryOutput() || v.isCarryInput()
 }
 
-// Return the producing *Value of the carry bit of this op, or nil if none.
-func (v *Value) getCarryProducer() *Value {
-       if v.Op.isCarry() && !v.Op.isCarryCreator() {
-               // PPC64 carry dependencies are conveyed through their final argument.
-               // Likewise, there is always an OpSelect1 between them.
-               return v.Args[len(v.Args)-1].Args[0]
+// getCarryInput returns the producing *Value of the carry bit of this op, or nil if none.
+func (v *Value) getCarryInput() *Value {
+       // special cases for PPC64 which put their carry values in XER instead of flags
+       switch v.Block.Func.Config.arch {
+       case "ppc64", "ppc64le":
+               switch v.Op {
+               case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero:
+                       // PPC64 carry dependencies are conveyed through their final argument.
+                       // Likewise, there is always an OpSelect1 between them.
+                       return v.Args[len(v.Args)-1].Args[0]
+               }
+               return nil
+       }
+       for _, a := range v.Args {
+               if !a.isFlagOp() {
+                       continue
+               }
+               if a.Op == OpSelect1 {
+                       a = a.Args[0]
+               }
+               return a
        }
        return nil
 }
index f7177dd704553851a274def5c5a4071495abd740..6cf5105be1f44ef9d77ed05376be17769893c484 100644 (file)
@@ -99,3 +99,62 @@ func TestStoreOrder(t *testing.T) {
                t.Errorf("store order is wrong: got %v, want v2 v3 v4 after v5", order)
        }
 }
+
+func TestCarryChainOrder(t *testing.T) {
+       // In the function below, there are two carry chains that have no dependencies on each other,
+       // one is A1 -> A1carry -> A1Carryvalue, the other is A2 -> A2carry -> A2Carryvalue. If they
+       // are not scheduled properly, the carry will be clobbered, causing the carry to be regenerated.
+       c := testConfigARM64(t)
+       fun := c.Fun("entry",
+               Bloc("entry",
+                       Valu("mem0", OpInitMem, types.TypeMem, 0, nil),
+                       Valu("x", OpARM64MOVDconst, c.config.Types.UInt64, 5, nil),
+                       Valu("y", OpARM64MOVDconst, c.config.Types.UInt64, 6, nil),
+                       Valu("z", OpARM64MOVDconst, c.config.Types.UInt64, 7, nil),
+                       Valu("A1", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "x", "z"), // x+z, set flags
+                       Valu("A1carry", OpSelect1, types.TypeFlags, 0, nil, "A1"),
+                       Valu("A2", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "y", "z"), // y+z, set flags
+                       Valu("A2carry", OpSelect1, types.TypeFlags, 0, nil, "A2"),
+                       Valu("A1value", OpSelect0, c.config.Types.UInt64, 0, nil, "A1"),
+                       Valu("A1Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A1carry"), // 0+0+A1carry
+                       Valu("A2value", OpSelect0, c.config.Types.UInt64, 0, nil, "A2"),
+                       Valu("A2Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A2carry"), // 0+0+A2carry
+                       Valu("ValueSum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1value", "A2value"),
+                       Valu("CarrySum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1Carryvalue", "A2Carryvalue"),
+                       Valu("Sum", OpARM64AND, c.config.Types.UInt64, 0, nil, "ValueSum", "CarrySum"),
+                       Goto("exit")),
+               Bloc("exit",
+                       Exit("mem0")),
+       )
+
+       CheckFunc(fun.f)
+       schedule(fun.f)
+
+       // The expected order is A1 < A1carry < A1Carryvalue < A2 < A2carry < A2Carryvalue.
+       // There is no dependency between the two carry chains, so it doesn't matter which
+       // comes first and which comes after, but the unsorted position of A1 is before A2,
+       // so A1Carryvalue < A2.
+       var ai, bi, ci, di, ei, fi int
+       for i, v := range fun.f.Blocks[0].Values {
+               switch {
+               case fun.values["A1"] == v:
+                       ai = i
+               case fun.values["A1carry"] == v:
+                       bi = i
+               case fun.values["A1Carryvalue"] == v:
+                       ci = i
+               case fun.values["A2"] == v:
+                       di = i
+               case fun.values["A2carry"] == v:
+                       ei = i
+               case fun.values["A2Carryvalue"] == v:
+                       fi = i
+               }
+       }
+       if !(ai < bi && bi < ci && ci < di && di < ei && ei < fi) {
+               t.Logf("Func: %s", fun.f)
+               t.Errorf("carry chain order is wrong: got %v, want V%d after V%d after V%d after V%d after V%d after V%d,",
+                       fun.f.Blocks[0], fun.values["A1"].ID, fun.values["A1carry"].ID, fun.values["A1Carryvalue"].ID,
+                       fun.values["A2"].ID, fun.values["A2carry"].ID, fun.values["A2Carryvalue"].ID)
+       }
+}