]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: enable carry chain scheduling for arm64
authoreric fang <eric.fang@arm.com>
Thu, 18 Aug 2022 09:59:35 +0000 (09:59 +0000)
committerEric Fang <eric.fang@arm.com>
Tue, 20 Sep 2022 01:15:17 +0000 (01:15 +0000)
This is a follow up of CL 393656 on arm64.

Benchmarks:
name                                  old time/op    new time/op    delta
ScalarMult/P256-8                       42.0µs ± 0%    42.0µs ± 0%   -0.13%  (p=0.032 n=5+5)
ScalarMult/P224-8                        135µs ± 0%      96µs ± 0%  -29.04%  (p=0.008 n=5+5)
ScalarMult/P384-8                        573µs ± 1%     355µs ± 0%  -38.05%  (p=0.008 n=5+5)
ScalarMult/P521-8                       1.50ms ± 4%    0.77ms ± 0%  -48.78%  (p=0.008 n=5+5)
MarshalUnmarshal/P256/Uncompressed-8     505ns ± 1%     506ns ± 0%     ~     (p=0.460 n=5+5)
MarshalUnmarshal/P256/Compressed-8      6.75µs ± 0%    6.73µs ± 0%   -0.27%  (p=0.016 n=5+5)
MarshalUnmarshal/P224/Uncompressed-8     927ns ± 0%     818ns ± 0%  -11.76%  (p=0.008 n=5+5)
MarshalUnmarshal/P224/Compressed-8       136µs ± 0%      96µs ± 0%  -29.58%  (p=0.008 n=5+5)
MarshalUnmarshal/P384/Uncompressed-8    1.77µs ± 0%    1.36µs ± 1%  -23.14%  (p=0.008 n=5+5)
MarshalUnmarshal/P384/Compressed-8      56.5µs ± 0%    31.9µs ± 0%  -43.59%  (p=0.016 n=5+4)
MarshalUnmarshal/P521/Uncompressed-8    2.91µs ± 0%    2.03µs ± 1%  -30.32%  (p=0.008 n=5+5)
MarshalUnmarshal/P521/Compressed-8       148µs ± 0%      68µs ± 1%  -54.28%  (p=0.008 n=5+5)

Change-Id: I33170360eb8279b998e3c559f7136717fe32e07d
Reviewed-on: https://go-review.googlesource.com/c/go/+/424907
Run-TryBot: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Eric Fang <eric.fang@arm.com>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/ssa/schedule.go
src/cmd/compile/internal/ssa/schedule_test.go

index ebf84d59b38ec6935444a7e5441a09e7de1f186c..092ce7a815623b59173c19a05ae69f1c9d7b609c 100644 (file)
@@ -155,7 +155,7 @@ func schedule(f *Func) {
                                // VARDEF ops are scheduled before the corresponding LEA.
                                score[v.ID] = ScoreMemory
                        case v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN:
-                               if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].Op.isCarry() || v.Type.IsFlags()) {
+                               if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].isCarry() || v.Type.IsFlags()) {
                                        // When the Select pseudo op is being used for a carry or flag from
                                        // a tuple then score it as ScoreFlags so it happens later. This
                                        // prevents the bit from being clobbered before it is used.
@@ -163,8 +163,8 @@ func schedule(f *Func) {
                                } else {
                                        score[v.ID] = ScoreReadTuple
                                }
-                       case v.Op.isCarry():
-                               if w := v.getCarryProducer(); w != nil {
+                       case v.isCarry():
+                               if w := v.getCarryInput(); w != nil && w.Block == b {
                                        // The producing op is not the final user of the carry bit. Its
                                        // current score is one of unscored, Flags, or CarryChainTail.
                                        // These occur if the producer has not been scored, another user
@@ -183,7 +183,7 @@ func schedule(f *Func) {
                                        // one chain to be scheduled, if possible.
                                        score[v.ID] = ScoreCarryChainTail
                                }
-                       case v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags():
+                       case v.isFlagOp():
                                // Schedule flag register generation as late as possible.
                                // This makes sure that we only have one live flags
                                // value at a time.
@@ -192,7 +192,7 @@ func schedule(f *Func) {
                                score[v.ID] = ScoreDefault
                                // If we're reading flags, schedule earlier to keep flag lifetime short.
                                for _, a := range v.Args {
-                                       if a.Type.IsFlags() {
+                                       if a.isFlagOp() {
                                                score[v.ID] = ScoreReadFlags
                                        }
                                }
@@ -263,7 +263,6 @@ func schedule(f *Func) {
                                        }
                                }
                        }
-
                }
 
                // To put things into a priority queue
@@ -287,7 +286,7 @@ func schedule(f *Func) {
 
                        v := heap.Pop(priq).(*Value)
 
-                       if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.Op.isCarry() {
+                       if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.isCarry() {
                                // Add some debugging noise if the chain of carrying ops will not
                                // likely be scheduled without potential carry flag clobbers.
                                if !isCarryChainReady(v, uses) {
@@ -551,39 +550,66 @@ func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value
        return order
 }
 
-// Return whether all dependent carry ops can be scheduled after this.
+// isFlagOp reports whether v's type is flags, or a tuple whose second field is flags.
+func (v *Value) isFlagOp() bool {
+       return v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags()
+}
+
+// isCarryChainReady reports whether all dependent carry ops can be scheduled after this.
 func isCarryChainReady(v *Value, uses []int32) bool {
        // A chain can be scheduled in its entirety if
        // the use count of each dependent op is 1. If none,
        // schedule the first.
        j := 1 // The first op uses[k.ID] == 0. Dependent ops are always >= 1.
-       for k := v; k != nil; k = k.getCarryProducer() {
+       for k := v; k != nil; k = k.getCarryInput() {
                j += int(uses[k.ID]) - 1
        }
        return j == 0
 }
 
-// Return whether op is an operation which produces a carry bit value, but does not consume it.
-func (op Op) isCarryCreator() bool {
-       switch op {
+// isCarryInput reports whether v accepts a carry value as input.
+func (v *Value) isCarryInput() bool {
+       return v.getCarryInput() != nil
+}
+
+// isCarryOutput reports whether v generates a carry as output.
+func (v *Value) isCarryOutput() bool {
+       if v.isFlagOp() && v.Op != OpSelect1 {
+               return true
+       }
+       // special cases for PPC64 which put their carry values in XER instead of flags
+       switch v.Op {
        case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst:
                return true
        }
        return false
 }
 
-// Return whether op consumes or creates a carry a bit value.
-func (op Op) isCarry() bool {
-       switch op {
-       case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero:
-               return true
-       }
-       return op.isCarryCreator()
+// isCarryCreator reports whether v is an operation which produces a carry bit value,
+// but does not consume it.
+func (v *Value) isCarryCreator() bool {
+       return v.isCarryOutput() && !v.isCarryInput()
 }
 
-// Return the producing *Value of the carry bit of this op, or nil if none.
-func (v *Value) getCarryProducer() *Value {
-       if v.Op.isCarry() && !v.Op.isCarryCreator() {
+// isCarry reports whether v consumes or creates a carry bit value.
+func (v *Value) isCarry() bool {
+       return v.isCarryOutput() || v.isCarryInput()
+}
+
+// getCarryInput returns the producing *Value of the carry bit of this op, or nil if none.
+func (v *Value) getCarryInput() *Value {
+       for _, a := range v.Args {
+               if !a.isFlagOp() {
+                       continue
+               }
+               if a.Op == OpSelect1 {
+                       a = a.Args[0]
+               }
+               return a
+       }
+       // special cases for PPC64 which put their carry values in XER instead of flags
+       switch v.Op {
+       case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero:
                // PPC64 carry dependencies are conveyed through their final argument.
                // Likewise, there is always an OpSelect1 between them.
                return v.Args[len(v.Args)-1].Args[0]
index f7177dd704553851a274def5c5a4071495abd740..6cf5105be1f44ef9d77ed05376be17769893c484 100644 (file)
@@ -99,3 +99,62 @@ func TestStoreOrder(t *testing.T) {
                t.Errorf("store order is wrong: got %v, want v2 v3 v4 after v5", order)
        }
 }
+
+func TestCarryChainOrder(t *testing.T) {
+       // In the function below, there are two carry chains that have no dependencies on each other,
+       // one is A1 -> A1carry -> A1Carryvalue, the other is A2 -> A2carry -> A2Carryvalue. If they
+       // are not scheduled properly, the carry will be clobbered, causing the carry to be regenerated.
+       c := testConfigARM64(t)
+       fun := c.Fun("entry",
+               Bloc("entry",
+                       Valu("mem0", OpInitMem, types.TypeMem, 0, nil),
+                       Valu("x", OpARM64MOVDconst, c.config.Types.UInt64, 5, nil),
+                       Valu("y", OpARM64MOVDconst, c.config.Types.UInt64, 6, nil),
+                       Valu("z", OpARM64MOVDconst, c.config.Types.UInt64, 7, nil),
+                       Valu("A1", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "x", "z"), // x+z, set flags
+                       Valu("A1carry", OpSelect1, types.TypeFlags, 0, nil, "A1"),
+                       Valu("A2", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "y", "z"), // y+z, set flags
+                       Valu("A2carry", OpSelect1, types.TypeFlags, 0, nil, "A2"),
+                       Valu("A1value", OpSelect0, c.config.Types.UInt64, 0, nil, "A1"),
+                       Valu("A1Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A1carry"), // 0+0+A1carry
+                       Valu("A2value", OpSelect0, c.config.Types.UInt64, 0, nil, "A2"),
+                       Valu("A2Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A2carry"), // 0+0+A2carry
+                       Valu("ValueSum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1value", "A2value"),
+                       Valu("CarrySum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1Carryvalue", "A2Carryvalue"),
+                       Valu("Sum", OpARM64AND, c.config.Types.UInt64, 0, nil, "ValueSum", "CarrySum"),
+                       Goto("exit")),
+               Bloc("exit",
+                       Exit("mem0")),
+       )
+
+       CheckFunc(fun.f)
+       schedule(fun.f)
+
+       // The expected order is A1 < A1carry < A1Carryvalue < A2 < A2carry < A2Carryvalue.
+       // There is no dependency between the two carry chains, so it doesn't matter which
+       // comes first and which comes after, but the unsorted position of A1 is before A2,
+       // so A1Carryvalue < A2.
+       var ai, bi, ci, di, ei, fi int
+       for i, v := range fun.f.Blocks[0].Values {
+               switch {
+               case fun.values["A1"] == v:
+                       ai = i
+               case fun.values["A1carry"] == v:
+                       bi = i
+               case fun.values["A1Carryvalue"] == v:
+                       ci = i
+               case fun.values["A2"] == v:
+                       di = i
+               case fun.values["A2carry"] == v:
+                       ei = i
+               case fun.values["A2Carryvalue"] == v:
+                       fi = i
+               }
+       }
+       if !(ai < bi && bi < ci && ci < di && di < ei && ei < fi) {
+               t.Logf("Func: %s", fun.f)
+               t.Errorf("carry chain order is wrong: got %v, want V%d after V%d after V%d after V%d after V%d after V%d,",
+                       fun.f.Blocks[0], fun.values["A1"].ID, fun.values["A1carry"].ID, fun.values["A1Carryvalue"].ID,
+                       fun.values["A2"].ID, fun.values["A2carry"].ID, fun.values["A2Carryvalue"].ID)
+       }
+}