MemmoveKnownSize112-4    632.1Mi ± 1%   1288.5Mi ± 0%  +103.85% (p=0.000 n=10)
MemmoveKnownSize128-4    636.1Mi ± 0%   1280.9Mi ± 1%  +101.36% (p=0.000 n=10)
MemmoveKnownSize192-4    645.3Mi ± 0%   1306.9Mi ± 1%  +102.53% (p=0.000 n=10)
MemmoveKnownSize248-4    650.2Mi ± 2%   1312.5Mi ± 1%  +101.87% (p=0.000 n=10)
MemmoveKnownSize256-4    650.7Mi ± 0%   1303.6Mi ± 1%  +100.33% (p=0.000 n=10)
MemmoveKnownSize512-4    658.2Mi ± 1%   1293.9Mi ± 0%   +96.60% (p=0.000 n=10)
MemmoveKnownSize1024-4   662.1Mi ± 0%   1312.6Mi ± 0%   +98.26% (p=0.000 n=10)
Change-Id: I43681ca029880025558b33ddc4295da3947c9b28
Reviewed-on: https://go-review.googlesource.com/c/go/+/700537
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Mark Freeman <markfreeman@google.com>
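The table above compares copy throughput before and after the change (old, new, delta; higher is better). For reference, a minimal sketch of the kind of fixed-size benchmark being measured, assuming a package-level sink to keep the copy live; the real BenchmarkMemmoveKnownSize* functions presumably live in the runtime's memmove tests:

package main

import "testing"

var sink [112]byte

// Copying between fixed-size arrays produces a compile-time-known Move,
// which is exactly what the new lowering applies to.
func BenchmarkMemmoveKnownSize112(b *testing.B) {
	var src [112]byte
	b.SetBytes(int64(len(src)))
	for i := 0; i < b.N; i++ {
		sink = src
	}
}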
}
case ssa.OpRISCV64LoweredMove:
- mov, sz := largestMove(v.AuxInt)
+ dst := v.Args[0].Reg()
+ src := v.Args[1].Reg()
+ if dst == src {
+ break
+ }
- // mov (Rarg1), T2
- // mov T2, (Rarg0)
- // ADD $sz, Rarg0
- // ADD $sz, Rarg1
- // BGEU Rarg2, Rarg0, -4(PC)
+ sa := v.AuxValAndOff()
+ n := sa.Val64()
+ mov, sz := largestMove(sa.Off64())
- p := s.Prog(mov)
- p.From.Type = obj.TYPE_MEM
- p.From.Reg = v.Args[1].Reg()
+ var off int64
+ tmp := int16(riscv.REG_X5)
+ for n >= sz {
+ moveOp(s, mov, dst, src, tmp, off)
+ off += sz
+ n -= sz
+ }
+
+ for i := len(fracMovOps) - 1; i >= 0; i-- {
+ tsz := int64(1 << i)
+ if n < tsz {
+ continue
+ }
+ moveOp(s, fracMovOps[i], dst, src, tmp, off)
+ off += tsz
+ n -= tsz
+ }
+
+ case ssa.OpRISCV64LoweredMoveLoop:
+ dst := v.Args[0].Reg()
+ src := v.Args[1].Reg()
+ if dst == src {
+ break
+ }
+
+ sc := v.AuxValAndOff()
+ n := sc.Val64()
+ mov, sz := largestMove(sc.Off64())
+ chunk := 8 * sz
+
+ if n <= 3*chunk {
+ v.Fatalf("MoveLoop size %d too small, expected more than %d", n, 3*chunk)
+ }
+ tmp := int16(riscv.REG_X5)
+
+ p := s.Prog(riscv.AADD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = n - n%chunk
+ p.Reg = src
p.To.Type = obj.TYPE_REG
- p.To.Reg = riscv.REG_T2
+ p.To.Reg = riscv.REG_X6
- p2 := s.Prog(mov)
- p2.From.Type = obj.TYPE_REG
- p2.From.Reg = riscv.REG_T2
- p2.To.Type = obj.TYPE_MEM
- p2.To.Reg = v.Args[0].Reg()
-
- p3 := s.Prog(riscv.AADD)
- p3.From.Type = obj.TYPE_CONST
- p3.From.Offset = sz
- p3.To.Type = obj.TYPE_REG
- p3.To.Reg = v.Args[0].Reg()
-
- p4 := s.Prog(riscv.AADD)
- p4.From.Type = obj.TYPE_CONST
- p4.From.Offset = sz
- p4.To.Type = obj.TYPE_REG
- p4.To.Reg = v.Args[1].Reg()
+ for i := int64(0); i < 8; i++ {
+ moveOp(s, mov, dst, src, tmp, sz*i)
+ }
- p5 := s.Prog(riscv.ABGEU)
- p5.To.Type = obj.TYPE_BRANCH
- p5.Reg = v.Args[1].Reg()
- p5.From.Type = obj.TYPE_REG
- p5.From.Reg = v.Args[2].Reg()
- p5.To.SetTarget(p)
+ p1 := s.Prog(riscv.AADD)
+ p1.From.Type = obj.TYPE_CONST
+ p1.From.Offset = chunk
+ p1.To.Type = obj.TYPE_REG
+ p1.To.Reg = src
+
+ p2 := s.Prog(riscv.AADD)
+ p2.From.Type = obj.TYPE_CONST
+ p2.From.Offset = chunk
+ p2.To.Type = obj.TYPE_REG
+ p2.To.Reg = dst
+
+ p3 := s.Prog(riscv.ABNE)
+ p3.From.Reg = riscv.REG_X6
+ p3.From.Type = obj.TYPE_REG
+ p3.Reg = src
+ p3.To.Type = obj.TYPE_BRANCH
+ p3.To.SetTarget(p.Link)
+
+ n %= chunk
+
+ var off int64
+ for n >= sz {
+ moveOp(s, mov, dst, src, tmp, off)
+ off += sz
+ n -= sz
+ }
+
+ for i := len(fracMovOps) - 1; i >= 0; i-- {
+ tsz := int64(1 << i)
+ if n < tsz {
+ continue
+ }
+ moveOp(s, fracMovOps[i], dst, src, tmp, off)
+ off += tsz
+ n -= tsz
+ }
case ssa.OpRISCV64LoweredNilCheck:
// Issue a load which will fault if arg is nil.
p.To.Offset = off
return
}
+
+func moveOp(s *ssagen.State, mov obj.As, dst int16, src int16, tmp int16, off int64) {
+ p := s.Prog(mov)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = src
+ p.From.Offset = off
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = tmp
+
+ p1 := s.Prog(mov)
+ p1.From.Type = obj.TYPE_REG
+ p1.From.Reg = tmp
+ p1.To.Type = obj.TYPE_MEM
+ p1.To.Reg = dst
+ p1.To.Offset = off
+}
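The two copy loops in the LoweredMove case above implement a simple width schedule: as many full sz-wide moves as fit, then at most one move per smaller power of two for the tail. A standalone sketch of that schedule, assuming largestMove caps the width at 8 bytes and fracMovOps covers the 1-, 2- and 4-byte moves:

package main

import "fmt"

// schedule mirrors the (width, offset) sequence the unrolled LoweredMove
// emits: whole sz-wide moves first, then at most one move per smaller
// power of two to cover the remainder.
func schedule(n, align int64) [][2]int64 {
	sz := align
	if sz > 8 {
		sz = 8 // assumed largestMove cap
	}
	var pairs [][2]int64
	var off int64
	for n >= sz {
		pairs = append(pairs, [2]int64{sz, off})
		off += sz
		n -= sz
	}
	for tsz := int64(4); tsz >= 1; tsz >>= 1 {
		if n >= tsz {
			pairs = append(pairs, [2]int64{tsz, off})
			off += tsz
			n -= tsz
		}
	}
	return pairs
}

func main() {
	// A 28-byte move of an 8-byte-aligned type: three MOV pairs plus a MOVW pair.
	fmt.Println(schedule(28, 8)) // [[8 0] [8 8] [8 16] [4 24]]
}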
(MOVHstore [4] dst (MOVHload [4] src mem)
(MOVHstore [2] dst (MOVHload [2] src mem)
(MOVHstore dst (MOVHload src mem) mem)))
-(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
- (MOVWstore [8] dst (MOVWload [8] src mem)
- (MOVWstore [4] dst (MOVWload [4] src mem)
- (MOVWstore dst (MOVWload src mem) mem)))
-(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
- (MOVDstore [8] dst (MOVDload [8] src mem)
- (MOVDstore dst (MOVDload src mem) mem))
-(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
- (MOVDstore [16] dst (MOVDload [16] src mem)
- (MOVDstore [8] dst (MOVDload [8] src mem)
- (MOVDstore dst (MOVDload src mem) mem)))
-(Move [32] {t} dst src mem) && t.Alignment()%8 == 0 =>
- (MOVDstore [24] dst (MOVDload [24] src mem)
- (MOVDstore [16] dst (MOVDload [16] src mem)
- (MOVDstore [8] dst (MOVDload [8] src mem)
- (MOVDstore dst (MOVDload src mem) mem))))
-
-// Medium 8-aligned move uses a Duff's device
-// 16 and 128 are magic constants, see runtime/mkduff.go
-(Move [s] {t} dst src mem)
- && s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0
+
+// Small and medium moves are fully unrolled.
+(Move [s] {t} dst src mem) && s > 0 && s <= 3*8*moveSize(t.Alignment(), config)
&& logLargeCopy(v, s) =>
- (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+ (LoweredMove [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
// Generic move uses a loop
-(Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) =>
- (LoweredMove [t.Alignment()]
- dst
- src
- (ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src)
- mem)
+(Move [s] {t} dst src mem) && s > 3*8*moveSize(t.Alignment(), config)
+ && logLargeCopy(v, s) =>
+ (LoweredMoveLoop [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
// Boolean ops; 0=false, 1=true
(AndB ...) => (AND ...)
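A worked example of the cutover between the two Move rules above (assuming an 8-byte-aligned type, for which moveSize returns 8, so the unrolled form covers sizes up to 3*8*8 = 192 bytes):

(Move [64]  {t} dst src mem) => (LoweredMove     [makeValAndOff(64, 8)]  dst src mem)  // 64 <= 192, unrolled
(Move [200] {t} dst src mem) => (LoweredMoveLoop [makeValAndOff(200, 8)] dst src mem)  // 200 > 192, loop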
regCtxt := regNamed["X26"]
callerSave := gpMask | fpMask | regNamed["g"]
+ r5toR6 := regNamed["X5"] | regNamed["X6"]
var (
gpstore = regInfo{inputs: []regMask{gpspsbMask, gpspMask, 0}} // SB in first input so we can load from a global, but not in second to avoid using SB as a temporary register
},
// general unaligned move
- // arg0 = address of dst memory (in X5, changed as side effect)
- // arg1 = address of src memory (in X6, changed as side effect)
- // arg2 = address of the last element of src (can't be X7 as we clobber it before using arg2)
+ // arg0 = address of dst memory
+ // arg1 = address of src memory
+ // arg2 = mem
+ // auxint = size and type alignment
+ // returns mem
+ // mov (offset)(Rarg1), TMP
+ // mov TMP, (offset)(Rarg0)
+ {
+ name: "LoweredMove",
+ aux: "SymValAndOff",
+ symEffect: "Write",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{gpMask &^ regNamed["X5"], gpMask &^ regNamed["X5"]},
+ clobbers: regNamed["X5"],
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // general unaligned move
+ // arg0 = address of dst memory (clobber)
+ // arg1 = address of src memory (clobber)
+ // arg2 = mem
+ // auxint = size and type alignment
- // clobbers X7 as a tmp register.
// returns mem
- // mov (X6), X7
- // mov X7, (X5)
- // ADD $sz, X5
- // ADD $sz, X6
- // BGEU Rarg2, X5, -4(PC)
+ // ADD $(n - n % (8*sz)), Rarg1, X6
+ // loop:
+ //  mov (Rarg1), X5
+ //  mov X5, (Rarg0)
+ //  ... 7 more mov pairs at offsets sz .. 7*sz ...
+ //  ADD $(8*sz), Rarg0
+ //  ADD $(8*sz), Rarg1
+ //  BNE X6, Rarg1, loop
{
- name: "LoweredMove",
- aux: "Int64",
- argLength: 4,
+ name: "LoweredMoveLoop",
+ aux: "SymValAndOff",
+ argLength: 3,
+ symEffect: "Write",
reg: regInfo{
- inputs: []regMask{regNamed["X5"], regNamed["X6"], gpMask &^ regNamed["X7"]},
- clobbers: regNamed["X5"] | regNamed["X6"] | regNamed["X7"],
+ inputs: []regMask{gpMask &^ r5toR6, gpMask &^ r5toR6},
+ clobbers: r5toR6,
+ clobbersArg0: true,
+ clobbersArg1: true,
},
- typ: "Mem",
faultOnNilArg0: true,
faultOnNilArg1: true,
},
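The generated tables below encode these register sets as bit masks. A self-contained check of the arithmetic, with bit positions taken from the generated comments (X5 is decimal 16, X6 is 32):

package main

import "fmt"

func main() {
	const x5, x6 = uint64(1) << 4, uint64(1) << 5 // 16 and 32, per "clobbers: 16 // X5"
	fmt.Println(x5 | x6)                          // 48: the LoweredMoveLoop clobber mask
	fmt.Println(uint64(1006632928) &^ (x5 | x6))  // 1006632896: its input mask, gpMask without X5/X6
}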
OpRISCV64LoweredZero
OpRISCV64LoweredZeroLoop
OpRISCV64LoweredMove
+ OpRISCV64LoweredMoveLoop
OpRISCV64LoweredAtomicLoad8
OpRISCV64LoweredAtomicLoad32
OpRISCV64LoweredAtomicLoad64
},
{
name: "LoweredMove",
- auxType: auxInt64,
- argLen: 4,
+ auxType: auxSymValAndOff,
+ argLen: 3,
faultOnNilArg0: true,
faultOnNilArg1: true,
+ symEffect: SymWrite,
reg: regInfo{
inputs: []inputInfo{
- {0, 16}, // X5
- {1, 32}, // X6
- {2, 1006632880}, // X5 X6 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+ {0, 1006632928}, // X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+ {1, 1006632928}, // X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
},
- clobbers: 112, // X5 X6 X7
+ clobbers: 16, // X5
+ },
+ },
+ {
+ name: "LoweredMoveLoop",
+ auxType: auxSymValAndOff,
+ argLen: 3,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ symEffect: SymWrite,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1006632896}, // X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+ {1, 1006632896}, // X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+ },
+ clobbers: 48, // X5 X6
+ clobbersArg0: true,
+ clobbersArg1: true,
},
},
{
v.AddArg3(dst, v0, v1)
return true
}
- // match: (Move [12] {t} dst src mem)
- // cond: t.Alignment()%4 == 0
- // result: (MOVWstore [8] dst (MOVWload [8] src mem) (MOVWstore [4] dst (MOVWload [4] src mem) (MOVWstore dst (MOVWload src mem) mem)))
- for {
- if auxIntToInt64(v.AuxInt) != 12 {
- break
- }
- t := auxToType(v.Aux)
- dst := v_0
- src := v_1
- mem := v_2
- if !(t.Alignment()%4 == 0) {
- break
- }
- v.reset(OpRISCV64MOVWstore)
- v.AuxInt = int32ToAuxInt(8)
- v0 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
- v0.AuxInt = int32ToAuxInt(8)
- v0.AddArg2(src, mem)
- v1 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
- v1.AuxInt = int32ToAuxInt(4)
- v2 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
- v2.AuxInt = int32ToAuxInt(4)
- v2.AddArg2(src, mem)
- v3 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
- v4 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
- v4.AddArg2(src, mem)
- v3.AddArg3(dst, v4, mem)
- v1.AddArg3(dst, v2, v3)
- v.AddArg3(dst, v0, v1)
- return true
- }
- // match: (Move [16] {t} dst src mem)
- // cond: t.Alignment()%8 == 0
- // result: (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem))
- for {
- if auxIntToInt64(v.AuxInt) != 16 {
- break
- }
- t := auxToType(v.Aux)
- dst := v_0
- src := v_1
- mem := v_2
- if !(t.Alignment()%8 == 0) {
- break
- }
- v.reset(OpRISCV64MOVDstore)
- v.AuxInt = int32ToAuxInt(8)
- v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
- v0.AuxInt = int32ToAuxInt(8)
- v0.AddArg2(src, mem)
- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
- v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
- v2.AddArg2(src, mem)
- v1.AddArg3(dst, v2, mem)
- v.AddArg3(dst, v0, v1)
- return true
- }
- // match: (Move [24] {t} dst src mem)
- // cond: t.Alignment()%8 == 0
- // result: (MOVDstore [16] dst (MOVDload [16] src mem) (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem)))
- for {
- if auxIntToInt64(v.AuxInt) != 24 {
- break
- }
- t := auxToType(v.Aux)
- dst := v_0
- src := v_1
- mem := v_2
- if !(t.Alignment()%8 == 0) {
- break
- }
- v.reset(OpRISCV64MOVDstore)
- v.AuxInt = int32ToAuxInt(16)
- v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
- v0.AuxInt = int32ToAuxInt(16)
- v0.AddArg2(src, mem)
- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
- v1.AuxInt = int32ToAuxInt(8)
- v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
- v2.AuxInt = int32ToAuxInt(8)
- v2.AddArg2(src, mem)
- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
- v4 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
- v4.AddArg2(src, mem)
- v3.AddArg3(dst, v4, mem)
- v1.AddArg3(dst, v2, v3)
- v.AddArg3(dst, v0, v1)
- return true
- }
- // match: (Move [32] {t} dst src mem)
- // cond: t.Alignment()%8 == 0
- // result: (MOVDstore [24] dst (MOVDload [24] src mem) (MOVDstore [16] dst (MOVDload [16] src mem) (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem))))
- for {
- if auxIntToInt64(v.AuxInt) != 32 {
- break
- }
- t := auxToType(v.Aux)
- dst := v_0
- src := v_1
- mem := v_2
- if !(t.Alignment()%8 == 0) {
- break
- }
- v.reset(OpRISCV64MOVDstore)
- v.AuxInt = int32ToAuxInt(24)
- v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
- v0.AuxInt = int32ToAuxInt(24)
- v0.AddArg2(src, mem)
- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
- v1.AuxInt = int32ToAuxInt(16)
- v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
- v2.AuxInt = int32ToAuxInt(16)
- v2.AddArg2(src, mem)
- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
- v3.AuxInt = int32ToAuxInt(8)
- v4 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
- v4.AuxInt = int32ToAuxInt(8)
- v4.AddArg2(src, mem)
- v5 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
- v6 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
- v6.AddArg2(src, mem)
- v5.AddArg3(dst, v6, mem)
- v3.AddArg3(dst, v4, v5)
- v1.AddArg3(dst, v2, v3)
- v.AddArg3(dst, v0, v1)
- return true
- }
// match: (Move [s] {t} dst src mem)
- // cond: s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && logLargeCopy(v, s)
- // result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+ // cond: s > 0 && s <= 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)
+ // result: (LoweredMove [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
t := auxToType(v.Aux)
dst := v_0
src := v_1
mem := v_2
- if !(s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && logLargeCopy(v, s)) {
+ if !(s > 0 && s <= 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)) {
break
}
- v.reset(OpRISCV64DUFFCOPY)
- v.AuxInt = int64ToAuxInt(16 * (128 - s/8))
+ v.reset(OpRISCV64LoweredMove)
+ v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment())))
v.AddArg3(dst, src, mem)
return true
}
// match: (Move [s] {t} dst src mem)
- // cond: (s <= 16 || logLargeCopy(v, s))
- // result: (LoweredMove [t.Alignment()] dst src (ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src) mem)
+ // cond: s > 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)
+ // result: (LoweredMoveLoop [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
t := auxToType(v.Aux)
dst := v_0
src := v_1
mem := v_2
- if !(s <= 16 || logLargeCopy(v, s)) {
+ if !(s > 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)) {
break
}
- v.reset(OpRISCV64LoweredMove)
- v.AuxInt = int64ToAuxInt(t.Alignment())
- v0 := b.NewValue0(v.Pos, OpRISCV64ADDI, src.Type)
- v0.AuxInt = int64ToAuxInt(s - moveSize(t.Alignment(), config))
- v0.AddArg(src)
- v.AddArg4(dst, src, v0, mem)
+ v.reset(OpRISCV64LoweredMoveLoop)
+ v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment())))
+ v.AddArg3(dst, src, mem)
return true
}
return false
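Putting the pieces together: for a 200-byte move of an 8-byte-aligned type, chunk = 8*8 = 64 and 200 = 3*64 + 8, so the loop lowering emits roughly the following (a sketch in the notation of the op comments above, not actual compiler output):

	ADD  $192, Rarg1, X6   // X6 = end of the looped portion of src
loop:
	MOV  (Rarg1), X5       // 8 load/store pairs at offsets 0..56
	MOV  X5, (Rarg0)
	...                    // offsets 8, 16, ..., 56
	ADD  $64, Rarg0
	ADD  $64, Rarg1
	BNE  X6, Rarg1, loop
	MOV  (Rarg1), X5       // 8-byte unrolled tail
	MOV  X5, (Rarg0)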