Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: memory clearing optimization for arm64
authorWei Xiao <wei.xiao@arm.com>
Thu, 27 Jul 2017 01:55:03 +0000 (01:55 +0000)
committerCherry Zhang <cherryyz@google.com>
Fri, 25 Aug 2017 20:09:06 +0000 (20:09 +0000)
Use "STP (ZR, ZR), O(R)" instead of "MOVD ZR, O(R)" to implement memory clearing.
Also improve assembler support for STP/LDP.
Results (A57@2GHzx8):

benchmark                   old ns/op     new ns/op     delta
BenchmarkClearFat8-8        1.00          1.00          +0.00%
BenchmarkClearFat12-8       1.01          1.01          +0.00%
BenchmarkClearFat16-8       1.01          1.01          +0.00%
BenchmarkClearFat24-8       1.52          1.52          +0.00%
BenchmarkClearFat32-8       3.00          2.02          -32.67%
BenchmarkClearFat40-8       3.50          2.52          -28.00%
BenchmarkClearFat48-8       3.50          3.03          -13.43%
BenchmarkClearFat56-8       4.00          3.50          -12.50%
BenchmarkClearFat64-8       4.25          4.00          -5.88%
BenchmarkClearFat128-8      8.01          8.01          +0.00%
BenchmarkClearFat256-8      16.1          16.0          -0.62%
BenchmarkClearFat512-8      32.1          32.0          -0.31%
BenchmarkClearFat1024-8     64.1          64.1          +0.00%

Change-Id: Ie5f5eac271ff685884775005825f206167a5c146
Reviewed-on: https://go-review.googlesource.com/55610
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
src/cmd/compile/internal/arm64/ggen.go
src/cmd/compile/internal/arm64/ssa.go
src/cmd/compile/internal/ssa/gen/ARM64.rules
src/cmd/compile/internal/ssa/gen/ARM64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteARM64.go
src/cmd/internal/obj/arm64/a.out.go
src/cmd/internal/obj/arm64/anames7.go
src/cmd/internal/obj/arm64/asm7.go
src/runtime/duff_arm64.s
src/runtime/mkduff.go

index 52a8e3f3e3764bbb8b05065af772ebb61c86e7f2..f7b3851398f51c1578cfc62f5664dfdc814b6ced 100644 (file)
@@ -31,13 +31,18 @@ func zerorange(pp *gc.Progs, p *obj.Prog, off, cnt int64, _ *uint32) *obj.Prog {
                        p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_REG, arm64.REGZERO, 0, obj.TYPE_MEM, arm64.REGSP, 8+off+i)
                }
        } else if cnt <= int64(128*gc.Widthptr) && !darwin { // darwin ld64 cannot handle BR26 reloc with non-zero addend
+               if cnt%(2*int64(gc.Widthptr)) != 0 {
+                       p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_REG, arm64.REGZERO, 0, obj.TYPE_MEM, arm64.REGSP, 8+off)
+                       off += int64(gc.Widthptr)
+                       cnt -= int64(gc.Widthptr)
+               }
                p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_REG, arm64.REGSP, 0, obj.TYPE_REG, arm64.REGRT1, 0)
-               p = pp.Appendpp(p, arm64.AADD, obj.TYPE_CONST, 0, 8+off-8, obj.TYPE_REG, arm64.REGRT1, 0)
+               p = pp.Appendpp(p, arm64.AADD, obj.TYPE_CONST, 0, 8+off, obj.TYPE_REG, arm64.REGRT1, 0)
                p.Reg = arm64.REGRT1
                p = pp.Appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_MEM, 0, 0)
                p.To.Name = obj.NAME_EXTERN
                p.To.Sym = gc.Duffzero
-               p.To.Offset = 4 * (128 - cnt/int64(gc.Widthptr))
+               p.To.Offset = 4 * (64 - cnt/(2*int64(gc.Widthptr)))
        } else {
                p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_CONST, 0, 8+off-8, obj.TYPE_REG, arm64.REGTMP, 0)
                p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_REG, arm64.REGSP, 0, obj.TYPE_REG, arm64.REGRT1, 0)
index 18cd01cd1596e724c62139cc8729ba57ecc8ec08..24a8fb83a4661eb674b257c7baf7b967939ab31d 100644 (file)
@@ -324,6 +324,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.To.Type = obj.TYPE_MEM
                p.To.Reg = v.Args[0].Reg()
                gc.AddAux(&p.To, v)
+       case ssa.OpARM64STP:
+               p := s.Prog(v.Op.Asm())
+               p.From.Type = obj.TYPE_REGREG
+               p.From.Reg = v.Args[1].Reg()
+               p.From.Offset = int64(v.Args[2].Reg())
+               p.To.Type = obj.TYPE_MEM
+               p.To.Reg = v.Args[0].Reg()
+               gc.AddAux(&p.To, v)
        case ssa.OpARM64MOVBstorezero,
                ssa.OpARM64MOVHstorezero,
                ssa.OpARM64MOVWstorezero,
@@ -334,6 +342,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.To.Type = obj.TYPE_MEM
                p.To.Reg = v.Args[0].Reg()
                gc.AddAux(&p.To, v)
+       case ssa.OpARM64MOVQstorezero:
+               p := s.Prog(v.Op.Asm())
+               p.From.Type = obj.TYPE_REGREG
+               p.From.Reg = arm64.REGZERO
+               p.From.Offset = int64(arm64.REGZERO)
+               p.To.Type = obj.TYPE_MEM
+               p.To.Reg = v.Args[0].Reg()
+               gc.AddAux(&p.To, v)
        case ssa.OpARM64LoweredAtomicExchange64,
                ssa.OpARM64LoweredAtomicExchange32:
                // LDAXR        (Rarg0), Rout
@@ -559,30 +575,25 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.To.Type = obj.TYPE_REG
                p.To.Reg = v.Reg()
        case ssa.OpARM64DUFFZERO:
-               // runtime.duffzero expects start address - 8 in R16
-               p := s.Prog(arm64.ASUB)
-               p.From.Type = obj.TYPE_CONST
-               p.From.Offset = 8
-               p.Reg = v.Args[0].Reg()
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = arm64.REG_R16
-               p = s.Prog(obj.ADUFFZERO)
+               // runtime.duffzero expects start address in R16
+               p := s.Prog(obj.ADUFFZERO)
                p.To.Type = obj.TYPE_MEM
                p.To.Name = obj.NAME_EXTERN
                p.To.Sym = gc.Duffzero
                p.To.Offset = v.AuxInt
        case ssa.OpARM64LoweredZero:
-               // MOVD.P       ZR, 8(R16)
+               // STP.P        (ZR,ZR), 16(R16)
                // CMP  Rarg1, R16
                // BLE  -2(PC)
-               // arg1 is the address of the last element to zero
-               p := s.Prog(arm64.AMOVD)
+               // arg1 is the address of the last 16-byte unit to zero
+               p := s.Prog(arm64.ASTP)
                p.Scond = arm64.C_XPOST
-               p.From.Type = obj.TYPE_REG
+               p.From.Type = obj.TYPE_REGREG
                p.From.Reg = arm64.REGZERO
+               p.From.Offset = int64(arm64.REGZERO)
                p.To.Type = obj.TYPE_MEM
                p.To.Reg = arm64.REG_R16
-               p.To.Offset = 8
+               p.To.Offset = 16
                p2 := s.Prog(arm64.ACMP)
                p2.From.Type = obj.TYPE_REG
                p2.From.Reg = v.Args[1].Reg()
index f13541068ee8b56f89f8ad7fa59d85a5f630d85f..44299af9209f2eee74b0dd4268f387dc2cf354a1 100644 (file)
        (MOVBstore [6] ptr (MOVDconst [0])
                (MOVHstore [4] ptr (MOVDconst [0])
                        (MOVWstore ptr (MOVDconst [0]) mem)))
+(Zero [9] ptr mem) ->
+       (MOVBstore [8] ptr (MOVDconst [0])
+               (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [10] ptr mem) ->
+       (MOVHstore [8] ptr (MOVDconst [0])
+               (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [11] ptr mem) ->
+       (MOVBstore [10] ptr (MOVDconst [0])
+               (MOVHstore [8] ptr (MOVDconst [0])
+                       (MOVDstore ptr (MOVDconst [0]) mem)))
 (Zero [12] ptr mem) ->
        (MOVWstore [8] ptr (MOVDconst [0])
                (MOVDstore ptr (MOVDconst [0]) mem))
-(Zero [16] ptr mem) ->
-       (MOVDstore [8] ptr (MOVDconst [0])
-               (MOVDstore ptr (MOVDconst [0]) mem))
-(Zero [24] ptr mem) ->
-       (MOVDstore [16] ptr (MOVDconst [0])
-               (MOVDstore [8] ptr (MOVDconst [0])
+(Zero [13] ptr mem) ->
+       (MOVBstore [12] ptr (MOVDconst [0])
+               (MOVWstore [8] ptr (MOVDconst [0])
                        (MOVDstore ptr (MOVDconst [0]) mem)))
+(Zero [14] ptr mem) ->
+       (MOVHstore [12] ptr (MOVDconst [0])
+               (MOVWstore [8] ptr (MOVDconst [0])
+                       (MOVDstore ptr (MOVDconst [0]) mem)))
+(Zero [15] ptr mem) ->
+       (MOVBstore [14] ptr (MOVDconst [0])
+               (MOVHstore [12] ptr (MOVDconst [0])
+                       (MOVWstore [8] ptr (MOVDconst [0])
+                               (MOVDstore ptr (MOVDconst [0]) mem))))
+(Zero [16] ptr mem) ->
+       (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)
+
+(Zero [32] ptr mem) ->
+       (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
+               (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
+
+(Zero [48] ptr mem) ->
+       (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
+               (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
+                       (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
+
+(Zero [64] ptr mem) ->
+       (STP [48] ptr (MOVDconst [0]) (MOVDconst [0])
+               (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
+                       (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
+                               (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
 
 // strip off fractional word zeroing
-(Zero [s] ptr mem) && s%8 != 0 && s > 8 ->
-       (Zero [s%8]
-               (OffPtr <ptr.Type> ptr [s-s%8])
-               (Zero [s-s%8] ptr mem))
+(Zero [s] ptr mem) && s%16 != 0 && s > 16 ->
+       (Zero [s-s%16]
+               (OffPtr <ptr.Type> ptr [s%16])
+               (Zero [s%16] ptr mem))
 
 // medium zeroing uses a duff device
-// 4, 8, and 128 are magic constants, see runtime/mkduff.go
+// 4, 16, and 64 are magic constants, see runtime/mkduff.go
 (Zero [s] ptr mem)
-       && s%8 == 0 && s > 24 && s <= 8*128
+       && s%16 == 0 && s > 64 && s <= 16*64
        && !config.noDuffDevice ->
-       (DUFFZERO [4 * (128 - int64(s/8))] ptr mem)
+       (DUFFZERO [4 * (64 - int64(s/16))] ptr mem)
 
 // large zeroing uses a loop
 (Zero [s] ptr mem)
-       && s%8 == 0 && (s > 8*128 || config.noDuffDevice) ->
+       && s%16 == 0 && (s > 16*64 || config.noDuffDevice) ->
        (LoweredZero
                ptr
-               (ADDconst <ptr.Type> [s-8] ptr)
+               (ADDconst <ptr.Type> [s-16] ptr)
                mem)
 
 // moves
 (MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(off1+off2)
        && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
        (MOVDstore [off1+off2] {sym} ptr val mem)
+(STP [off1] {sym} (ADDconst [off2] ptr) val1 val2 mem) && is32Bit(off1+off2)
+       && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
+       (STP [off1+off2] {sym} ptr val1 val2 mem)
 (FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(off1+off2)
        && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
        (FMOVSstore [off1+off2] {sym} ptr val mem)
 (MOVDstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(off1+off2)
        && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
        (MOVDstorezero [off1+off2] {sym} ptr mem)
+(MOVQstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(off1+off2)
+       && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
+       (MOVQstorezero [off1+off2] {sym} ptr mem)
 
 (MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
        && canMergeSym(sym1,sym2) && is32Bit(off1+off2)
        && canMergeSym(sym1,sym2) && is32Bit(off1+off2)
        && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
        (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(STP [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val1 val2 mem)
+       && canMergeSym(sym1,sym2) && is32Bit(off1+off2)
+       && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
+       (STP [off1+off2] {mergeSym(sym1,sym2)} ptr val1 val2 mem)
 (FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
        && canMergeSym(sym1,sym2) && is32Bit(off1+off2)
        && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
        && canMergeSym(sym1,sym2) && is32Bit(off1+off2)
        && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
        (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVQstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+       && canMergeSym(sym1,sym2) && is32Bit(off1+off2)
+       && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
+       (MOVQstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
 
 // store zero
 (MOVBstore [off] {sym} ptr (MOVDconst [0]) mem) -> (MOVBstorezero [off] {sym} ptr mem)
 (MOVHstore [off] {sym} ptr (MOVDconst [0]) mem) -> (MOVHstorezero [off] {sym} ptr mem)
 (MOVWstore [off] {sym} ptr (MOVDconst [0]) mem) -> (MOVWstorezero [off] {sym} ptr mem)
 (MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) -> (MOVDstorezero [off] {sym} ptr mem)
+(STP [off] {sym} ptr (MOVDconst [0]) (MOVDconst [0]) mem) -> (MOVQstorezero [off] {sym} ptr mem)
 
 // replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
 // these seem to have bad interaction with other rules, resulting in slower code
index 3b3d494c54f660727dd573bc3998778d77a97b17..1cac97f3ae5335736698943c28e4e91eb15b9c56 100644 (file)
@@ -144,6 +144,7 @@ func init() {
                gpload    = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
                gpstore   = regInfo{inputs: []regMask{gpspsbg, gpg}}
                gpstore0  = regInfo{inputs: []regMask{gpspsbg}}
+               gpstore2  = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
                gpxchg    = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
                gpcas     = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
                fp01      = regInfo{inputs: nil, outputs: []regMask{fp}}
@@ -275,13 +276,15 @@ func init() {
                {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},   // store 2 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
                {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},   // store 4 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
                {name: "MOVDstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},   // store 8 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+               {name: "STP", argLength: 4, reg: gpstore2, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},         // store 16 bytes of arg1 and arg2 to arg0 + auxInt + aux.  arg3=mem.
                {name: "FMOVSstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVS", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
                {name: "FMOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
 
                {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux.  arg1=mem.
                {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux.  arg1=mem.
                {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux.  arg1=mem.
-               {name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux.  ar12=mem.
+               {name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux.  arg1=mem.
+               {name: "MOVQstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},  // store 16 bytes of zero to arg0 + auxInt + aux.  arg1=mem.
 
                // conversions
                {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"},   // move from arg0, sign-extended from byte
@@ -347,7 +350,7 @@ func init() {
                        aux:       "Int64",
                        argLength: 2,
                        reg: regInfo{
-                               inputs:   []regMask{gp},
+                               inputs:   []regMask{buildReg("R16")},
                                clobbers: buildReg("R16 R30"),
                        },
                        faultOnNilArg0: true,
@@ -355,14 +358,14 @@ func init() {
 
                // large zeroing
                // arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
-               // arg1 = address of the last element to zero
+               // arg1 = address of the last 16-byte unit to zero
                // arg2 = mem
                // returns mem
-               //      MOVD.P  ZR, 8(R16)
+               //      STP.P   (ZR,ZR), 16(R16)
                //      CMP     Rarg1, R16
                //      BLE     -2(PC)
                // Note: the-end-of-the-memory may be not a valid pointer. it's a problem if it is spilled.
-               // the-end-of-the-memory - 8 is with the area to zero, ok to spill.
+               // the-end-of-the-memory - 16 is with the area to zero, ok to spill.
                {
                        name:      "LoweredZero",
                        argLength: 3,
index 94302be47472616e03eac1484b1006ca41d69509..6739c86ad2dacdfa69b6d962c73d2c60acf070fa 100644 (file)
@@ -999,12 +999,14 @@ const (
        OpARM64MOVHstore
        OpARM64MOVWstore
        OpARM64MOVDstore
+       OpARM64STP
        OpARM64FMOVSstore
        OpARM64FMOVDstore
        OpARM64MOVBstorezero
        OpARM64MOVHstorezero
        OpARM64MOVWstorezero
        OpARM64MOVDstorezero
+       OpARM64MOVQstorezero
        OpARM64MOVBreg
        OpARM64MOVBUreg
        OpARM64MOVHreg
@@ -12636,6 +12638,21 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:           "STP",
+               auxType:        auxSymOff,
+               argLen:         4,
+               faultOnNilArg0: true,
+               symEffect:      SymWrite,
+               asm:            arm64.ASTP,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+                               {2, 805044223},           // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+                               {0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+                       },
+               },
+       },
        {
                name:           "FMOVSstore",
                auxType:        auxSymOff,
@@ -12716,6 +12733,19 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:           "MOVQstorezero",
+               auxType:        auxSymOff,
+               argLen:         2,
+               faultOnNilArg0: true,
+               symEffect:      SymWrite,
+               asm:            arm64.ASTP,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+                       },
+               },
+       },
        {
                name:   "MOVBreg",
                argLen: 1,
@@ -13227,7 +13257,7 @@ var opcodeTable = [...]opInfo{
                faultOnNilArg0: true,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+                               {0, 65536}, // R16
                        },
                        clobbers: 536936448, // R16 R30
                },
index f87c5521bb3b9541100b77fc5d939efde4c711ff..4747468897b15a89919f99c1f5569e0ec821275c 100644 (file)
@@ -129,6 +129,8 @@ func rewriteValueARM64(v *Value) bool {
                return rewriteValueARM64_OpARM64MOVHstore_0(v)
        case OpARM64MOVHstorezero:
                return rewriteValueARM64_OpARM64MOVHstorezero_0(v)
+       case OpARM64MOVQstorezero:
+               return rewriteValueARM64_OpARM64MOVQstorezero_0(v)
        case OpARM64MOVWUload:
                return rewriteValueARM64_OpARM64MOVWUload_0(v)
        case OpARM64MOVWUreg:
@@ -173,6 +175,8 @@ func rewriteValueARM64(v *Value) bool {
                return rewriteValueARM64_OpARM64SRL_0(v)
        case OpARM64SRLconst:
                return rewriteValueARM64_OpARM64SRLconst_0(v)
+       case OpARM64STP:
+               return rewriteValueARM64_OpARM64STP_0(v)
        case OpARM64SUB:
                return rewriteValueARM64_OpARM64SUB_0(v)
        case OpARM64SUBconst:
@@ -704,7 +708,7 @@ func rewriteValueARM64(v *Value) bool {
        case OpXor8:
                return rewriteValueARM64_OpXor8_0(v)
        case OpZero:
-               return rewriteValueARM64_OpZero_0(v) || rewriteValueARM64_OpZero_10(v)
+               return rewriteValueARM64_OpZero_0(v) || rewriteValueARM64_OpZero_10(v) || rewriteValueARM64_OpZero_20(v)
        case OpZeroExt16to32:
                return rewriteValueARM64_OpZeroExt16to32_0(v)
        case OpZeroExt16to64:
@@ -4983,6 +4987,62 @@ func rewriteValueARM64_OpARM64MOVHstorezero_0(v *Value) bool {
        }
        return false
 }
+func rewriteValueARM64_OpARM64MOVQstorezero_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       config := b.Func.Config
+       _ = config
+       // match: (MOVQstorezero [off1] {sym} (ADDconst [off2] ptr) mem)
+       // cond: is32Bit(off1+off2)     && (ptr.Op != OpSB || !config.ctxt.Flag_shared)
+       // result: (MOVQstorezero [off1+off2] {sym} ptr mem)
+       for {
+               off1 := v.AuxInt
+               sym := v.Aux
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64ADDconst {
+                       break
+               }
+               off2 := v_0.AuxInt
+               ptr := v_0.Args[0]
+               mem := v.Args[1]
+               if !(is32Bit(off1+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_shared)) {
+                       break
+               }
+               v.reset(OpARM64MOVQstorezero)
+               v.AuxInt = off1 + off2
+               v.Aux = sym
+               v.AddArg(ptr)
+               v.AddArg(mem)
+               return true
+       }
+       // match: (MOVQstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+       // cond: canMergeSym(sym1,sym2) && is32Bit(off1+off2)   && (ptr.Op != OpSB || !config.ctxt.Flag_shared)
+       // result: (MOVQstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+       for {
+               off1 := v.AuxInt
+               sym1 := v.Aux
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64MOVDaddr {
+                       break
+               }
+               off2 := v_0.AuxInt
+               sym2 := v_0.Aux
+               ptr := v_0.Args[0]
+               mem := v.Args[1]
+               if !(canMergeSym(sym1, sym2) && is32Bit(off1+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_shared)) {
+                       break
+               }
+               v.reset(OpARM64MOVQstorezero)
+               v.AuxInt = off1 + off2
+               v.Aux = mergeSym(sym1, sym2)
+               v.AddArg(ptr)
+               v.AddArg(mem)
+               return true
+       }
+       return false
+}
 func rewriteValueARM64_OpARM64MOVWUload_0(v *Value) bool {
        b := v.Block
        _ = b
@@ -9174,6 +9234,100 @@ func rewriteValueARM64_OpARM64SRLconst_0(v *Value) bool {
        }
        return false
 }
+func rewriteValueARM64_OpARM64STP_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       config := b.Func.Config
+       _ = config
+       // match: (STP [off1] {sym} (ADDconst [off2] ptr) val1 val2 mem)
+       // cond: is32Bit(off1+off2)     && (ptr.Op != OpSB || !config.ctxt.Flag_shared)
+       // result: (STP [off1+off2] {sym} ptr val1 val2 mem)
+       for {
+               off1 := v.AuxInt
+               sym := v.Aux
+               _ = v.Args[3]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64ADDconst {
+                       break
+               }
+               off2 := v_0.AuxInt
+               ptr := v_0.Args[0]
+               val1 := v.Args[1]
+               val2 := v.Args[2]
+               mem := v.Args[3]
+               if !(is32Bit(off1+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_shared)) {
+                       break
+               }
+               v.reset(OpARM64STP)
+               v.AuxInt = off1 + off2
+               v.Aux = sym
+               v.AddArg(ptr)
+               v.AddArg(val1)
+               v.AddArg(val2)
+               v.AddArg(mem)
+               return true
+       }
+       // match: (STP [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val1 val2 mem)
+       // cond: canMergeSym(sym1,sym2) && is32Bit(off1+off2)   && (ptr.Op != OpSB || !config.ctxt.Flag_shared)
+       // result: (STP [off1+off2] {mergeSym(sym1,sym2)} ptr val1 val2 mem)
+       for {
+               off1 := v.AuxInt
+               sym1 := v.Aux
+               _ = v.Args[3]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64MOVDaddr {
+                       break
+               }
+               off2 := v_0.AuxInt
+               sym2 := v_0.Aux
+               ptr := v_0.Args[0]
+               val1 := v.Args[1]
+               val2 := v.Args[2]
+               mem := v.Args[3]
+               if !(canMergeSym(sym1, sym2) && is32Bit(off1+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_shared)) {
+                       break
+               }
+               v.reset(OpARM64STP)
+               v.AuxInt = off1 + off2
+               v.Aux = mergeSym(sym1, sym2)
+               v.AddArg(ptr)
+               v.AddArg(val1)
+               v.AddArg(val2)
+               v.AddArg(mem)
+               return true
+       }
+       // match: (STP [off] {sym} ptr (MOVDconst [0]) (MOVDconst [0]) mem)
+       // cond:
+       // result: (MOVQstorezero [off] {sym} ptr mem)
+       for {
+               off := v.AuxInt
+               sym := v.Aux
+               _ = v.Args[3]
+               ptr := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARM64MOVDconst {
+                       break
+               }
+               if v_1.AuxInt != 0 {
+                       break
+               }
+               v_2 := v.Args[2]
+               if v_2.Op != OpARM64MOVDconst {
+                       break
+               }
+               if v_2.AuxInt != 0 {
+                       break
+               }
+               mem := v.Args[3]
+               v.reset(OpARM64MOVQstorezero)
+               v.AuxInt = off
+               v.Aux = sym
+               v.AddArg(ptr)
+               v.AddArg(mem)
+               return true
+       }
+       return false
+}
 func rewriteValueARM64_OpARM64SUB_0(v *Value) bool {
        b := v.Block
        _ = b
@@ -16225,17 +16379,17 @@ func rewriteValueARM64_OpZero_0(v *Value) bool {
                v.AddArg(v1)
                return true
        }
-       // match: (Zero [12] ptr mem)
+       // match: (Zero [9] ptr mem)
        // cond:
-       // result: (MOVWstore [8] ptr (MOVDconst [0])           (MOVDstore ptr (MOVDconst [0]) mem))
+       // result: (MOVBstore [8] ptr (MOVDconst [0])           (MOVDstore ptr (MOVDconst [0]) mem))
        for {
-               if v.AuxInt != 12 {
+               if v.AuxInt != 9 {
                        break
                }
                _ = v.Args[1]
                ptr := v.Args[0]
                mem := v.Args[1]
-               v.reset(OpARM64MOVWstore)
+               v.reset(OpARM64MOVBstore)
                v.AuxInt = 8
                v.AddArg(ptr)
                v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
@@ -16255,21 +16409,19 @@ func rewriteValueARM64_OpZero_0(v *Value) bool {
 func rewriteValueARM64_OpZero_10(v *Value) bool {
        b := v.Block
        _ = b
-       config := b.Func.Config
-       _ = config
        typ := &b.Func.Config.Types
        _ = typ
-       // match: (Zero [16] ptr mem)
+       // match: (Zero [10] ptr mem)
        // cond:
-       // result: (MOVDstore [8] ptr (MOVDconst [0])           (MOVDstore ptr (MOVDconst [0]) mem))
+       // result: (MOVHstore [8] ptr (MOVDconst [0])           (MOVDstore ptr (MOVDconst [0]) mem))
        for {
-               if v.AuxInt != 16 {
+               if v.AuxInt != 10 {
                        break
                }
                _ = v.Args[1]
                ptr := v.Args[0]
                mem := v.Args[1]
-               v.reset(OpARM64MOVDstore)
+               v.reset(OpARM64MOVHstore)
                v.AuxInt = 8
                v.AddArg(ptr)
                v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
@@ -16284,23 +16436,80 @@ func rewriteValueARM64_OpZero_10(v *Value) bool {
                v.AddArg(v1)
                return true
        }
-       // match: (Zero [24] ptr mem)
+       // match: (Zero [11] ptr mem)
        // cond:
-       // result: (MOVDstore [16] ptr (MOVDconst [0])          (MOVDstore [8] ptr (MOVDconst [0])                      (MOVDstore ptr (MOVDconst [0]) mem)))
+       // result: (MOVBstore [10] ptr (MOVDconst [0])          (MOVHstore [8] ptr (MOVDconst [0])                      (MOVDstore ptr (MOVDconst [0]) mem)))
        for {
-               if v.AuxInt != 24 {
+               if v.AuxInt != 11 {
                        break
                }
                _ = v.Args[1]
                ptr := v.Args[0]
                mem := v.Args[1]
-               v.reset(OpARM64MOVDstore)
-               v.AuxInt = 16
+               v.reset(OpARM64MOVBstore)
+               v.AuxInt = 10
+               v.AddArg(ptr)
+               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v0.AuxInt = 0
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Pos, OpARM64MOVHstore, types.TypeMem)
+               v1.AuxInt = 8
+               v1.AddArg(ptr)
+               v2 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v2.AuxInt = 0
+               v1.AddArg(v2)
+               v3 := b.NewValue0(v.Pos, OpARM64MOVDstore, types.TypeMem)
+               v3.AddArg(ptr)
+               v4 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v4.AuxInt = 0
+               v3.AddArg(v4)
+               v3.AddArg(mem)
+               v1.AddArg(v3)
+               v.AddArg(v1)
+               return true
+       }
+       // match: (Zero [12] ptr mem)
+       // cond:
+       // result: (MOVWstore [8] ptr (MOVDconst [0])           (MOVDstore ptr (MOVDconst [0]) mem))
+       for {
+               if v.AuxInt != 12 {
+                       break
+               }
+               _ = v.Args[1]
+               ptr := v.Args[0]
+               mem := v.Args[1]
+               v.reset(OpARM64MOVWstore)
+               v.AuxInt = 8
                v.AddArg(ptr)
                v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
                v0.AuxInt = 0
                v.AddArg(v0)
                v1 := b.NewValue0(v.Pos, OpARM64MOVDstore, types.TypeMem)
+               v1.AddArg(ptr)
+               v2 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v2.AuxInt = 0
+               v1.AddArg(v2)
+               v1.AddArg(mem)
+               v.AddArg(v1)
+               return true
+       }
+       // match: (Zero [13] ptr mem)
+       // cond:
+       // result: (MOVBstore [12] ptr (MOVDconst [0])          (MOVWstore [8] ptr (MOVDconst [0])                      (MOVDstore ptr (MOVDconst [0]) mem)))
+       for {
+               if v.AuxInt != 13 {
+                       break
+               }
+               _ = v.Args[1]
+               ptr := v.Args[0]
+               mem := v.Args[1]
+               v.reset(OpARM64MOVBstore)
+               v.AuxInt = 12
+               v.AddArg(ptr)
+               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v0.AuxInt = 0
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Pos, OpARM64MOVWstore, types.TypeMem)
                v1.AuxInt = 8
                v1.AddArg(ptr)
                v2 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
@@ -16316,62 +16525,288 @@ func rewriteValueARM64_OpZero_10(v *Value) bool {
                v.AddArg(v1)
                return true
        }
+       // match: (Zero [14] ptr mem)
+       // cond:
+       // result: (MOVHstore [12] ptr (MOVDconst [0])          (MOVWstore [8] ptr (MOVDconst [0])                      (MOVDstore ptr (MOVDconst [0]) mem)))
+       for {
+               if v.AuxInt != 14 {
+                       break
+               }
+               _ = v.Args[1]
+               ptr := v.Args[0]
+               mem := v.Args[1]
+               v.reset(OpARM64MOVHstore)
+               v.AuxInt = 12
+               v.AddArg(ptr)
+               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v0.AuxInt = 0
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Pos, OpARM64MOVWstore, types.TypeMem)
+               v1.AuxInt = 8
+               v1.AddArg(ptr)
+               v2 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v2.AuxInt = 0
+               v1.AddArg(v2)
+               v3 := b.NewValue0(v.Pos, OpARM64MOVDstore, types.TypeMem)
+               v3.AddArg(ptr)
+               v4 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v4.AuxInt = 0
+               v3.AddArg(v4)
+               v3.AddArg(mem)
+               v1.AddArg(v3)
+               v.AddArg(v1)
+               return true
+       }
+       // match: (Zero [15] ptr mem)
+       // cond:
+       // result: (MOVBstore [14] ptr (MOVDconst [0])          (MOVHstore [12] ptr (MOVDconst [0])                     (MOVWstore [8] ptr (MOVDconst [0])                              (MOVDstore ptr (MOVDconst [0]) mem))))
+       for {
+               if v.AuxInt != 15 {
+                       break
+               }
+               _ = v.Args[1]
+               ptr := v.Args[0]
+               mem := v.Args[1]
+               v.reset(OpARM64MOVBstore)
+               v.AuxInt = 14
+               v.AddArg(ptr)
+               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v0.AuxInt = 0
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Pos, OpARM64MOVHstore, types.TypeMem)
+               v1.AuxInt = 12
+               v1.AddArg(ptr)
+               v2 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v2.AuxInt = 0
+               v1.AddArg(v2)
+               v3 := b.NewValue0(v.Pos, OpARM64MOVWstore, types.TypeMem)
+               v3.AuxInt = 8
+               v3.AddArg(ptr)
+               v4 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v4.AuxInt = 0
+               v3.AddArg(v4)
+               v5 := b.NewValue0(v.Pos, OpARM64MOVDstore, types.TypeMem)
+               v5.AddArg(ptr)
+               v6 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v6.AuxInt = 0
+               v5.AddArg(v6)
+               v5.AddArg(mem)
+               v3.AddArg(v5)
+               v1.AddArg(v3)
+               v.AddArg(v1)
+               return true
+       }
+       // match: (Zero [16] ptr mem)
+       // cond:
+       // result: (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)
+       for {
+               if v.AuxInt != 16 {
+                       break
+               }
+               _ = v.Args[1]
+               ptr := v.Args[0]
+               mem := v.Args[1]
+               v.reset(OpARM64STP)
+               v.AuxInt = 0
+               v.AddArg(ptr)
+               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v0.AuxInt = 0
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v1.AuxInt = 0
+               v.AddArg(v1)
+               v.AddArg(mem)
+               return true
+       }
+       // match: (Zero [32] ptr mem)
+       // cond:
+       // result: (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])                (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
+       for {
+               if v.AuxInt != 32 {
+                       break
+               }
+               _ = v.Args[1]
+               ptr := v.Args[0]
+               mem := v.Args[1]
+               v.reset(OpARM64STP)
+               v.AuxInt = 16
+               v.AddArg(ptr)
+               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v0.AuxInt = 0
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v1.AuxInt = 0
+               v.AddArg(v1)
+               v2 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
+               v2.AuxInt = 0
+               v2.AddArg(ptr)
+               v3 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v3.AuxInt = 0
+               v2.AddArg(v3)
+               v4 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v4.AuxInt = 0
+               v2.AddArg(v4)
+               v2.AddArg(mem)
+               v.AddArg(v2)
+               return true
+       }
+       // match: (Zero [48] ptr mem)
+       // cond:
+       // result: (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])                (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])                   (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
+       for {
+               if v.AuxInt != 48 {
+                       break
+               }
+               _ = v.Args[1]
+               ptr := v.Args[0]
+               mem := v.Args[1]
+               v.reset(OpARM64STP)
+               v.AuxInt = 32
+               v.AddArg(ptr)
+               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v0.AuxInt = 0
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v1.AuxInt = 0
+               v.AddArg(v1)
+               v2 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
+               v2.AuxInt = 16
+               v2.AddArg(ptr)
+               v3 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v3.AuxInt = 0
+               v2.AddArg(v3)
+               v4 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v4.AuxInt = 0
+               v2.AddArg(v4)
+               v5 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
+               v5.AuxInt = 0
+               v5.AddArg(ptr)
+               v6 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v6.AuxInt = 0
+               v5.AddArg(v6)
+               v7 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v7.AuxInt = 0
+               v5.AddArg(v7)
+               v5.AddArg(mem)
+               v2.AddArg(v5)
+               v.AddArg(v2)
+               return true
+       }
+       // match: (Zero [64] ptr mem)
+       // cond:
+       // result: (STP [48] ptr (MOVDconst [0]) (MOVDconst [0])                (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])                   (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])                           (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
+       for {
+               if v.AuxInt != 64 {
+                       break
+               }
+               _ = v.Args[1]
+               ptr := v.Args[0]
+               mem := v.Args[1]
+               v.reset(OpARM64STP)
+               v.AuxInt = 48
+               v.AddArg(ptr)
+               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v0.AuxInt = 0
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v1.AuxInt = 0
+               v.AddArg(v1)
+               v2 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
+               v2.AuxInt = 32
+               v2.AddArg(ptr)
+               v3 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v3.AuxInt = 0
+               v2.AddArg(v3)
+               v4 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v4.AuxInt = 0
+               v2.AddArg(v4)
+               v5 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
+               v5.AuxInt = 16
+               v5.AddArg(ptr)
+               v6 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v6.AuxInt = 0
+               v5.AddArg(v6)
+               v7 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v7.AuxInt = 0
+               v5.AddArg(v7)
+               v8 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
+               v8.AuxInt = 0
+               v8.AddArg(ptr)
+               v9 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v9.AuxInt = 0
+               v8.AddArg(v9)
+               v10 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+               v10.AuxInt = 0
+               v8.AddArg(v10)
+               v8.AddArg(mem)
+               v5.AddArg(v8)
+               v2.AddArg(v5)
+               v.AddArg(v2)
+               return true
+       }
+       return false
+}
+func rewriteValueARM64_OpZero_20(v *Value) bool {
+       b := v.Block
+       _ = b
+       config := b.Func.Config
+       _ = config
        // match: (Zero [s] ptr mem)
-       // cond: s%8 != 0 && s > 8
-       // result: (Zero [s%8]          (OffPtr <ptr.Type> ptr [s-s%8])                 (Zero [s-s%8] ptr mem))
+       // cond: s%16 != 0 && s > 16
+       // result: (Zero [s-s%16]               (OffPtr <ptr.Type> ptr [s%16])          (Zero [s%16] ptr mem))
        for {
                s := v.AuxInt
                _ = v.Args[1]
                ptr := v.Args[0]
                mem := v.Args[1]
-               if !(s%8 != 0 && s > 8) {
+               if !(s%16 != 0 && s > 16) {
                        break
                }
                v.reset(OpZero)
-               v.AuxInt = s % 8
+               v.AuxInt = s - s%16
                v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
-               v0.AuxInt = s - s%8
+               v0.AuxInt = s % 16
                v0.AddArg(ptr)
                v.AddArg(v0)
                v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
-               v1.AuxInt = s - s%8
+               v1.AuxInt = s % 16
                v1.AddArg(ptr)
                v1.AddArg(mem)
                v.AddArg(v1)
                return true
        }
        // match: (Zero [s] ptr mem)
-       // cond: s%8 == 0 && s > 24 && s <= 8*128       && !config.noDuffDevice
-       // result: (DUFFZERO [4 * (128 - int64(s/8))] ptr mem)
+       // cond: s%16 == 0 && s > 64 && s <= 16*64      && !config.noDuffDevice
+       // result: (DUFFZERO [4 * (64 - int64(s/16))] ptr mem)
        for {
                s := v.AuxInt
                _ = v.Args[1]
                ptr := v.Args[0]
                mem := v.Args[1]
-               if !(s%8 == 0 && s > 24 && s <= 8*128 && !config.noDuffDevice) {
+               if !(s%16 == 0 && s > 64 && s <= 16*64 && !config.noDuffDevice) {
                        break
                }
                v.reset(OpARM64DUFFZERO)
-               v.AuxInt = 4 * (128 - int64(s/8))
+               v.AuxInt = 4 * (64 - int64(s/16))
                v.AddArg(ptr)
                v.AddArg(mem)
                return true
        }
        // match: (Zero [s] ptr mem)
-       // cond: s%8 == 0 && (s > 8*128 || config.noDuffDevice)
-       // result: (LoweredZero                 ptr             (ADDconst <ptr.Type> [s-8] ptr)                 mem)
+       // cond: s%16 == 0 && (s > 16*64 || config.noDuffDevice)
+       // result: (LoweredZero                 ptr             (ADDconst <ptr.Type> [s-16] ptr)                mem)
        for {
                s := v.AuxInt
                _ = v.Args[1]
                ptr := v.Args[0]
                mem := v.Args[1]
-               if !(s%8 == 0 && (s > 8*128 || config.noDuffDevice)) {
+               if !(s%16 == 0 && (s > 16*64 || config.noDuffDevice)) {
                        break
                }
                v.reset(OpARM64LoweredZero)
                v.AddArg(ptr)
                v0 := b.NewValue0(v.Pos, OpARM64ADDconst, ptr.Type)
-               v0.AuxInt = s - 8
+               v0.AuxInt = s - 16
                v0.AddArg(ptr)
                v.AddArg(v0)
                v.AddArg(mem)
index 3a3fed5cf5e14e1437620d7b7d595e6906c96963..9f225b6f5de709ea7289dd3befd5bb3614cd29e1 100644 (file)
@@ -291,8 +291,10 @@ const (
 
        C_NPAUTO     // -512 <= x < 0, 0 mod 8
        C_NSAUTO     // -256 <= x < 0
+       C_PSAUTO_8   // 0 to 255, 0 mod 8
        C_PSAUTO     // 0 to 255
-       C_PPAUTO     // 0 to 504, 0 mod 8
+       C_PPAUTO_8   // 0 to 504, 0 mod 8
+       C_PPAUTO     // 0 to 504
        C_UAUTO4K_8  // 0 to 4095, 0 mod 8
        C_UAUTO4K_4  // 0 to 4095, 0 mod 4
        C_UAUTO4K_2  // 0 to 4095, 0 mod 2
@@ -315,7 +317,9 @@ const (
        C_ZOREG  // 0(R)
        C_NPOREG // must mirror NPAUTO, etc
        C_NSOREG
+       C_PSOREG_8
        C_PSOREG
+       C_PPOREG_8
        C_PPOREG
        C_UOREG4K_8
        C_UOREG4K_4
index 24911f657d56e8bfc4bf6e169b0f6484da39bb59..6ad9d58132d9a533409973c5671865a0c494dbbd 100644 (file)
@@ -35,7 +35,9 @@ var cnames7 = []string{
        "LBRA",
        "NPAUTO",
        "NSAUTO",
+       "PSAUTO_8",
        "PSAUTO",
+       "PPAUTO_8",
        "PPAUTO",
        "UAUTO4K_8",
        "UAUTO4K_4",
@@ -57,7 +59,9 @@ var cnames7 = []string{
        "ZOREG",
        "NPOREG",
        "NSOREG",
+       "PSOREG_8",
        "PSOREG",
+       "PPOREG_8",
        "PPOREG",
        "UOREG4K_8",
        "UOREG4K_4",
index 8b976cae31458c32967ef501e7509ae204d2d436..fd6fcb77ea8028673a07a7633fce87f118d61452 100644 (file)
@@ -427,12 +427,57 @@ var optab = []Optab{
        {AFMOVS, C_FREG, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPRE},
        {AFMOVD, C_FREG, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPRE},
 
-       /* pre/post-indexed load/store register pair
-          (unscaled, signed 10-bit quad-aligned offset) */
-       {ALDP, C_LOREG, C_NONE, C_PAIR, 66, 4, 0, 0, C_XPRE},
-       {ALDP, C_LOREG, C_NONE, C_PAIR, 66, 4, 0, 0, C_XPOST},
-       {ASTP, C_PAIR, C_NONE, C_LOREG, 67, 4, 0, 0, C_XPRE},
-       {ASTP, C_PAIR, C_NONE, C_LOREG, 67, 4, 0, 0, C_XPOST},
+       /* pre/post-indexed/signed-offset load/store register pair
+          (unscaled, signed 10-bit quad-aligned and long offset) */
+       {ALDP, C_NPAUTO, C_NONE, C_PAIR, 66, 4, REGSP, 0, 0},
+       {ALDP, C_NPAUTO, C_NONE, C_PAIR, 66, 4, REGSP, 0, C_XPRE},
+       {ALDP, C_NPAUTO, C_NONE, C_PAIR, 66, 4, REGSP, 0, C_XPOST},
+       {ALDP, C_PPAUTO_8, C_NONE, C_PAIR, 66, 4, REGSP, 0, 0},
+       {ALDP, C_PPAUTO_8, C_NONE, C_PAIR, 66, 4, REGSP, 0, C_XPRE},
+       {ALDP, C_PPAUTO_8, C_NONE, C_PAIR, 66, 4, REGSP, 0, C_XPOST},
+       {ALDP, C_UAUTO4K, C_NONE, C_PAIR, 74, 8, REGSP, 0, 0},
+       {ALDP, C_UAUTO4K, C_NONE, C_PAIR, 74, 8, REGSP, 0, C_XPRE},
+       {ALDP, C_UAUTO4K, C_NONE, C_PAIR, 74, 8, REGSP, 0, C_XPOST},
+       {ALDP, C_LAUTO, C_NONE, C_PAIR, 75, 12, REGSP, LFROM, 0},
+       {ALDP, C_LAUTO, C_NONE, C_PAIR, 75, 12, REGSP, LFROM, C_XPRE},
+       {ALDP, C_LAUTO, C_NONE, C_PAIR, 75, 12, REGSP, LFROM, C_XPOST},
+       {ALDP, C_NPOREG, C_NONE, C_PAIR, 66, 4, 0, 0, 0},
+       {ALDP, C_NPOREG, C_NONE, C_PAIR, 66, 4, 0, 0, C_XPRE},
+       {ALDP, C_NPOREG, C_NONE, C_PAIR, 66, 4, 0, 0, C_XPOST},
+       {ALDP, C_PPOREG_8, C_NONE, C_PAIR, 66, 4, 0, 0, 0},
+       {ALDP, C_PPOREG_8, C_NONE, C_PAIR, 66, 4, 0, 0, C_XPRE},
+       {ALDP, C_PPOREG_8, C_NONE, C_PAIR, 66, 4, 0, 0, C_XPOST},
+       {ALDP, C_UOREG4K, C_NONE, C_PAIR, 74, 8, 0, 0, 0},
+       {ALDP, C_UOREG4K, C_NONE, C_PAIR, 74, 8, 0, 0, C_XPRE},
+       {ALDP, C_UOREG4K, C_NONE, C_PAIR, 74, 8, 0, 0, C_XPOST},
+       {ALDP, C_LOREG, C_NONE, C_PAIR, 75, 12, 0, LFROM, 0},
+       {ALDP, C_LOREG, C_NONE, C_PAIR, 75, 12, 0, LFROM, C_XPRE},
+       {ALDP, C_LOREG, C_NONE, C_PAIR, 75, 12, 0, LFROM, C_XPOST},
+
+       {ASTP, C_PAIR, C_NONE, C_NPAUTO, 67, 4, REGSP, 0, 0},
+       {ASTP, C_PAIR, C_NONE, C_NPAUTO, 67, 4, REGSP, 0, C_XPRE},
+       {ASTP, C_PAIR, C_NONE, C_NPAUTO, 67, 4, REGSP, 0, C_XPOST},
+       {ASTP, C_PAIR, C_NONE, C_PPAUTO_8, 67, 4, REGSP, 0, 0},
+       {ASTP, C_PAIR, C_NONE, C_PPAUTO_8, 67, 4, REGSP, 0, C_XPRE},
+       {ASTP, C_PAIR, C_NONE, C_PPAUTO_8, 67, 4, REGSP, 0, C_XPOST},
+       {ASTP, C_PAIR, C_NONE, C_UAUTO4K, 76, 8, REGSP, 0, 0},
+       {ASTP, C_PAIR, C_NONE, C_UAUTO4K, 76, 8, REGSP, 0, C_XPRE},
+       {ASTP, C_PAIR, C_NONE, C_UAUTO4K, 76, 8, REGSP, 0, C_XPOST},
+       {ASTP, C_PAIR, C_NONE, C_LAUTO, 77, 12, REGSP, LTO, 0},
+       {ASTP, C_PAIR, C_NONE, C_LAUTO, 77, 12, REGSP, LTO, C_XPRE},
+       {ASTP, C_PAIR, C_NONE, C_LAUTO, 77, 12, REGSP, LTO, C_XPOST},
+       {ASTP, C_PAIR, C_NONE, C_NPOREG, 67, 4, 0, 0, 0},
+       {ASTP, C_PAIR, C_NONE, C_NPOREG, 67, 4, 0, 0, C_XPRE},
+       {ASTP, C_PAIR, C_NONE, C_NPOREG, 67, 4, 0, 0, C_XPOST},
+       {ASTP, C_PAIR, C_NONE, C_PPOREG_8, 67, 4, 0, 0, 0},
+       {ASTP, C_PAIR, C_NONE, C_PPOREG_8, 67, 4, 0, 0, C_XPRE},
+       {ASTP, C_PAIR, C_NONE, C_PPOREG_8, 67, 4, 0, 0, C_XPOST},
+       {ASTP, C_PAIR, C_NONE, C_UOREG4K, 76, 8, 0, 0, 0},
+       {ASTP, C_PAIR, C_NONE, C_UOREG4K, 76, 8, 0, 0, C_XPRE},
+       {ASTP, C_PAIR, C_NONE, C_UOREG4K, 76, 8, 0, 0, C_XPOST},
+       {ASTP, C_PAIR, C_NONE, C_LOREG, 77, 12, 0, LTO, 0},
+       {ASTP, C_PAIR, C_NONE, C_LOREG, 77, 12, 0, LTO, C_XPRE},
+       {ASTP, C_PAIR, C_NONE, C_LOREG, 77, 12, 0, LTO, C_XPOST},
 
        /* special */
        {AMOVD, C_SPR, C_NONE, C_REG, 35, 4, 0, 0, 0},
@@ -761,7 +806,9 @@ func (c *ctxt7) addpool(p *obj.Prog, a *obj.Addr) {
                fallthrough
 
        case C_PSAUTO,
+               C_PSAUTO_8,
                C_PPAUTO,
+               C_PPAUTO_8,
                C_UAUTO4K_8,
                C_UAUTO4K_4,
                C_UAUTO4K_2,
@@ -776,7 +823,9 @@ func (c *ctxt7) addpool(p *obj.Prog, a *obj.Addr) {
                C_NPAUTO,
                C_LAUTO,
                C_PPOREG,
+               C_PPOREG_8,
                C_PSOREG,
+               C_PSOREG_8,
                C_UOREG4K_8,
                C_UOREG4K_4,
                C_UOREG4K_2,
@@ -997,9 +1046,15 @@ func autoclass(l int64) int {
        }
 
        if l <= 255 {
+               if (l & 7) == 0 {
+                       return C_PSAUTO_8
+               }
                return C_PSAUTO
        }
-       if l <= 504 && (l&7) == 0 {
+       if l <= 504 {
+               if (l & 7) == 0 {
+                       return C_PPAUTO_8
+               }
                return C_PPAUTO
        }
        if l <= 4095 {
@@ -1396,32 +1451,42 @@ func cmp(a int, b int) bool {
                        return true
                }
 
+       case C_PSAUTO:
+               if b == C_PSAUTO_8 {
+                       return true
+               }
+
        case C_PPAUTO:
-               if b == C_PSAUTO {
+               if b == C_PSAUTO || b == C_PSAUTO_8 {
+                       return true
+               }
+
+       case C_PPAUTO_8:
+               if b == C_PSAUTO_8 {
                        return true
                }
 
        case C_UAUTO4K:
                switch b {
-               case C_PSAUTO, C_PPAUTO, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8:
+               case C_PSAUTO, C_PSAUTO_8, C_PPAUTO, C_PPAUTO_8, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8:
                        return true
                }
 
        case C_UAUTO8K:
                switch b {
-               case C_PSAUTO, C_PPAUTO, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO8K_4, C_UAUTO8K_8:
+               case C_PSAUTO, C_PSAUTO_8, C_PPAUTO, C_PPAUTO_8, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO8K_4, C_UAUTO8K_8:
                        return true
                }
 
        case C_UAUTO16K:
                switch b {
-               case C_PSAUTO, C_PPAUTO, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO8K_4, C_UAUTO8K_8, C_UAUTO16K_8:
+               case C_PSAUTO, C_PSAUTO_8, C_PPAUTO, C_PPAUTO_8, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO8K_4, C_UAUTO8K_8, C_UAUTO16K_8:
                        return true
                }
 
        case C_UAUTO32K:
                switch b {
-               case C_PSAUTO, C_PPAUTO, C_UAUTO4K_8, C_UAUTO8K_8, C_UAUTO16K_8:
+               case C_PSAUTO, C_PSAUTO_8, C_PPAUTO, C_PPAUTO_8, C_UAUTO4K_8, C_UAUTO8K_8, C_UAUTO16K_8:
                        return true
                }
 
@@ -1430,7 +1495,7 @@ func cmp(a int, b int) bool {
 
        case C_LAUTO:
                switch b {
-               case C_PSAUTO, C_PPAUTO,
+               case C_PSAUTO, C_PSAUTO_8, C_PPAUTO, C_PPAUTO_8,
                        C_UAUTO4K, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8,
                        C_UAUTO8K, C_UAUTO8K_4, C_UAUTO8K_8,
                        C_UAUTO16K, C_UAUTO16K_8,
@@ -1440,36 +1505,42 @@ func cmp(a int, b int) bool {
                return cmp(C_NPAUTO, b)
 
        case C_PSOREG:
-               if b == C_ZOREG {
+               if b == C_ZOREG || b == C_PSOREG_8 {
                        return true
                }
 
        case C_PPOREG:
-               if b == C_ZOREG || b == C_PSOREG {
+               switch b {
+               case C_ZOREG, C_PSOREG, C_PSOREG_8, C_PPOREG_8:
+                       return true
+               }
+
+       case C_PPOREG_8:
+               if b == C_ZOREG || b == C_PSOREG_8 {
                        return true
                }
 
        case C_UOREG4K:
                switch b {
-               case C_ZOREG, C_PSOREG, C_PPOREG, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8:
+               case C_ZOREG, C_PSOREG_8, C_PSOREG, C_PPOREG_8, C_PPOREG, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8:
                        return true
                }
 
        case C_UOREG8K:
                switch b {
-               case C_ZOREG, C_PSOREG, C_PPOREG, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8, C_UOREG8K_4, C_UOREG8K_8:
+               case C_ZOREG, C_PSOREG_8, C_PSOREG, C_PPOREG_8, C_PPOREG, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8, C_UOREG8K_4, C_UOREG8K_8:
                        return true
                }
 
        case C_UOREG16K:
                switch b {
-               case C_ZOREG, C_PSOREG, C_PPOREG, C_UOREG4K_4, C_UOREG4K_8, C_UOREG8K_4, C_UOREG8K_8, C_UOREG16K_8:
+               case C_ZOREG, C_PSOREG_8, C_PSOREG, C_PPOREG_8, C_PPOREG, C_UOREG4K_4, C_UOREG4K_8, C_UOREG8K_4, C_UOREG8K_8, C_UOREG16K_8:
                        return true
                }
 
        case C_UOREG32K:
                switch b {
-               case C_ZOREG, C_PSOREG, C_PPOREG, C_UOREG4K_8, C_UOREG8K_8, C_UOREG16K_8:
+               case C_ZOREG, C_PSOREG_8, C_PSOREG, C_PPOREG_8, C_PPOREG, C_UOREG4K_8, C_UOREG8K_8, C_UOREG16K_8:
                        return true
                }
 
@@ -1478,7 +1549,7 @@ func cmp(a int, b int) bool {
 
        case C_LOREG:
                switch b {
-               case C_ZOREG, C_PSOREG, C_PPOREG,
+               case C_ZOREG, C_PSOREG_8, C_PSOREG, C_PPOREG_8, C_PPOREG,
                        C_UOREG4K, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8,
                        C_UOREG8K, C_UOREG8K_4, C_UOREG8K_8,
                        C_UOREG16K, C_UOREG16K_8,
@@ -2605,7 +2676,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
                        c.ctxt.Diag("illegal bit position\n%v", p)
                }
                if ((d >> uint(s*16)) >> 16) != 0 {
-                       c.ctxt.Diag("requires uimm16\n%v",p)
+                       c.ctxt.Diag("requires uimm16\n%v", p)
                }
                rt := int(p.To.Reg)
 
@@ -2998,31 +3069,50 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
                o3 = c.olsr12u(p, int32(c.opldr12(p, p.As)), 0, REGTMP, int(p.To.Reg))
 
        case 66: /* ldp O(R)!, (r1, r2); ldp (R)O!, (r1, r2) */
-               v := int32(p.From.Offset)
+               v := int32(c.regoff(&p.From))
+               r := int(p.From.Reg)
+               if r == obj.REG_NONE {
+                       r = int(o.param)
+               }
+               if r == obj.REG_NONE {
+                       c.ctxt.Diag("invalid ldp source: %v\n", p)
+               }
 
-               if v < -512 || v > 504 {
-                       c.ctxt.Diag("offset out of range\n%v", p)
+               if v < -512 || v > 504 || v%8 != 0 {
+                       c.ctxt.Diag("invalid offset %v\n", p)
                }
                if o.scond == C_XPOST {
                        o1 |= 1 << 23
-               } else {
+               } else if o.scond == C_XPRE {
                        o1 |= 3 << 23
+               } else {
+                       o1 |= 2 << 23
                }
                o1 |= 1 << 22
-               o1 |= uint32(int64(2<<30|5<<27|((uint32(v)/8)&0x7f)<<15) | p.To.Offset<<10 | int64(uint32(p.From.Reg&31)<<5) | int64(p.To.Reg&31))
+               o1 |= uint32(int64(2<<30|5<<27|((uint32(v)/8)&0x7f)<<15) | (p.To.Offset&31)<<10 | int64(uint32(r&31)<<5) | int64(p.To.Reg&31))
 
        case 67: /* stp (r1, r2), O(R)!; stp (r1, r2), (R)O! */
-               v := int32(p.To.Offset)
+               r := int(p.To.Reg)
+               if r == obj.REG_NONE {
+                       r = int(o.param)
+               }
+               if r == obj.REG_NONE {
+                       c.ctxt.Diag("invalid stp destination: %v\n", p)
+               }
 
-               if v < -512 || v > 504 {
-                       c.ctxt.Diag("offset out of range\n%v", p)
+               v := int32(c.regoff(&p.To))
+               if v < -512 || v > 504 || v%8 != 0 {
+                       c.ctxt.Diag("invalid offset %v\n", p)
                }
+
                if o.scond == C_XPOST {
                        o1 |= 1 << 23
-               } else {
+               } else if o.scond == C_XPRE {
                        o1 |= 3 << 23
+               } else {
+                       o1 |= 2 << 23
                }
-               o1 |= uint32(int64(2<<30|5<<27|((uint32(v)/8)&0x7f)<<15) | p.From.Offset<<10 | int64(uint32(p.To.Reg&31)<<5) | int64(p.From.Reg&31))
+               o1 |= uint32(int64(2<<30|5<<27|((uint32(v)/8)&0x7f)<<15) | (p.From.Offset&31)<<10 | int64(uint32(r&31)<<5) | int64(p.From.Reg&31))
 
        case 68: /* movT $vconaddr(SB), reg -> adrp + add + reloc */
                if p.As == AMOVW {
@@ -3072,6 +3162,114 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
                rel.Add = 0
                rel.Type = objabi.R_ARM64_GOTPCREL
 
+       case 74:
+               //      add $O, R, Rtmp
+               //      ldp (Rtmp), (R1, R2)
+               r := int(p.From.Reg)
+               if r == obj.REG_NONE {
+                       r = int(o.param)
+               }
+               if r == obj.REG_NONE {
+                       c.ctxt.Diag("invalid ldp source: %v\n", p)
+               }
+
+               v := int32(c.regoff(&p.From))
+               if v < 0 || v > 4095 {
+                       c.ctxt.Diag("offset out of range%v\n", p)
+               }
+
+               if o.scond == C_XPOST {
+                       o2 |= 1 << 23
+               } else if o.scond == C_XPRE {
+                       o2 |= 3 << 23
+               } else {
+                       o2 |= 2 << 23
+               }
+
+               o1 = c.oaddi(p, int32(c.opirr(p, AADD)), v, r, REGTMP)
+               o2 |= 1 << 22
+               o2 |= uint32(int64(2<<30|5<<27) | (p.To.Offset&31)<<10 | int64(uint32(REGTMP&31)<<5) | int64(p.To.Reg&31))
+
+       case 75:
+               //      mov $L, Rtmp (from constant pool)
+               //      add Rtmp, R, Rtmp
+               //      ldp (Rtmp), (R1, R2)
+               r := int(p.From.Reg)
+               if r == obj.REG_NONE {
+                       r = int(o.param)
+               }
+               if r == obj.REG_NONE {
+                       c.ctxt.Diag("invalid ldp source: %v\n", p)
+               }
+
+               if o.scond == C_XPOST {
+                       o3 |= 1 << 23
+               } else if o.scond == C_XPRE {
+                       o3 |= 3 << 23
+               } else {
+                       o3 |= 2 << 23
+               }
+
+               o1 = c.omovlit(AMOVD, p, &p.From, REGTMP)
+               o2 = c.opxrrr(p, AADD)
+               o2 |= (REGTMP & 31) << 16
+               o2 |= uint32(r&31) << 5
+               o2 |= uint32(REGTMP & 31)
+               o3 |= 1 << 22
+               o3 |= uint32(int64(2<<30|5<<27) | (p.To.Offset&31)<<10 | int64(uint32(REGTMP&31)<<5) | int64(p.To.Reg&31))
+
+       case 76:
+               //      add $O, R, Rtmp
+               //      stp (R1, R2), (Rtmp)
+               r := int(p.To.Reg)
+               if r == obj.REG_NONE {
+                       r = int(o.param)
+               }
+               if r == obj.REG_NONE {
+                       c.ctxt.Diag("invalid stp destination: %v\n", p)
+               }
+
+               v := int32(c.regoff(&p.To))
+               if v < 0 || v > 4095 {
+                       c.ctxt.Diag("offset out of range%v\n", p)
+               }
+               if o.scond == C_XPOST {
+                       o2 |= 1 << 23
+               } else if o.scond == C_XPRE {
+                       o2 |= 3 << 23
+               } else {
+                       o2 |= 2 << 23
+               }
+
+               o1 = c.oaddi(p, int32(c.opirr(p, AADD)), v, r, REGTMP)
+               o2 |= uint32(int64(2<<30|5<<27) | (p.From.Offset&31)<<10 | int64(uint32(REGTMP&31)<<5) | int64(p.From.Reg&31))
+
+       case 77:
+               //      mov $L, Rtmp (from constant pool)
+               //      add Rtmp, R, Rtmp
+               //      stp (R1, R2), (Rtmp)
+               r := int(p.To.Reg)
+               if r == obj.REG_NONE {
+                       r = int(o.param)
+               }
+               if r == obj.REG_NONE {
+                       c.ctxt.Diag("invalid stp destination: %v\n", p)
+               }
+
+               if o.scond == C_XPOST {
+                       o3 |= 1 << 23
+               } else if o.scond == C_XPRE {
+                       o3 |= 3 << 23
+               } else {
+                       o3 |= 2 << 23
+               }
+               o1 = c.omovlit(AMOVD, p, &p.To, REGTMP)
+               o2 = c.opxrrr(p, AADD)
+               o2 |= REGTMP & 31 << 16
+               o2 |= uint32(r&31) << 5
+               o2 |= uint32(REGTMP & 31)
+               o3 |= uint32(int64(2<<30|5<<27) | (p.From.Offset&31)<<10 | int64(uint32(REGTMP&31)<<5) | int64(p.From.Reg&31))
+
        // This is supposed to be something that stops execution.
        // It's not supposed to be reached, ever, but if it is, we'd
        // like to be able to tell how we got there. Assemble as
index 60a0e26cd3881bf682d1a3cd50f27a44d36e59ab..21619ff910e1990dfff6344c180c8346d2670c75 100644 (file)
 #include "textflag.h"
 
 TEXT runtime·duffzero(SB), NOSPLIT, $-8-0
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
-       MOVD.W  ZR, 8(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP.P   (ZR, ZR), 16(R16)
+       STP     (ZR, ZR), (R16)
        RET
 
 TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
index 08dcf50859e64ea4cb9e48b803c53e4ac4b594dc..fb7cbc28fd6b1d2313dcb3e8792969d00b894987 100644 (file)
@@ -151,12 +151,13 @@ func copyARM(w io.Writer) {
 
 func zeroARM64(w io.Writer) {
        // ZR: always zero
-       // R16 (aka REGRT1): ptr to memory to be zeroed - 8
+       // R16 (aka REGRT1): ptr to memory to be zeroed
        // On return, R16 points to the last zeroed dword.
        fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
-       for i := 0; i < 128; i++ {
-               fmt.Fprintln(w, "\tMOVD.W\tZR, 8(R16)")
+       for i := 0; i < 63; i++ {
+               fmt.Fprintln(w, "\tSTP.P\t(ZR, ZR), 16(R16)")
        }
+       fmt.Fprintln(w, "\tSTP\t(ZR, ZR), (R16)")
        fmt.Fprintln(w, "\tRET")
 }