From: Joel Sing
Date: Wed, 3 May 2023 11:56:18 +0000 (+1000)
Subject: cmd/internal/obj/arm64: avoid unnecessary literal pool usage for moves
X-Git-Tag: go1.22rc1~1488
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=208fc13245add90c33cf48571e1419caf80a637c;p=gostls13.git

cmd/internal/obj/arm64: avoid unnecessary literal pool usage for moves

In a number of load and store cases, the use of the literal pool can be
entirely avoided by simply adding or subtracting the offset from the
register. This uses the same number of instructions, while avoiding both
a load from memory and the need for the value to be in the literal pool.
Overall this reduces the size of binaries slightly and should have lower
overhead.

Updates #59615

Change-Id: I9cb6a403dc71e34a46af913f5db87dbf52f8688c
Reviewed-on: https://go-review.googlesource.com/c/go/+/512539
Reviewed-by: David Chase
TryBot-Result: Gopher Robot
Reviewed-by: Cherry Mui
Run-TryBot: Joel Sing
---

diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s
index 54f4de76d8..1aa74caa26 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64.s
@@ -557,7 +557,38 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
 	FMOVQ	65520(R10), F10 // 4afdff3d
 	FMOVQ	64(RSP), F11    // eb13c03d
 
-// large aligned offset, use two instructions(add+ldr/store).
+// medium offsets that either fit a single instruction or can use add+ldr/str
+	MOVD	-4095(R17), R3  // 3bfe3fd1630340f9
+	MOVD	-391(R17), R3   // 3b1e06d1630340f9
+	MOVD	-257(R17), R3   // 3b0604d1630340f9
+	MOVD	-256(R17), R3   // 230250f8
+	MOVD	255(R17), R3    // 23f24ff8
+	MOVD	256(R17), R3    // 238240f9
+	MOVD	257(R17), R3    // 3b060491630340f9
+	MOVD	391(R17), R3    // 3b1e0691630340f9
+	MOVD	4095(R17), R3   // 3bfe3f91630340f9
+
+	MOVD	R0, -4095(R17)  // 3bfe3fd1600300f9
+	MOVD	R0, -391(R17)   // 3b1e06d1600300f9
+	MOVD	R0, -257(R17)   // 3b0604d1600300f9
+	MOVD	R0, -256(R17)   // 200210f8
+	MOVD	R0, 255(R17)    // 20f20ff8
+	MOVD	R0, 256(R17)    // 208200f9
+	MOVD	R0, 257(R17)    // 3b060491600300f9
+	MOVD	R0, 391(R17)    // 3b1e0691600300f9
+	MOVD	R0, 4095(R17)   // 3bfe3f91600300f9
+	MOVD	R0, 4096(R17)   // 200208f9
+	MOVD	R3, -4095(R17)  // 3bfe3fd1630300f9
+	MOVD	R3, -391(R17)   // 3b1e06d1630300f9
+	MOVD	R3, -257(R17)   // 3b0604d1630300f9
+	MOVD	R3, -256(R17)   // 230210f8
+	MOVD	R3, 255(R17)    // 23f20ff8
+	MOVD	R3, 256(R17)    // 238200f9
+	MOVD	R3, 257(R17)    // 3b060491630300f9
+	MOVD	R3, 391(R17)    // 3b1e0691630300f9
+	MOVD	R3, 4095(R17)   // 3bfe3f91630300f9
+
+// large aligned offset, use two instructions(add+ldr/str).
 	MOVB	R1, 0x1001(R2)   // MOVB R1, 4097(R2)     // 5b04409161070039
 	MOVB	R1, 0xffffff(R2) // MOVB R1, 16777215(R2) // 5bfc7f9161ff3f39
 	MOVH	R1, 0x2002(R2)   // MOVH R1, 8194(R2)     // 5b08409161070079
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index 72e6aadce4..05cf62773e 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -1959,6 +1959,10 @@ func (c *ctxt7) loadStoreClass(p *obj.Prog, lsc int, v int64) int {
 	}
 
 	needsPool := true
+	if v >= -4095 && v <= 4095 {
+		needsPool = false
+	}
+
 	switch p.As {
 	case AMOVB, AMOVBU:
 		if cmp(C_UAUTO4K, lsc) || cmp(C_UOREG4K, lsc) {
@@ -4015,10 +4019,13 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 |= uint32(p.From.Reg&31)<<5 | uint32(p.To.Reg&31)
 
 	case 30: /* movT R,L(R) -> strT */
-		// if offset L can be split into hi+lo, and both fit into instructions, do
+		// If offset L fits in a 12 bit unsigned immediate:
+		//	add $L, R, Rtmp or sub $L, R, Rtmp
+		//	str R, (Rtmp)
+		// Otherwise, if offset L can be split into hi+lo, and both fit into instructions:
 		//	add $hi, R, Rtmp
 		//	str R, lo(Rtmp)
-		// otherwise, use constant pool
+		// Otherwise, use constant pool:
 		//	mov $L, Rtmp (from constant pool)
 		//	str R, (R+Rtmp)
 		s := movesize(o.as)
@@ -4032,6 +4039,20 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		}
 
 		v := c.regoff(&p.To)
+		if v >= -256 && v <= 256 {
+			c.ctxt.Diag("%v: bad type for offset %d (should be 9 bit signed immediate store)", p, v)
+		}
+		if v >= 0 && v <= 4095 && v&((1<<int32(s))-1) == 0 {
+			c.ctxt.Diag("%v: bad type for offset %d (should be 12 bit unsigned immediate store)", p, v)
+		}
+
+		// Handle smaller unaligned and negative offsets via addition or subtraction.
+		if v >= -4095 && v <= 4095 {
+			o1 = c.oaddi12(p, v, REGTMP, int16(r))
+			o2 = c.olsr12u(p, c.opstr(p, p.As), 0, REGTMP, p.From.Reg)
+			break
+		}
+
 		hi, lo, err := splitImm24uScaled(v, s)
 		if err != nil {
 			goto storeusepool
@@ -4054,10 +4075,13 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o2 = c.olsxrr(p, int32(c.opstrr(p, p.As, false)), int(p.From.Reg), int(r), REGTMP)
 
 	case 31: /* movT L(R), R -> ldrT */
-		// if offset L can be split into hi+lo, and both fit into instructions, do
+		// If offset L fits in a 12 bit unsigned immediate:
+		//	add $L, R, Rtmp or sub $L, R, Rtmp
+		//	ldr (Rtmp), R
+		// Otherwise, if offset L can be split into hi+lo, and both fit into instructions:
 		//	add $hi, R, Rtmp
 		//	ldr lo(Rtmp), R
-		// otherwise, use constant pool
+		// Otherwise, use constant pool:
 		//	mov $L, Rtmp (from constant pool)
 		//	ldr (R+Rtmp), R
 		s := movesize(o.as)
@@ -4071,6 +4095,20 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		}
 
 		v := c.regoff(&p.From)
+		if v >= -256 && v <= 256 {
+			c.ctxt.Diag("%v: bad type for offset %d (should be 9 bit signed immediate load)", p, v)
+		}
+		if v >= 0 && v <= 4095 && v&((1<<int32(s))-1) == 0 {
+			c.ctxt.Diag("%v: bad type for offset %d (should be 12 bit unsigned immediate load)", p, v)
+		}
+
+		// Handle smaller unaligned and negative offsets via addition or subtraction.
+		if v >= -4095 && v <= 4095 {
+			o1 = c.oaddi12(p, v, REGTMP, int16(r))
+			o2 = c.olsr12u(p, c.opldr(p, p.As), 0, REGTMP, p.To.Reg)
+			break
+		}
+
 		hi, lo, err := splitImm24uScaled(v, s)
 		if err != nil {
 			goto loadusepool
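
To illustrate the new code path (a sketch decoded from the test encodings
above; REGTMP is R27 on arm64): an offset such as 257 fits neither a 9 bit
signed nor a scaled 12 bit unsigned load/store immediate, so it previously
went through the literal pool, per the old comment in case 31:

	MOVD	$257, R27	// mov $L, Rtmp (value loaded from the literal pool)
	MOVD	(R17)(R27), R3	// ldr (R+Rtmp), R

With this change, the same MOVD 257(R17), R3 assembles to an add of the
offset followed by a plain register-indirect load, avoiding both the pool
entry and the extra memory access:

	ADD	$257, R17, R27	// 3b060491: add x27, x17, #257
	MOVD	(R27), R3	// 630340f9: ldr x3, [x27]

Negative offsets take the subtraction form instead, e.g. MOVD -257(R17), R3
becomes SUB $257, R17, R27 followed by MOVD (R27), R3 (3b0604d1630340f9).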