From: Junchen Li Date: Mon, 31 Aug 2020 05:32:33 +0000 (+0800) Subject: cmd/asm: add more SIMD instructions on arm64 X-Git-Tag: go1.16beta1~1087 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=d7ab277eed4d2e5ede4f3361adf42d4ad76ced8f;p=gostls13.git cmd/asm: add more SIMD instructions on arm64 This CL adds USHLL, USHLL2, UZP1, UZP2, and BIF instructions requested by #40725. And since UXTL* are aliases of USHLL*, this CL also merges them into one case. Updates #40725 Change-Id: I404a4fdaf953319f72eea548175bec1097a2a816 Reviewed-on: https://go-review.googlesource.com/c/go/+/253659 Reviewed-by: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot --- diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s index 451ca749ba..e106ff2ae1 100644 --- a/src/cmd/asm/internal/asm/testdata/arm64.s +++ b/src/cmd/asm/internal/asm/testdata/arm64.s @@ -156,6 +156,26 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 VCMTST V2.B8, V29.B8, V2.B8 // a28f220e VCMTST V2.D2, V23.D2, V3.D2 // e38ee24e VSUB V2.B8, V30.B8, V30.B8 // de87222e + VUZP1 V0.B8, V30.B8, V1.B8 // c11b000e + VUZP1 V1.B16, V29.B16, V2.B16 // a21b014e + VUZP1 V2.H4, V28.H4, V3.H4 // 831b420e + VUZP1 V3.H8, V27.H8, V4.H8 // 641b434e + VUZP1 V28.S2, V2.S2, V5.S2 // 45189c0e + VUZP1 V29.S4, V1.S4, V6.S4 // 26189d4e + VUZP1 V30.D2, V0.D2, V7.D2 // 0718de4e + VUZP2 V0.D2, V30.D2, V1.D2 // c15bc04e + VUZP2 V30.D2, V0.D2, V29.D2 // 1d58de4e + VUSHLL $0, V30.B8, V30.H8 // dea7082f + VUSHLL $0, V30.H4, V29.S4 // dda7102f + VUSHLL $0, V29.S2, V2.D2 // a2a7202f + VUSHLL2 $0, V30.B16, V2.H8 // c2a7086f + VUSHLL2 $0, V30.H8, V30.S4 // dea7106f + VUSHLL2 $0, V29.S4, V2.D2 // a2a7206f + VUSHLL $7, V30.B8, V30.H8 // dea70f2f + VUSHLL $15, V30.H4, V29.S4 // dda71f2f + VUSHLL2 $31, V30.S4, V2.D2 // c2a73f6f + VBIF V0.B8, V30.B8, V1.B8 // c11fe02e + VBIF V30.B16, V0.B16, V2.B16 // 021cfe6e MOVD (R2)(R6.SXTW), R4 // 44c866f8 MOVD (R3)(R6), R5 // MOVD (R3)(R6*1), R5 // 656866f8 MOVD (R2)(R6), R4 // MOVD (R2)(R6*1), R4 // 446866f8 diff --git a/src/cmd/asm/internal/asm/testdata/arm64error.s b/src/cmd/asm/internal/asm/testdata/arm64error.s index 2a911b4cce..20b1f3e9f0 100644 --- a/src/cmd/asm/internal/asm/testdata/arm64error.s +++ b/src/cmd/asm/internal/asm/testdata/arm64error.s @@ -345,4 +345,12 @@ TEXT errors(SB),$0 VUXTL V30.D2, V30.H8 // ERROR "operand mismatch" VUXTL2 V20.B8, V21.H8 // ERROR "operand mismatch" VUXTL V3.D2, V4.B8 // ERROR "operand mismatch" + VUZP1 V0.B8, V30.B8, V1.B16 // ERROR "operand mismatch" + VUZP2 V0.Q1, V30.Q1, V1.Q1 // ERROR "invalid arrangement" + VUSHLL $0, V30.D2, V30.H8 // ERROR "operand mismatch" + VUSHLL2 $0, V20.B8, V21.H8 // ERROR "operand mismatch" + VUSHLL $8, V30.B8, V30.H8 // ERROR "shift amount out of range" + VUSHLL2 $32, V30.S4, V2.D2 // ERROR "shift amount out of range" + VBIF V0.B8, V1.B8, V2.B16 // ERROR "operand mismatch" + VBIF V0.D2, V1.D2, V2.D2 // ERROR "invalid arrangement" RET diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go index ab065e07e5..2839da1437 100644 --- a/src/cmd/internal/obj/arm64/a.out.go +++ b/src/cmd/internal/obj/arm64/a.out.go @@ -954,6 +954,7 @@ const ( AVADD AVADDP AVAND + AVBIF AVCMEQ AVCNT AVEOR @@ -986,6 +987,12 @@ const ( AVEXT AVRBIT AVUSHR + AVUSHLL + AVUSHLL2 + AVUXTL + AVUXTL2 + AVUZP1 + AVUZP2 AVSHL AVSRI AVBSL @@ -994,8 +1001,6 @@ const ( AVZIP1 AVZIP2 AVCMTST - AVUXTL - AVUXTL2 ALAST AB = obj.AJMP ABL = obj.ACALL diff --git a/src/cmd/internal/obj/arm64/anames.go b/src/cmd/internal/obj/arm64/anames.go index 8961f04b0c..48c066abfd 100644 --- a/src/cmd/internal/obj/arm64/anames.go +++ b/src/cmd/internal/obj/arm64/anames.go @@ -461,6 +461,7 @@ var Anames = []string{ "VADD", "VADDP", "VAND", + "VBIF", "VCMEQ", "VCNT", "VEOR", @@ -493,6 +494,12 @@ var Anames = []string{ "VEXT", "VRBIT", "VUSHR", + "VUSHLL", + "VUSHLL2", + "VUXTL", + "VUXTL2", + "VUZP1", + "VUZP2", "VSHL", "VSRI", "VBSL", @@ -501,7 +508,5 @@ var Anames = []string{ "VZIP1", "VZIP2", "VCMTST", - "VUXTL", - "VUXTL2", "LAST", } diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 7ce18d0f13..df4bbbbd35 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -480,6 +480,7 @@ var optab = []Optab{ {AVTBL, C_ARNG, C_NONE, C_LIST, C_ARNG, 100, 4, 0, 0, 0}, {AVUSHR, C_VCON, C_ARNG, C_NONE, C_ARNG, 95, 4, 0, 0, 0}, {AVZIP1, C_ARNG, C_ARNG, C_NONE, C_ARNG, 72, 4, 0, 0, 0}, + {AVUSHLL, C_VCON, C_ARNG, C_NONE, C_ARNG, 102, 4, 0, 0, 0}, {AVUXTL, C_ARNG, C_NONE, C_NONE, C_ARNG, 102, 4, 0, 0, 0}, /* conditional operations */ @@ -2751,6 +2752,9 @@ func buildop(ctxt *obj.Link) { oprangeset(AVBSL, t) oprangeset(AVBIT, t) oprangeset(AVCMTST, t) + oprangeset(AVUZP1, t) + oprangeset(AVUZP2, t) + oprangeset(AVBIF, t) case AVADD: oprangeset(AVSUB, t) @@ -2801,6 +2805,9 @@ func buildop(ctxt *obj.Link) { case AVUXTL: oprangeset(AVUXTL2, t) + case AVUSHLL: + oprangeset(AVUSHLL2, t) + case AVLD1R: oprangeset(AVLD2, t) oprangeset(AVLD2R, t) @@ -4177,7 +4184,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { rel.Add = 0 rel.Type = objabi.R_ARM64_GOTPCREL - case 72: /* vaddp/vand/vcmeq/vorr/vadd/veor/vfmla/vfmls/vbit/vbsl/vcmtst/vsub Vm., Vn., Vd. */ + case 72: /* vaddp/vand/vcmeq/vorr/vadd/veor/vfmla/vfmls/vbit/vbsl/vcmtst/vsub/vbif/vuzip1/vuzip2 Vm., Vn., Vd. */ af := int((p.From.Reg >> 5) & 15) af3 := int((p.Reg >> 5) & 15) at := int((p.To.Reg >> 5) & 15) @@ -4219,7 +4226,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { } switch p.As { - case AVORR, AVAND, AVEOR, AVBIT, AVBSL: + case AVORR, AVAND, AVEOR, AVBIT, AVBSL, AVBIF: if af != ARNG_16B && af != ARNG_8B { c.ctxt.Diag("invalid arrangement: %v", p) } @@ -4233,7 +4240,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { size = 0 case AVBSL: size = 1 - case AVORR, AVBIT: + case AVORR, AVBIT, AVBIF: size = 2 case AVFMLA, AVFMLS: if af == ARNG_2D { @@ -5120,56 +5127,44 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { case 101: // FOMVQ/FMOVD $vcon, Vd -> load from constant pool. o1 = c.omovlit(p.As, p, &p.From, int(p.To.Reg)) - case 102: // VUXTL{2} Vn., Vd. - af := int((p.From.Reg >> 5) & 15) - at := int((p.To.Reg >> 5) & 15) - var Q, immh uint32 - switch at { - case ARNG_8H: - if af == ARNG_8B { - immh = 1 - Q = 0 - } else if af == ARNG_16B { - immh = 1 - Q = 1 - } else { - c.ctxt.Diag("operand mismatch: %v\n", p) - } - case ARNG_4S: - if af == ARNG_4H { - immh = 2 - Q = 0 - } else if af == ARNG_8H { - immh = 2 - Q = 1 - } else { - c.ctxt.Diag("operand mismatch: %v\n", p) - } - case ARNG_2D: - if af == ARNG_2S { - immh = 4 - Q = 0 - } else if af == ARNG_4S { - immh = 4 - Q = 1 - } else { - c.ctxt.Diag("operand mismatch: %v\n", p) - } + case 102: /* vushll, vushll2, vuxtl, vuxtl2 */ + o1 = c.opirr(p, p.As) + rf := p.Reg + af := uint8((p.Reg >> 5) & 15) + at := uint8((p.To.Reg >> 5) & 15) + shift := int(p.From.Offset) + if p.As == AVUXTL || p.As == AVUXTL2 { + rf = p.From.Reg + af = uint8((p.From.Reg >> 5) & 15) + shift = 0 + } + + pack := func(q, x, y uint8) uint32 { + return uint32(q)<<16 | uint32(x)<<8 | uint32(y) + } + + var Q uint8 = uint8(o1>>30) & 1 + var immh, width uint8 + switch pack(Q, af, at) { + case pack(0, ARNG_8B, ARNG_8H): + immh, width = 1, 8 + case pack(1, ARNG_16B, ARNG_8H): + immh, width = 1, 8 + case pack(0, ARNG_4H, ARNG_4S): + immh, width = 2, 16 + case pack(1, ARNG_8H, ARNG_4S): + immh, width = 2, 16 + case pack(0, ARNG_2S, ARNG_2D): + immh, width = 4, 32 + case pack(1, ARNG_4S, ARNG_2D): + immh, width = 4, 32 default: c.ctxt.Diag("operand mismatch: %v\n", p) } - - if p.As == AVUXTL && Q == 1 { - c.ctxt.Diag("operand mismatch: %v\n", p) + if !(0 <= shift && shift <= int(width-1)) { + c.ctxt.Diag("shift amount out of range: %v\n", p) } - if p.As == AVUXTL2 && Q == 0 { - c.ctxt.Diag("operand mismatch: %v\n", p) - } - - o1 = c.oprrr(p, p.As) - rf := int((p.From.Reg) & 31) - rt := int((p.To.Reg) & 31) - o1 |= Q<<30 | immh<<19 | uint32((rf&31)<<5) | uint32(rt&31) + o1 |= uint32(immh)<<19 | uint32(shift)<<16 | uint32(rf&31)<<5 | uint32(p.To.Reg&31) } out[0] = o1 out[1] = o2 @@ -5802,6 +5797,9 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As) uint32 { case AVLD2R, AVLD4R: return 0xD<<24 | 3<<21 + case AVBIF: + return 1<<29 | 7<<25 | 7<<21 | 7<<10 + case AVBIT: return 1<<29 | 0x75<<21 | 7<<10 @@ -5811,8 +5809,11 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As) uint32 { case AVCMTST: return 0xE<<24 | 1<<21 | 0x23<<10 - case AVUXTL, AVUXTL2: - return 0x5e<<23 | 0x29<<10 + case AVUZP1: + return 7<<25 | 3<<11 + + case AVUZP2: + return 7<<25 | 1<<14 | 3<<11 } c.ctxt.Diag("%v: bad rrr %d %v", p, a, a) @@ -6011,6 +6012,12 @@ func (c *ctxt7) opirr(p *obj.Prog, a obj.As) uint32 { case AVSRI: return 0x5E<<23 | 17<<10 + + case AVUSHLL, AVUXTL: + return 1<<29 | 15<<24 | 0x29<<10 + + case AVUSHLL2, AVUXTL2: + return 3<<29 | 15<<24 | 0x29<<10 } c.ctxt.Diag("%v: bad irr %v", p, a)