From 8403d4ea9046c31b72602c45e9673c25ac12e7bf Mon Sep 17 00:00:00 2001
From: Meng Zhuo
Date: Wed, 20 Mar 2019 23:14:37 +0800
Subject: [PATCH] cmd/asm: add V[LD|ST][2-4] vector instructions on arm64

This change adds VLD2, VLD3, VLD4, VST2, VST3, VST4 (multiple
structures) for image and multimedia optimization.

Change-Id: Iae3538ef4434e436e3fb2f19153c58f918f773af
Reviewed-on: https://go-review.googlesource.com/c/go/+/166518
Run-TryBot: Cherry Zhang
TryBot-Result: Gobot Gobot
Reviewed-by: Cherry Zhang
---
 src/cmd/asm/internal/asm/testdata/arm64.s | 18 +++++
 src/cmd/internal/obj/arm64/a.out.go       |  6 ++
 src/cmd/internal/obj/arm64/anames.go      |  6 ++
 src/cmd/internal/obj/arm64/asm7.go        | 81 ++++++++++++++++++++---
 4 files changed, 102 insertions(+), 9 deletions(-)

diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s
index 7fb475fc39..9f19ff1e8d 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64.s
@@ -343,6 +343,15 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
 	VST1	[V0.S4, V1.S4], (R0)                           // 00a8004c
 	VLD1	(R30), [V15.S2, V16.S2]                        // cfab400c
 	VLD1.P	24(R30), [V3.S2,V4.S2,V5.S2]                   // c36bdf0c
+	VLD2	(R29), [V23.H8, V24.H8]                        // b787404c
+	VLD2.P	16(R0), [V18.B8, V19.B8]                       // 1280df0c
+	VLD2.P	(R1)(R2), [V15.S2, V16.S2]                     // VLD2.P (R1)(R2*1), [V15.S2,V16.S2] // 2f88c20c
+	VLD3	(R27), [V11.S4, V12.S4, V13.S4]                // 6b4b404c
+	VLD3.P	48(RSP), [V11.S4, V12.S4, V13.S4]              // eb4bdf4c
+	VLD3.P	(R30)(R2), [V14.D2, V15.D2, V16.D2]            // VLD3.P (R30)(R2*1), [V14.D2,V15.D2,V16.D2] // ce4fc24c
+	VLD4	(R15), [V10.H4, V11.H4, V12.H4, V13.H4]        // ea05400c
+	VLD4.P	32(R24), [V31.B8, V0.B8, V1.B8, V2.B8]         // 1f03df0c
+	VLD4.P	(R13)(R9), [V14.S2, V15.S2, V16.S2, V17.S2]    // VLD4.P (R13)(R9*1), [V14.S2,V15.S2,V16.S2,V17.S2] // ae09c90c
 	VST1.P	[V24.S2], 8(R2)                                // 58789f0c
 	VST1	[V29.S2, V30.S2], (R29)                        // bdab000c
 	VST1	[V14.H4, V15.H4, V16.H4], (R27)                // 6e67000c
@@ -352,6 +361,15 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
 	VST1.P	V4.D[1], 8(R0)                                 // 04849f4d
 	VST1.P	V4.D[1], (R0)(R1)                              // VST1.P V4.D[1], (R0)(R1*1) // 0484814d
 	VST1	V4.D[1], (R0)                                  // 0484004d
+	VST2	[V22.H8, V23.H8], (R23)                        // f686004c
+	VST2.P	[V14.H4, V15.H4], 16(R17)                      // 2e869f0c
+	VST2.P	[V14.H4, V15.H4], (R3)(R17)                    // VST2.P [V14.H4,V15.H4], (R3)(R17*1) // 6e84910c
+	VST3	[V1.D2, V2.D2, V3.D2], (R11)                   // 614d004c
+	VST3.P	[V18.S4, V19.S4, V20.S4], 48(R25)              // 324b9f4c
+	VST3.P	[V19.B8, V20.B8, V21.B8], (R3)(R7)             // VST3.P [V19.B8, V20.B8, V21.B8], (R3)(R7*1) // 7340870c
+	VST4	[V22.D2, V23.D2, V24.D2, V25.D2], (R3)         // 760c004c
+	VST4.P	[V14.D2, V15.D2, V16.D2, V17.D2], 64(R15)      // ee0d9f4c
+	VST4.P	[V24.B8, V25.B8, V26.B8, V27.B8], (R3)(R23)    // VST4.P [V24.B8, V25.B8, V26.B8, V27.B8], (R3)(R23*1) // 7800970c
 	FMOVS	F20, (R0)                                      // 140000bd
 	FMOVS.P	F20, 4(R0)                                     // 144400bc
 	FMOVS.W	F20, 4(R0)                                     // 144c00bc
diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go
index 53345b107a..f793cdc4f9 100644
--- a/src/cmd/internal/obj/arm64/a.out.go
+++ b/src/cmd/internal/obj/arm64/a.out.go
@@ -953,10 +953,16 @@ const (
 	AVEOR
 	AVMOV
 	AVLD1
+	AVLD2
+	AVLD3
+	AVLD4
 	AVORR
 	AVREV32
 	AVREV64
 	AVST1
+	AVST2
+	AVST3
+	AVST4
 	AVDUP
 	AVADDV
 	AVMOVI
diff --git a/src/cmd/internal/obj/arm64/anames.go b/src/cmd/internal/obj/arm64/anames.go
index 5af6fdc8de..621af6c195 100644
--- a/src/cmd/internal/obj/arm64/anames.go
+++ b/src/cmd/internal/obj/arm64/anames.go
@@ -460,10 +460,16 @@ var Anames = []string{
 	"VEOR",
 	"VMOV",
 	"VLD1",
+	"VLD2",
+	"VLD3",
+	"VLD4",
 	"VORR",
 	"VREV32",
 	"VREV64",
 	"VST1",
+	"VST2",
+	"VST3",
+	"VST4",
 	"VDUP",
 	"VADDV",
 	"VMOVI",
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index 4cae74ca44..88a447bcc8 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -780,16 +780,34 @@ var optab = []Optab{
 	{ASTLXR, C_REG, C_NONE, C_NONE, C_ZOREG, 59, 4, 0, 0, 0}, // RegTo2=C_REG
 	{ASTXP, C_PAIR, C_NONE, C_NONE, C_ZOREG, 59, 4, 0, 0, 0},
 
-	/* VLD1/VST1 */
+	/* VLD[1-4]/VST[1-4] */
 	{AVLD1, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
 	{AVLD1, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
 	{AVLD1, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
+	{AVLD2, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
+	{AVLD2, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
+	{AVLD2, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
+	{AVLD3, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
+	{AVLD3, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
+	{AVLD3, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
+	{AVLD4, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
+	{AVLD4, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
+	{AVLD4, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
 	{AVLD1, C_LOREG, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, C_XPOST},
 	{AVLD1, C_ROFF, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, C_XPOST},
 	{AVLD1, C_LOREG, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, 0},
 	{AVST1, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
 	{AVST1, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
 	{AVST1, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
+	{AVST2, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
+	{AVST2, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
+	{AVST2, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
+	{AVST3, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
+	{AVST3, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
+	{AVST3, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
+	{AVST4, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
+	{AVST4, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
+	{AVST4, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
 	{AVST1, C_ELEM, C_NONE, C_NONE, C_LOREG, 96, 4, 0, 0, C_XPOST},
 	{AVST1, C_ELEM, C_NONE, C_NONE, C_ROFF, 96, 4, 0, 0, C_XPOST},
 	{AVST1, C_ELEM, C_NONE, C_NONE, C_LOREG, 96, 4, 0, 0, 0},
@@ -2695,7 +2713,13 @@ func buildop(ctxt *obj.Link) {
 			AVCNT,
 			AVMOV,
 			AVLD1,
+			AVLD2,
+			AVLD3,
+			AVLD4,
 			AVST1,
+			AVST2,
+			AVST3,
+			AVST4,
 			AVTBL,
 			AVDUP,
 			AVMOVI,
@@ -2775,14 +2799,14 @@ func (c *ctxt7) checkindex(p *obj.Prog, index, maxindex int) {
 	}
 }
 
-/* checkoffset checks whether the immediate offset is valid for VLD1.P and VST1.P */
+/* checkoffset checks whether the immediate offset is valid for VLD[1-4].P and VST[1-4].P */
 func (c *ctxt7) checkoffset(p *obj.Prog, as obj.As) {
-	var offset, list, n int64
+	var offset, list, n, expect int64
 	switch as {
-	case AVLD1:
+	case AVLD1, AVLD2, AVLD3, AVLD4:
 		offset = p.From.Offset
 		list = p.To.Offset
-	case AVST1:
+	case AVST1, AVST2, AVST3, AVST4:
 		offset = p.To.Offset
 		list = p.From.Offset
 	default:
@@ -2808,6 +2832,21 @@ func (c *ctxt7) checkoffset(p *obj.Prog, as obj.As) {
 	if !(q == 0 && offset == n*8) && !(q == 1 && offset == n*16) {
 		c.ctxt.Diag("invalid post-increment offset: %v", p)
 	}
+
+	switch as {
+	case AVLD1, AVST1:
+		return
+	case AVLD2, AVST2:
+		expect = 2
+	case AVLD3, AVST3:
+		expect = 3
+	case AVLD4, AVST4:
+		expect = 4
+	}
+
+	if expect != n {
+		c.ctxt.Diag("expected %d registers, got %d: %v.", expect, n, p)
+	}
 }
 
 /* checkShiftAmount checks whether the index shift amount is valid */
@@ -4305,14 +4344,14 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		}
 		o1 |= (uint32(imm5&0x1f) << 16) | (uint32(rf&31) << 5) | uint32(rt&31)
 
-	case 81: /* vld1 (Rn), [Vt1., Vt2., ...] */
+	case 81: /* vld[1-4] (Rn), [Vt1., Vt2., ...] */
+		c.checkoffset(p, p.As)
 		r := int(p.From.Reg)
 		o1 = 3<<26 | 1<<22
 		if o.scond == C_XPOST {
 			o1 |= 1 << 23
 			if p.From.Index == 0 {
 				// immediate offset variant
-				c.checkoffset(p, p.As)
 				o1 |= 0x1f << 16
 			} else {
 				// register offset variant
@@ -4323,6 +4362,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			}
 		}
 		o1 |= uint32(p.To.Offset)
+		// cmd/asm/internal/arch/arm64.go:ARM64RegisterListOffset
+		// add opcode(bit 12-15) for vld1, mask it off if it's not vld1
+		o1 = c.maskOpvldvst(p, o1)
 		o1 |= uint32(r&31) << 5
 
 	case 82: /* vmov Rn, Vd. */
@@ -4410,14 +4452,14 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 
 		o1 |= (Q&1)<<30 | (size&3)<<22 | uint32(rf&31)<<5 | uint32(rt&31)
 
-	case 84: /* vst1 [Vt1., Vt2., ...], (Rn) */
+	case 84: /* vst[1-4] [Vt1., Vt2., ...], (Rn) */
+		c.checkoffset(p, p.As)
 		r := int(p.To.Reg)
 		o1 = 3 << 26
 		if o.scond == C_XPOST {
 			o1 |= 1 << 23
 			if p.To.Index == 0 {
 				// immediate offset variant
-				c.checkoffset(p, p.As)
 				o1 |= 0x1f << 16
 			} else {
 				// register offset variant
@@ -4428,6 +4470,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			}
 		}
 		o1 |= uint32(p.From.Offset)
+		// cmd/asm/internal/arch/arm64.go:ARM64RegisterListOffset
+		// add opcode(bit 12-15) for vst1, mask it off if it's not vst1
+		o1 = c.maskOpvldvst(p, o1)
 		o1 |= uint32(r&31) << 5
 
 	case 85: /* vaddv/vuaddlv Vn., Vd*/
@@ -6727,6 +6772,24 @@ func (c *ctxt7) opldpstp(p *obj.Prog, o *Optab, vo int32, rbase, rl, rh, ldp uin
 	return ret
 }
 
+func (c *ctxt7) maskOpvldvst(p *obj.Prog, o1 uint32) uint32 {
+	if p.As == AVLD1 || p.As == AVST1 {
+		return o1
+	}
+
+	o1 &^= 0xf000 // mask out "opcode" field (bit 12-15)
+	switch p.As {
+	case AVLD2, AVST2:
+		o1 |= 8 << 12
+	case AVLD3, AVST3:
+		o1 |= 4 << 12
+	case AVLD4, AVST4:
+	default:
+		c.ctxt.Diag("unsupported instruction:%v\n", p.As)
+	}
+	return o1
+}
+
 /*
  * size in log2(bytes)
  */
-- 
2.50.0
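A minimal usage sketch, not part of the CL above: it copies 64 bytes through the new post-indexed VLD4.P/VST4.P forms, using the same operand syntax as the arm64.s test cases in this patch. The function name copy64 and its Go-side declaration func copy64(dst, src *byte) are hypothetical. Because VLD4 de-interleaves the four structures into V0-V3 and VST4 interleaves them back in the same order, the pair behaves as a plain 64-byte copy.

#include "textflag.h"

// func copy64(dst, src *byte)   (hypothetical Go declaration)
// Loads 64 bytes as four D2 structures and stores them back with the
// matching interleave, so the memory contents are copied unchanged.
TEXT ·copy64(SB), NOSPLIT, $0-16
	MOVD	dst+0(FP), R1
	MOVD	src+8(FP), R2
	VLD4.P	64(R2), [V0.D2, V1.D2, V2.D2, V3.D2]
	VST4.P	[V0.D2, V1.D2, V2.D2, V3.D2], 64(R1)
	RET

Note that the post-increment amount must equal the total size of the register list (4 registers of 16 bytes here, hence 64), which is exactly the constraint the extended checkoffset now enforces for VLD[2-4].P and VST[2-4].P.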