From a96e117a58dce1d55fd83a7b3391fa667dd66652 Mon Sep 17 00:00:00 2001
From: Keith Randall <khr@golang.org>
Date: Mon, 23 Jan 2017 08:22:10 -0800
Subject: [PATCH] runtime: amd64, use 4-byte ops for memmove of 4 bytes
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

memmove used to use 2 2-byte load/store pairs to move 4 bytes.
When the result is loaded with a single 4-byte load, it caused
a store to load fowarding stall.  To avoid the stall,
special case memmove to use 4 byte ops for the 4 byte copy case.

We already have a special case for 8-byte copies.
386 already specializes 4-byte copies.
I'll do 2-byte copies also, but not for 1.8.

benchmark                 old ns/op     new ns/op     delta
BenchmarkIssue18740-8     7567          4799          -36.58%

3-byte copies get a bit slower.  Other copies are unchanged.
name         old time/op   new time/op   delta
Memmove/3-8   4.76ns Â± 5%   5.26ns Â± 3%  +10.50%  (p=0.000 n=10+10)

Fixes #18740

Change-Id: Iec82cbac0ecfee80fa3c8fc83828f9a1819c3c74
Reviewed-on: https://go-review.googlesource.com/35567
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
---
 src/runtime/memmove_amd64.s | 10 ++++++++--
 src/runtime/memmove_test.go | 20 ++++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index 464f5fdc1b..c2286d3edd 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -146,10 +146,16 @@ move_1or2:
 move_0:
 	RET
 move_3or4:
+	CMPQ	BX, $4
+	JB	move_3
+	MOVL	(SI), AX
+	MOVL	AX, (DI)
+	RET
+move_3:
 	MOVW	(SI), AX
-	MOVW	-2(SI)(BX*1), CX
+	MOVB	2(SI), CX
 	MOVW	AX, (DI)
-	MOVW	CX, -2(DI)(BX*1)
+	MOVB	CX, 2(DI)
 	RET
 move_5through7:
 	MOVL	(SI), AX
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index dbfa284c28..74b8753b5f 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -6,6 +6,7 @@ package runtime_test
 
 import (
 	"crypto/rand"
+	"encoding/binary"
 	"fmt"
 	"internal/race"
 	. "runtime"
@@ -447,3 +448,22 @@ func BenchmarkCopyFat1024(b *testing.B) {
 		_ = y
 	}
 }
+
+func BenchmarkIssue18740(b *testing.B) {
+	// This tests that memmove uses one 4-byte load/store to move 4 bytes.
+	// It used to do 2 2-byte load/stores, which leads to a pipeline stall
+	// when we try to read the result with one 4-byte load.
+	var buf [4]byte
+	for j := 0; j < b.N; j++ {
+		s := uint32(0)
+		for i := 0; i < 4096; i += 4 {
+			copy(buf[:], g[i:])
+			s += binary.LittleEndian.Uint32(buf[:])
+		}
+		sink = uint64(s)
+	}
+}
+
+// TODO: 2 byte and 8 byte benchmarks also.
+
+var g [4096]byte
-- 
2.50.0