From d7507e9d1109da424dd375365dc923257ebd0c23 Mon Sep 17 00:00:00 2001
From: Denis Nagorny
Date: Thu, 22 Sep 2016 04:05:39 +0300
Subject: [PATCH] runtime: improve memmove for amd64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Use AVX if available on 4th generation of Intel(TM) Core(TM) processors.

(collected on E5 2609v3 @1.9GHz)
name                        old speed      new speed      delta
Memmove/1-6                  158MB/s ± 0%   172MB/s ± 0%    +9.09%  (p=0.000 n=16+16)
Memmove/2-6                  316MB/s ± 0%   345MB/s ± 0%    +9.09%  (p=0.000 n=18+16)
Memmove/3-6                  517MB/s ± 0%   517MB/s ± 0%      ~     (p=0.445 n=16+16)
Memmove/4-6                  687MB/s ± 1%   690MB/s ± 0%    +0.35%  (p=0.000 n=20+17)
Memmove/5-6                  729MB/s ± 0%   729MB/s ± 0%    +0.01%  (p=0.000 n=16+18)
Memmove/6-6                  875MB/s ± 0%   875MB/s ± 0%    +0.01%  (p=0.000 n=18+18)
Memmove/7-6                 1.02GB/s ± 0%  1.02GB/s ± 1%      ~     (p=0.139 n=19+20)
Memmove/8-6                 1.26GB/s ± 0%  1.26GB/s ± 0%    +0.00%  (p=0.000 n=18+18)
Memmove/9-6                 1.42GB/s ± 0%  1.42GB/s ± 0%    +0.00%  (p=0.000 n=17+18)
Memmove/10-6                1.58GB/s ± 0%  1.58GB/s ± 0%    +0.00%  (p=0.000 n=19+19)
Memmove/11-6                1.74GB/s ± 0%  1.74GB/s ± 0%    +0.00%  (p=0.001 n=18+17)
Memmove/12-6                1.90GB/s ± 0%  1.90GB/s ± 0%    +0.00%  (p=0.000 n=19+19)
Memmove/13-6                2.05GB/s ± 0%  2.05GB/s ± 0%    +0.00%  (p=0.000 n=18+19)
Memmove/14-6                2.21GB/s ± 0%  2.21GB/s ± 0%    +0.00%  (p=0.000 n=16+20)
Memmove/15-6                2.37GB/s ± 0%  2.37GB/s ± 0%    +0.00%  (p=0.004 n=19+20)
Memmove/16-6                2.53GB/s ± 0%  2.53GB/s ± 0%    +0.00%  (p=0.000 n=16+16)
Memmove/32-6                4.67GB/s ± 0%  4.67GB/s ± 0%    +0.00%  (p=0.000 n=17+17)
Memmove/64-6                8.67GB/s ± 0%  8.64GB/s ± 0%    -0.33%  (p=0.000 n=18+17)
Memmove/128-6               12.6GB/s ± 0%  11.6GB/s ± 0%    -8.05%  (p=0.000 n=16+19)
Memmove/256-6               16.3GB/s ± 0%  16.6GB/s ± 0%    +1.66%  (p=0.000 n=20+18)
Memmove/512-6               21.5GB/s ± 0%  24.4GB/s ± 0%   +13.35%  (p=0.000 n=18+17)
Memmove/1024-6              24.7GB/s ± 0%  33.7GB/s ± 0%   +36.12%  (p=0.000 n=18+18)
Memmove/2048-6              27.3GB/s ± 0%  43.3GB/s ± 0%   +58.77%  (p=0.000 n=19+17)
Memmove/4096-6              37.5GB/s ± 0%  50.5GB/s ± 0%   +34.56%  (p=0.000 n=19+19)
MemmoveUnalignedDst/1-6      135MB/s ± 0%   146MB/s ± 0%    +7.69%  (p=0.000 n=16+14)
MemmoveUnalignedDst/2-6      271MB/s ± 0%   292MB/s ± 0%    +7.69%  (p=0.000 n=18+18)
MemmoveUnalignedDst/3-6      438MB/s ± 0%   438MB/s ± 0%      ~     (p=0.352 n=16+19)
MemmoveUnalignedDst/4-6      584MB/s ± 0%   584MB/s ± 0%      ~     (p=0.876 n=17+17)
MemmoveUnalignedDst/5-6      631MB/s ± 1%   632MB/s ± 0%    +0.25%  (p=0.000 n=20+17)
MemmoveUnalignedDst/6-6      759MB/s ± 0%   759MB/s ± 0%    +0.00%  (p=0.000 n=19+16)
MemmoveUnalignedDst/7-6      885MB/s ± 0%   883MB/s ± 1%      ~     (p=0.647 n=18+20)
MemmoveUnalignedDst/8-6     1.08GB/s ± 0%  1.08GB/s ± 0%    +0.00%  (p=0.035 n=19+18)
MemmoveUnalignedDst/9-6     1.22GB/s ± 0%  1.22GB/s ± 0%      ~     (p=0.251 n=18+17)
MemmoveUnalignedDst/10-6    1.35GB/s ± 0%  1.35GB/s ± 0%      ~     (p=0.327 n=17+18)
MemmoveUnalignedDst/11-6    1.49GB/s ± 0%  1.49GB/s ± 0%      ~     (p=0.531 n=18+19)
MemmoveUnalignedDst/12-6    1.63GB/s ± 0%  1.63GB/s ± 0%      ~     (p=0.886 n=19+18)
MemmoveUnalignedDst/13-6    1.76GB/s ± 0%  1.76GB/s ± 1%    -0.24%  (p=0.006 n=18+20)
MemmoveUnalignedDst/14-6    1.90GB/s ± 0%  1.90GB/s ± 0%      ~     (p=0.818 n=20+19)
MemmoveUnalignedDst/15-6    2.03GB/s ± 0%  2.03GB/s ± 0%      ~     (p=0.294 n=17+16)
MemmoveUnalignedDst/16-6    2.17GB/s ± 0%  2.17GB/s ± 0%      ~     (p=0.602 n=16+18)
MemmoveUnalignedDst/32-6    4.05GB/s ± 0%  4.05GB/s ± 0%    +0.00%  (p=0.010 n=18+17)
MemmoveUnalignedDst/64-6    7.59GB/s ± 0%  7.59GB/s ± 0%    +0.00%  (p=0.022 n=18+16)
MemmoveUnalignedDst/128-6   11.1GB/s ± 0%  11.4GB/s ± 0%    +2.79%  (p=0.000 n=18+17)
MemmoveUnalignedDst/256-6   16.4GB/s ± 0%  16.7GB/s ± 0%    +1.59%  (p=0.000 n=20+17)
MemmoveUnalignedDst/512-6   15.7GB/s ± 0%  21.3GB/s ± 0%   +35.87%  (p=0.000 n=18+20)
MemmoveUnalignedDst/1024-6  16.0GB/s ±20%  31.5GB/s ± 0%   +96.93%  (p=0.000 n=20+14)
MemmoveUnalignedDst/2048-6  19.6GB/s ± 0%  42.1GB/s ± 0%  +115.16%  (p=0.000 n=17+18)
MemmoveUnalignedDst/4096-6  6.41GB/s ± 0%  33.18GB/s ± 0%  +417.56%  (p=0.000 n=17+18)
MemmoveUnalignedSrc/1-6      171MB/s ± 0%   166MB/s ± 0%    -3.33%  (p=0.000 n=19+16)
MemmoveUnalignedSrc/2-6      343MB/s ± 0%   342MB/s ± 1%    -0.41%  (p=0.000 n=17+20)
MemmoveUnalignedSrc/3-6      508MB/s ± 0%   493MB/s ± 1%    -2.90%  (p=0.000 n=17+17)
MemmoveUnalignedSrc/4-6      677MB/s ± 0%   660MB/s ± 2%    -2.55%  (p=0.000 n=17+20)
MemmoveUnalignedSrc/5-6      790MB/s ± 0%   790MB/s ± 0%      ~     (p=0.139 n=17+17)
MemmoveUnalignedSrc/6-6      948MB/s ± 0%   946MB/s ± 1%      ~     (p=0.330 n=17+19)
MemmoveUnalignedSrc/7-6     1.11GB/s ± 0%  1.11GB/s ± 0%    -0.05%  (p=0.026 n=17+17)
MemmoveUnalignedSrc/8-6     1.38GB/s ± 0%  1.38GB/s ± 0%      ~     (p=0.091 n=18+16)
MemmoveUnalignedSrc/9-6     1.42GB/s ± 0%  1.40GB/s ± 1%    -1.04%  (p=0.000 n=19+20)
MemmoveUnalignedSrc/10-6    1.58GB/s ± 0%  1.56GB/s ± 1%    -1.15%  (p=0.000 n=18+19)
MemmoveUnalignedSrc/11-6    1.73GB/s ± 0%  1.71GB/s ± 1%    -1.30%  (p=0.000 n=20+20)
MemmoveUnalignedSrc/12-6    1.89GB/s ± 0%  1.87GB/s ± 1%    -1.18%  (p=0.000 n=17+20)
MemmoveUnalignedSrc/13-6    2.05GB/s ± 0%  2.02GB/s ± 1%    -1.18%  (p=0.000 n=17+20)
MemmoveUnalignedSrc/14-6    2.21GB/s ± 0%  2.18GB/s ± 1%    -1.14%  (p=0.000 n=17+20)
MemmoveUnalignedSrc/15-6    2.36GB/s ± 0%  2.34GB/s ± 1%    -1.04%  (p=0.000 n=17+20)
MemmoveUnalignedSrc/16-6    2.52GB/s ± 0%  2.49GB/s ± 1%    -1.26%  (p=0.000 n=19+20)
MemmoveUnalignedSrc/32-6    4.82GB/s ± 0%  4.61GB/s ± 0%    -4.40%  (p=0.000 n=19+20)
MemmoveUnalignedSrc/64-6    5.03GB/s ± 4%  7.97GB/s ± 0%   +58.55%  (p=0.000 n=20+16)
MemmoveUnalignedSrc/128-6   11.1GB/s ± 0%  11.2GB/s ± 0%    +0.52%  (p=0.000 n=17+18)
MemmoveUnalignedSrc/256-6   16.5GB/s ± 0%  16.4GB/s ± 0%    -0.10%  (p=0.000 n=20+18)
MemmoveUnalignedSrc/512-6   21.0GB/s ± 0%  22.1GB/s ± 0%    +5.48%  (p=0.000 n=14+17)
MemmoveUnalignedSrc/1024-6  24.9GB/s ± 0%  31.9GB/s ± 0%   +28.20%  (p=0.000 n=19+20)
MemmoveUnalignedSrc/2048-6  23.3GB/s ± 0%  33.8GB/s ± 0%   +45.22%  (p=0.000 n=17+19)
MemmoveUnalignedSrc/4096-6  37.3GB/s ± 0%  42.7GB/s ± 0%   +14.30%  (p=0.000 n=17+17)

Change-Id: Id66aa3e499ccfb117cb99d623ef326b50d057b64
Reviewed-on: https://go-review.googlesource.com/29590
Run-TryBot: Denis Nagorny
TryBot-Result: Gobot Gobot
Reviewed-by: Keith Randall
---
 src/runtime/cpuflags_amd64.go |  75 +++++++++++
 src/runtime/cpuidlow_amd64.s  |  22 +++
 src/runtime/memmove_amd64.s   | 243 +++++++++++++++++++++++++++++++++-
 src/runtime/memmove_test.go   | 104 +++++++++++++++
 4 files changed, 443 insertions(+), 1 deletion(-)
 create mode 100644 src/runtime/cpuflags_amd64.go
 create mode 100644 src/runtime/cpuidlow_amd64.s
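The dispatch policy added by this patch amounts to: take the AVX path whenever the CPU advertises AVX, the OS has enabled YMM state, and the part is not in the Sandy Bridge/Ivy Bridge family, where REP MOVS stays competitive. Outside the runtime, roughly the same check can be made with the golang.org/x/sys/cpu package, which performs the CPUID/XGETBV handshake internally. A minimal sketch, not part of the patch and ignoring the bridge-family exception:

    package main

    import (
        "fmt"

        "golang.org/x/sys/cpu"
    )

    func main() {
        // cpu.X86.HasAVX is set only when CPUID reports AVX *and* the OS
        // has enabled XMM/YMM state via XGETBV, mirroring the init check
        // in cpuflags_amd64.go below.
        if cpu.X86.HasAVX {
            fmt.Println("AVX memmove path would be eligible")
        } else {
            fmt.Println("the runtime would keep useRepMovs = true")
        }
    }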
diff --git a/src/runtime/cpuflags_amd64.go b/src/runtime/cpuflags_amd64.go
new file mode 100644
index 0000000000..026f0cd88e
--- /dev/null
+++ b/src/runtime/cpuflags_amd64.go
@@ -0,0 +1,75 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+var vendorStringBytes [12]byte
+var maxInputValue uint32
+var featureFlags uint32
+var processorVersionInfo uint32
+
+var useRepMovs = true
+
+func hasFeature(feature uint32) bool {
+	return (featureFlags & feature) != 0
+}
+
+func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32) // implemented in cpuidlow_amd64.s
+func xgetbv_low(arg1 uint32) (eax, edx uint32)                // implemented in cpuidlow_amd64.s
+
+func init() {
+	const cfOSXSAVE uint32 = 1 << 27
+	const cfAVX uint32 = 1 << 28
+
+	leaf0()
+	leaf1()
+
+	enabledAVX := false
+	// Let's check if OS has set CR4.OSXSAVE[bit 18]
+	// to enable XGETBV instruction.
+	if hasFeature(cfOSXSAVE) {
+		eax, _ := xgetbv_low(0)
+		// Let's check that XCR0[2:1] = ‘11b’
+		// i.e. XMM state and YMM state are enabled by OS.
+		enabledAVX = (eax & 0x6) == 0x6
+	}
+
+	isIntelBridgeFamily := (processorVersionInfo == 0x206A0 ||
+		processorVersionInfo == 0x206D0 ||
+		processorVersionInfo == 0x306A0 ||
+		processorVersionInfo == 0x306E0) &&
+		isIntel()
+
+	useRepMovs = !(hasFeature(cfAVX) && enabledAVX) || isIntelBridgeFamily
+}
+
+func leaf0() {
+	eax, ebx, ecx, edx := cpuid_low(0, 0)
+	maxInputValue = eax
+	int32ToBytes(ebx, vendorStringBytes[0:4])
+	int32ToBytes(edx, vendorStringBytes[4:8])
+	int32ToBytes(ecx, vendorStringBytes[8:12])
+}
+
+func leaf1() {
+	if maxInputValue < 1 {
+		return
+	}
+	eax, _, ecx, _ := cpuid_low(1, 0)
+	// Let's remove stepping and reserved fields
+	processorVersionInfo = eax & 0x0FFF3FF0
+	featureFlags = ecx
+}
+
+func int32ToBytes(arg uint32, buffer []byte) {
+	buffer[3] = byte(arg >> 24)
+	buffer[2] = byte(arg >> 16)
+	buffer[1] = byte(arg >> 8)
+	buffer[0] = byte(arg)
+}
+
+func isIntel() bool {
+	intelSignature := [12]byte{'G', 'e', 'n', 'u', 'i', 'n', 'e', 'I', 'n', 't', 'e', 'l'}
+	return vendorStringBytes == intelSignature
+}
diff --git a/src/runtime/cpuidlow_amd64.s b/src/runtime/cpuidlow_amd64.s
new file mode 100644
index 0000000000..64316c9e9f
--- /dev/null
+++ b/src/runtime/cpuidlow_amd64.s
@@ -0,0 +1,22 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid_low(SB), 4, $0-24
+	MOVL	arg1+0(FP), AX
+	MOVL	arg2+4(FP), CX
+	CPUID
+	MOVL	AX, eax+8(FP)
+	MOVL	BX, ebx+12(FP)
+	MOVL	CX, ecx+16(FP)
+	MOVL	DX, edx+20(FP)
+	RET
+// func xgetbv_low(arg1 uint32) (eax, edx uint32)
+TEXT ·xgetbv_low(SB), 4, $0-16
+	MOVL	arg1+0(FP), CX
+	// XGETBV
+	BYTE	$0x0F; BYTE $0x01; BYTE $0xD0
+	MOVL	AX, eax+8(FP)
+	MOVL	DX, edx+12(FP)
+	RET
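The four processorVersionInfo constants above are the masked CPUID leaf-1 signatures of Sandy Bridge (models 0x2A, 0x2D) and Ivy Bridge (models 0x3A, 0x3E) parts. For readers unfamiliar with that encoding, here is a small self-contained decoder following the standard CPUID display-family/display-model convention; decodeSignature is a hypothetical helper for illustration only, not part of the patch:

    package main

    import "fmt"

    // decodeSignature splits a masked CPUID.1:EAX value (stepping already
    // cleared, as cpuflags_amd64.go does) into display family and model.
    func decodeSignature(sig uint32) (family, model uint32) {
        family = (sig >> 8) & 0xF
        model = (sig >> 4) & 0xF
        if family == 0xF {
            family += (sig >> 20) & 0xFF // add extended family
        }
        if family == 0x6 || family == 0xF {
            model += ((sig >> 16) & 0xF) << 4 // add extended model
        }
        return
    }

    func main() {
        // Family 0x6, models 0x2A/0x2D are Sandy Bridge; 0x3A/0x3E are Ivy Bridge.
        for _, sig := range []uint32{0x206A0, 0x206D0, 0x306A0, 0x306E0} {
            f, m := decodeSignature(sig)
            fmt.Printf("sig %#x -> family 0x%X model 0x%X\n", sig, f, m)
        }
    }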
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index 5d23ce3e6c..464f5fdc1b 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -64,6 +64,9 @@ tail:
 	JBE	move_129through256
 	// TODO: use branch table and BSR to make this just a single dispatch
 
+	TESTB	$1, runtime·useRepMovs(SB)
+	JZ	avxUnaligned
+
 /*
  * check and set for backwards
  */
@@ -108,7 +111,6 @@ back:
 	ADDQ	BX, CX
 	CMPQ	CX, DI
 	JLS	forward
-
 /*
  * whole thing backwards has
  * adjusted addresses
@@ -273,3 +275,242 @@ move_256through2048:
 	LEAQ	256(DI), DI
 	JGE	move_256through2048
 	JMP	tail
+
+avxUnaligned:
+	// There are two implementations of the move algorithm.
+	// The first one is for non-overlapped memory regions; it uses forward copying.
+	// The second one is for overlapped regions; it uses backward copying.
+	MOVQ	DI, CX
+	SUBQ	SI, CX
+	// Now CX contains the distance between SRC and DEST.
+	CMPQ	CX, BX
+	// If the distance is less than the region length, the regions overlap.
+	JC	copy_backward
+
+	// Non-temporal copy would be better for big sizes.
+	CMPQ	BX, $0x100000
+	JAE	gobble_big_data_fwd
+
+	// Memory layout on the source side
+	// SI                                      CX
+	// |<---------BX before correction--------->|
+	// |       |<--BX corrected-->|             |
+	// |       |                  |<--- AX --->|
+	// |<-R11->|                  |<-128 bytes->|
+	// +----------------------------------------+
+	// | Head  | Body             | Tail        |
+	// +-------+------------------+-------------+
+	// ^       ^                  ^
+	// |       |                  |
+	// Save head into Y4          Save tail into X5..X12
+	//         |
+	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
+	// Algorithm:
+	// 1. Unaligned save of the tail's 128 bytes
+	// 2. Unaligned save of the head's 32 bytes
+	// 3. Destination-aligned copying of body (128 bytes per iteration)
+	// 4. Put head in its new place
+	// 5. Put tail in its new place
+	// It can be important to satisfy the processor's pipeline requirements for
+	// small sizes, as the cost of copying the unaligned regions is comparable
+	// with the cost of the main loop, so the code is slightly interleaved here.
+	// There is a cleaner implementation of this algorithm for bigger sizes,
+	// where the cost of copying the unaligned parts is negligible.
+	// You can see it after the gobble_big_data_fwd label.
+	LEAQ	(SI)(BX*1), CX
+	MOVQ	DI, R10
+	// CX points to the end of the buffer, so we step back slightly and use
+	// negative offsets from there.
+	MOVOU	-0x80(CX), X5
+	MOVOU	-0x70(CX), X6
+	MOVQ	$0x80, AX
+	// Align destination address
+	ANDQ	$-32, DI
+	ADDQ	$32, DI
+	// Continue tail saving.
+	MOVOU	-0x60(CX), X7
+	MOVOU	-0x50(CX), X8
+	// Make R11 the delta between the aligned and unaligned destination addresses.
+	MOVQ	DI, R11
+	SUBQ	R10, R11
+	// Continue tail saving.
+	MOVOU	-0x40(CX), X9
+	MOVOU	-0x30(CX), X10
+	// Adjust the bytes-to-copy value now that the unaligned part is prepared for copying.
+	SUBQ	R11, BX
+	// Continue tail saving.
+	MOVOU	-0x20(CX), X11
+	MOVOU	-0x10(CX), X12
+	// The tail will be put in its place after the main body is copied.
+	// It's time for the unaligned heading part.
+	VMOVDQU	(SI), Y4
+	// Adjust source address to point past head.
+	ADDQ	R11, SI
+	SUBQ	AX, BX
+	// Aligned memory copying here
+gobble_128_loop:
+	VMOVDQU	(SI), Y0
+	VMOVDQU	0x20(SI), Y1
+	VMOVDQU	0x40(SI), Y2
+	VMOVDQU	0x60(SI), Y3
+	ADDQ	AX, SI
+	VMOVDQA	Y0, (DI)
+	VMOVDQA	Y1, 0x20(DI)
+	VMOVDQA	Y2, 0x40(DI)
+	VMOVDQA	Y3, 0x60(DI)
+	ADDQ	AX, DI
+	SUBQ	AX, BX
+	JA	gobble_128_loop
+	// Now we can store the unaligned parts.
+	ADDQ	AX, BX
+	ADDQ	DI, BX
+	VMOVDQU	Y4, (R10)
+	VZEROUPPER
+	MOVOU	X5, -0x80(BX)
+	MOVOU	X6, -0x70(BX)
+	MOVOU	X7, -0x60(BX)
+	MOVOU	X8, -0x50(BX)
+	MOVOU	X9, -0x40(BX)
+	MOVOU	X10, -0x30(BX)
+	MOVOU	X11, -0x20(BX)
+	MOVOU	X12, -0x10(BX)
+	RET
+
+gobble_big_data_fwd:
+	// This is forward copying for big regions.
+	// It uses non-temporal mov instructions.
+	// Details of the algorithm are commented above for small sizes.
+	LEAQ	(SI)(BX*1), CX
+	MOVOU	-0x80(SI)(BX*1), X5
+	MOVOU	-0x70(CX), X6
+	MOVOU	-0x60(CX), X7
+	MOVOU	-0x50(CX), X8
+	MOVOU	-0x40(CX), X9
+	MOVOU	-0x30(CX), X10
+	MOVOU	-0x20(CX), X11
+	MOVOU	-0x10(CX), X12
+	VMOVDQU	(SI), Y4
+	MOVQ	DI, R8
+	ANDQ	$-32, DI
+	ADDQ	$32, DI
+	MOVQ	DI, R10
+	SUBQ	R8, R10
+	SUBQ	R10, BX
+	ADDQ	R10, SI
+	LEAQ	(DI)(BX*1), CX
+	SUBQ	$0x80, BX
+gobble_mem_fwd_loop:
+	PREFETCHNTA 0x1C0(SI)
+	PREFETCHNTA 0x280(SI)
+	// Prefetch values were chosen empirically.
+	// Approach for prefetch usage as in 7.6.6 of [1]
+	// [1] 64-ia-32-architectures-optimization-manual.pdf
+	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+	VMOVDQU	(SI), Y0
+	VMOVDQU	0x20(SI), Y1
+	VMOVDQU	0x40(SI), Y2
+	VMOVDQU	0x60(SI), Y3
+	ADDQ	$0x80, SI
+	VMOVNTDQ Y0, (DI)
+	VMOVNTDQ Y1, 0x20(DI)
+	VMOVNTDQ Y2, 0x40(DI)
+	VMOVNTDQ Y3, 0x60(DI)
+	ADDQ	$0x80, DI
+	SUBQ	$0x80, BX
+	JA	gobble_mem_fwd_loop
+	// NT instructions don't follow the normal cache-coherency rules.
+	// We need SFENCE here to make the copied data available in a timely manner.
+	SFENCE
+	VMOVDQU	Y4, (R8)
+	VZEROUPPER
+	MOVOU	X5, -0x80(CX)
+	MOVOU	X6, -0x70(CX)
+	MOVOU	X7, -0x60(CX)
+	MOVOU	X8, -0x50(CX)
+	MOVOU	X9, -0x40(CX)
+	MOVOU	X10, -0x30(CX)
+	MOVOU	X11, -0x20(CX)
+	MOVOU	X12, -0x10(CX)
+	RET
+
+copy_backward:
+	MOVQ	DI, AX
+	// Backward copying is about the same as the forward one.
+	// First we load the unaligned tail from the beginning of the region.
+	MOVOU	(SI), X5
+	MOVOU	0x10(SI), X6
+	ADDQ	BX, DI
+	MOVOU	0x20(SI), X7
+	MOVOU	0x30(SI), X8
+	LEAQ	-0x20(DI), R10
+	MOVQ	DI, R11
+	MOVOU	0x40(SI), X9
+	MOVOU	0x50(SI), X10
+	ANDQ	$0x1F, R11
+	MOVOU	0x60(SI), X11
+	MOVOU	0x70(SI), X12
+	XORQ	R11, DI
+	// Let's point SI to the end of the region
+	ADDQ	BX, SI
+	// and load the unaligned head into Y4.
+	VMOVDQU	-0x20(SI), Y4
+	SUBQ	R11, SI
+	SUBQ	R11, BX
+	// If there is enough data for non-temporal moves, go to the special loop.
+	CMPQ	BX, $0x100000
+	JA	gobble_big_data_bwd
+	SUBQ	$0x80, BX
+gobble_mem_bwd_loop:
+	VMOVDQU	-0x20(SI), Y0
+	VMOVDQU	-0x40(SI), Y1
+	VMOVDQU	-0x60(SI), Y2
+	VMOVDQU	-0x80(SI), Y3
+	SUBQ	$0x80, SI
+	VMOVDQA	Y0, -0x20(DI)
+	VMOVDQA	Y1, -0x40(DI)
+	VMOVDQA	Y2, -0x60(DI)
+	VMOVDQA	Y3, -0x80(DI)
+	SUBQ	$0x80, DI
+	SUBQ	$0x80, BX
+	JA	gobble_mem_bwd_loop
+	// Let's store the unaligned data
+	VMOVDQU	Y4, (R10)
+	VZEROUPPER
+	MOVOU	X5, (AX)
+	MOVOU	X6, 0x10(AX)
+	MOVOU	X7, 0x20(AX)
+	MOVOU	X8, 0x30(AX)
+	MOVOU	X9, 0x40(AX)
+	MOVOU	X10, 0x50(AX)
+	MOVOU	X11, 0x60(AX)
+	MOVOU	X12, 0x70(AX)
+	RET
+
+gobble_big_data_bwd:
+	SUBQ	$0x80, BX
+gobble_big_mem_bwd_loop:
+	PREFETCHNTA -0x1C0(SI)
+	PREFETCHNTA -0x280(SI)
+	VMOVDQU	-0x20(SI), Y0
+	VMOVDQU	-0x40(SI), Y1
+	VMOVDQU	-0x60(SI), Y2
+	VMOVDQU	-0x80(SI), Y3
+	SUBQ	$0x80, SI
+	VMOVNTDQ Y0, -0x20(DI)
+	VMOVNTDQ Y1, -0x40(DI)
+	VMOVNTDQ Y2, -0x60(DI)
+	VMOVNTDQ Y3, -0x80(DI)
+	SUBQ	$0x80, DI
+	SUBQ	$0x80, BX
+	JA	gobble_big_mem_bwd_loop
+	SFENCE
+	VMOVDQU	Y4, (R10)
+	VZEROUPPER
+	MOVOU	X5, (AX)
+	MOVOU	X6, 0x10(AX)
+	MOVOU	X7, 0x20(AX)
+	MOVOU	X8, 0x30(AX)
+	MOVOU	X9, 0x40(AX)
+	MOVOU	X10, 0x50(AX)
+	MOVOU	X11, 0x60(AX)
+	MOVOU	X12, 0x70(AX)
+	RET
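The head/body/tail trick in the forward path above is easier to see in a high-level rendering. Below is a rough Go sketch of the same idea; moveForward is a hypothetical name, the 32-byte block mirrors one YMM register, equal-length slices and no overlap are assumed, and the real code of course keeps head and tail in registers rather than in temporaries:

    package main

    import "fmt"

    // moveForward copies len(src) bytes the way the AVX path does:
    // save the unaligned head and tail first, copy the body, then
    // patch head and tail back in. Overlap handling is omitted.
    func moveForward(dst, src []byte) {
        const block = 32 // one YMM register's worth of data
        if len(src) <= 2*block {
            copy(dst, src) // small sizes take the non-AVX path anyway
            return
        }
        var head, tail [block]byte
        copy(tail[:], src[len(src)-block:]) // step 1: save unaligned tail
        copy(head[:], src[:block])          // step 2: save unaligned head
        // Step 3: body copy. The assembly additionally rounds dst up to a
        // 32-byte boundary here so the stores can use aligned VMOVDQA.
        copy(dst[block:len(dst)-block], src[block:len(src)-block])
        copy(dst[:block], head[:])          // step 4: put head in its place
        copy(dst[len(dst)-block:], tail[:]) // step 5: put tail in its place
    }

    func main() {
        src := []byte("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-0123456789")
        dst := make([]byte, len(src))
        moveForward(dst, src)
        fmt.Println(string(dst)) // prints the source string intact
    }

Saving head and tail up front is what lets the body loop run over aligned 128-byte chunks without any edge-case branches.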
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index 2124cb9d49..080ca28667 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -5,7 +5,9 @@
 package runtime_test
 
 import (
+	"crypto/rand"
 	"fmt"
+	"internal/race"
 	. "runtime"
 	"testing"
 )
"runtime" "testing" ) @@ -82,6 +84,108 @@ func TestMemmoveAlias(t *testing.T) { } } +func TestMemmoveLarge0x180000(t *testing.T) { + if race.Enabled { + t.Skip("skipping large memmove test under race detector") + } + testSize(t, 0x180000) +} + +func TestMemmoveOverlapLarge0x120000(t *testing.T) { + if race.Enabled { + t.Skip("skipping large memmove test under race detector") + } + testOverlap(t, 0x120000) +} + +func testSize(t *testing.T, size int) { + src := make([]byte, size) + dst := make([]byte, size) + _, _ = rand.Read(src) + _, _ = rand.Read(dst) + + ref := make([]byte, size) + copyref(ref, dst) + + for n := size - 50; n > 1; n >>= 1 { + for x := 0; x <= size-n; x = x*7 + 1 { // offset in src + for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst + copy(dst[y:y+n], src[x:x+n]) + copyref(ref[y:y+n], src[x:x+n]) + p := cmpb(dst, ref) + if p >= 0 { + t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, dst[p], ref[p]) + } + } + } + } +} + +func testOverlap(t *testing.T, size int) { + src := make([]byte, size) + test := make([]byte, size) + ref := make([]byte, size) + _, _ = rand.Read(src) + + for n := size - 50; n > 1; n >>= 1 { + for x := 0; x <= size-n; x = x*7 + 1 { // offset in src + for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst + // Reset input + copyref(test, src) + copyref(ref, src) + copy(test[y:y+n], test[x:x+n]) + if y <= x { + copyref(ref[y:y+n], ref[x:x+n]) + } else { + copybw(ref[y:y+n], ref[x:x+n]) + } + p := cmpb(test, ref) + if p >= 0 { + t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, test[p], ref[p]) + } + } + } + } + +} + +// Forward copy. +func copyref(dst, src []byte) { + for i, v := range src { + dst[i] = v + } +} + +// Backwards copy +func copybw(dst, src []byte) { + if len(src) == 0 { + return + } + for i := len(src) - 1; i >= 0; i-- { + dst[i] = src[i] + } +} + +// Returns offset of difference +func matchLen(a, b []byte, max int) int { + a = a[:max] + b = b[:max] + for i, av := range a { + if b[i] != av { + return i + } + } + return max +} + +func cmpb(a, b []byte) int { + l := matchLen(a, b, len(a)) + if l == len(a) { + return -1 + } + return l +} + func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) { for _, n := range sizes { b.Run(fmt.Sprint(n), func(b *testing.B) { -- 2.48.1