runtime: improve memmove for amd64
author    Denis Nagorny <denis.nagorny@intel.com>
          Thu, 28 Apr 2016 09:25:46 +0000 (12:25 +0300)
committer Keith Randall <khr@golang.org>
          Wed, 31 Aug 2016 16:03:30 +0000 (16:03 +0000)
Use AVX, if available, on 4th generation Intel(TM) Core(TM) processors.

(collected on E5 2609v3 @1.9GHz)
name                        old speed      new speed       delta
Memmove/1-6                  158MB/s ± 0%    172MB/s ± 0%    +9.09% (p=0.000 n=16+16)
Memmove/2-6                  316MB/s ± 0%    345MB/s ± 0%    +9.09% (p=0.000 n=18+16)
Memmove/3-6                  517MB/s ± 0%    517MB/s ± 0%      ~ (p=0.445 n=16+16)
Memmove/4-6                  687MB/s ± 1%    690MB/s ± 0%    +0.35% (p=0.000 n=20+17)
Memmove/5-6                  729MB/s ± 0%    729MB/s ± 0%    +0.01% (p=0.000 n=16+18)
Memmove/6-6                  875MB/s ± 0%    875MB/s ± 0%    +0.01% (p=0.000 n=18+18)
Memmove/7-6                 1.02GB/s ± 0%   1.02GB/s ± 1%      ~ (p=0.139 n=19+20)
Memmove/8-6                 1.26GB/s ± 0%   1.26GB/s ± 0%    +0.00% (p=0.000 n=18+18)
Memmove/9-6                 1.42GB/s ± 0%   1.42GB/s ± 0%    +0.00% (p=0.000 n=17+18)
Memmove/10-6                1.58GB/s ± 0%   1.58GB/s ± 0%    +0.00% (p=0.000 n=19+19)
Memmove/11-6                1.74GB/s ± 0%   1.74GB/s ± 0%    +0.00% (p=0.001 n=18+17)
Memmove/12-6                1.90GB/s ± 0%   1.90GB/s ± 0%    +0.00% (p=0.000 n=19+19)
Memmove/13-6                2.05GB/s ± 0%   2.05GB/s ± 0%    +0.00% (p=0.000 n=18+19)
Memmove/14-6                2.21GB/s ± 0%   2.21GB/s ± 0%    +0.00% (p=0.000 n=16+20)
Memmove/15-6                2.37GB/s ± 0%   2.37GB/s ± 0%    +0.00% (p=0.004 n=19+20)
Memmove/16-6                2.53GB/s ± 0%   2.53GB/s ± 0%    +0.00% (p=0.000 n=16+16)
Memmove/32-6                4.67GB/s ± 0%   4.67GB/s ± 0%    +0.00% (p=0.000 n=17+17)
Memmove/64-6                8.67GB/s ± 0%   8.64GB/s ± 0%    -0.33% (p=0.000 n=18+17)
Memmove/128-6               12.6GB/s ± 0%   11.6GB/s ± 0%    -8.05% (p=0.000 n=16+19)
Memmove/256-6               16.3GB/s ± 0%   16.6GB/s ± 0%    +1.66% (p=0.000 n=20+18)
Memmove/512-6               21.5GB/s ± 0%   24.4GB/s ± 0%   +13.35% (p=0.000 n=18+17)
Memmove/1024-6              24.7GB/s ± 0%   33.7GB/s ± 0%   +36.12% (p=0.000 n=18+18)
Memmove/2048-6              27.3GB/s ± 0%   43.3GB/s ± 0%   +58.77% (p=0.000 n=19+17)
Memmove/4096-6              37.5GB/s ± 0%   50.5GB/s ± 0%   +34.56% (p=0.000 n=19+19)
MemmoveUnalignedDst/1-6      135MB/s ± 0%    146MB/s ± 0%    +7.69% (p=0.000 n=16+14)
MemmoveUnalignedDst/2-6      271MB/s ± 0%    292MB/s ± 0%    +7.69% (p=0.000 n=18+18)
MemmoveUnalignedDst/3-6      438MB/s ± 0%    438MB/s ± 0%      ~ (p=0.352 n=16+19)
MemmoveUnalignedDst/4-6      584MB/s ± 0%    584MB/s ± 0%      ~ (p=0.876 n=17+17)
MemmoveUnalignedDst/5-6      631MB/s ± 1%    632MB/s ± 0%    +0.25% (p=0.000 n=20+17)
MemmoveUnalignedDst/6-6      759MB/s ± 0%    759MB/s ± 0%    +0.00% (p=0.000 n=19+16)
MemmoveUnalignedDst/7-6      885MB/s ± 0%    883MB/s ± 1%      ~ (p=0.647 n=18+20)
MemmoveUnalignedDst/8-6     1.08GB/s ± 0%   1.08GB/s ± 0%    +0.00% (p=0.035 n=19+18)
MemmoveUnalignedDst/9-6     1.22GB/s ± 0%   1.22GB/s ± 0%      ~ (p=0.251 n=18+17)
MemmoveUnalignedDst/10-6    1.35GB/s ± 0%   1.35GB/s ± 0%      ~ (p=0.327 n=17+18)
MemmoveUnalignedDst/11-6    1.49GB/s ± 0%   1.49GB/s ± 0%      ~ (p=0.531 n=18+19)
MemmoveUnalignedDst/12-6    1.63GB/s ± 0%   1.63GB/s ± 0%      ~ (p=0.886 n=19+18)
MemmoveUnalignedDst/13-6    1.76GB/s ± 0%   1.76GB/s ± 1%    -0.24% (p=0.006 n=18+20)
MemmoveUnalignedDst/14-6    1.90GB/s ± 0%   1.90GB/s ± 0%      ~ (p=0.818 n=20+19)
MemmoveUnalignedDst/15-6    2.03GB/s ± 0%   2.03GB/s ± 0%      ~ (p=0.294 n=17+16)
MemmoveUnalignedDst/16-6    2.17GB/s ± 0%   2.17GB/s ± 0%      ~ (p=0.602 n=16+18)
MemmoveUnalignedDst/32-6    4.05GB/s ± 0%   4.05GB/s ± 0%    +0.00% (p=0.010 n=18+17)
MemmoveUnalignedDst/64-6    7.59GB/s ± 0%   7.59GB/s ± 0%    +0.00% (p=0.022 n=18+16)
MemmoveUnalignedDst/128-6   11.1GB/s ± 0%   11.4GB/s ± 0%    +2.79% (p=0.000 n=18+17)
MemmoveUnalignedDst/256-6   16.4GB/s ± 0%   16.7GB/s ± 0%    +1.59% (p=0.000 n=20+17)
MemmoveUnalignedDst/512-6   15.7GB/s ± 0%   21.3GB/s ± 0%   +35.87% (p=0.000 n=18+20)
MemmoveUnalignedDst/1024-6  16.0GB/s ±20%   31.5GB/s ± 0%   +96.93% (p=0.000 n=20+14)
MemmoveUnalignedDst/2048-6  19.6GB/s ± 0%   42.1GB/s ± 0%  +115.16% (p=0.000 n=17+18)
MemmoveUnalignedDst/4096-6  6.41GB/s ± 0%  33.18GB/s ± 0%  +417.56% (p=0.000 n=17+18)
MemmoveUnalignedSrc/1-6      171MB/s ± 0%    166MB/s ± 0%    -3.33% (p=0.000 n=19+16)
MemmoveUnalignedSrc/2-6      343MB/s ± 0%    342MB/s ± 1%    -0.41% (p=0.000 n=17+20)
MemmoveUnalignedSrc/3-6      508MB/s ± 0%    493MB/s ± 1%    -2.90% (p=0.000 n=17+17)
MemmoveUnalignedSrc/4-6      677MB/s ± 0%    660MB/s ± 2%    -2.55% (p=0.000 n=17+20)
MemmoveUnalignedSrc/5-6      790MB/s ± 0%    790MB/s ± 0%      ~ (p=0.139 n=17+17)
MemmoveUnalignedSrc/6-6      948MB/s ± 0%    946MB/s ± 1%      ~ (p=0.330 n=17+19)
MemmoveUnalignedSrc/7-6     1.11GB/s ± 0%   1.11GB/s ± 0%    -0.05% (p=0.026 n=17+17)
MemmoveUnalignedSrc/8-6     1.38GB/s ± 0%   1.38GB/s ± 0%      ~ (p=0.091 n=18+16)
MemmoveUnalignedSrc/9-6     1.42GB/s ± 0%   1.40GB/s ± 1%    -1.04% (p=0.000 n=19+20)
MemmoveUnalignedSrc/10-6    1.58GB/s ± 0%   1.56GB/s ± 1%    -1.15% (p=0.000 n=18+19)
MemmoveUnalignedSrc/11-6    1.73GB/s ± 0%   1.71GB/s ± 1%    -1.30% (p=0.000 n=20+20)
MemmoveUnalignedSrc/12-6    1.89GB/s ± 0%   1.87GB/s ± 1%    -1.18% (p=0.000 n=17+20)
MemmoveUnalignedSrc/13-6    2.05GB/s ± 0%   2.02GB/s ± 1%    -1.18% (p=0.000 n=17+20)
MemmoveUnalignedSrc/14-6    2.21GB/s ± 0%   2.18GB/s ± 1%    -1.14% (p=0.000 n=17+20)
MemmoveUnalignedSrc/15-6    2.36GB/s ± 0%   2.34GB/s ± 1%    -1.04% (p=0.000 n=17+20)
MemmoveUnalignedSrc/16-6    2.52GB/s ± 0%   2.49GB/s ± 1%    -1.26% (p=0.000 n=19+20)
MemmoveUnalignedSrc/32-6    4.82GB/s ± 0%   4.61GB/s ± 0%    -4.40% (p=0.000 n=19+20)
MemmoveUnalignedSrc/64-6    5.03GB/s ± 4%   7.97GB/s ± 0%   +58.55% (p=0.000 n=20+16)
MemmoveUnalignedSrc/128-6   11.1GB/s ± 0%   11.2GB/s ± 0%    +0.52% (p=0.000 n=17+18)
MemmoveUnalignedSrc/256-6   16.5GB/s ± 0%   16.4GB/s ± 0%    -0.10% (p=0.000 n=20+18)
MemmoveUnalignedSrc/512-6   21.0GB/s ± 0%   22.1GB/s ± 0%    +5.48% (p=0.000 n=14+17)
MemmoveUnalignedSrc/1024-6  24.9GB/s ± 0%   31.9GB/s ± 0%   +28.20% (p=0.000 n=19+20)
MemmoveUnalignedSrc/2048-6  23.3GB/s ± 0%   33.8GB/s ± 0%   +45.22% (p=0.000 n=17+19)
MemmoveUnalignedSrc/4096-6  37.3GB/s ± 0%   42.7GB/s ± 0%   +14.30% (p=0.000 n=17+17)

Change-Id: Iab488d93a293cdf573ab5cd89b95a818bbb5d531
Reviewed-on: https://go-review.googlesource.com/22515
Run-TryBot: Denis Nagorny <denis.nagorny@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/runtime/cpuflags_amd64.go [new file with mode: 0644]
src/runtime/cpuidlow_amd64.s [new file with mode: 0644]
src/runtime/memmove_amd64.s
src/runtime/memmove_test.go [changed mode: 0644->0755]

diff --git a/src/runtime/cpuflags_amd64.go b/src/runtime/cpuflags_amd64.go
new file mode 100644 (file)
index 0000000..277b42c
--- /dev/null
@@ -0,0 +1,75 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+var vendorStringBytes [12]byte
+var maxInputValue uint32
+var featureFlags uint32
+var processorVersionInfo uint32
+
+var useRepMovs bool
+
+func hasFeature(feature uint32) bool {
+       return (featureFlags & feature) != 0
+}
+
+func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32) // implemented in cpuidlow_amd64.s
+func xgetbv_low(arg1 uint32) (eax, edx uint32)                // implemented in cpuidlow_amd64.s
+
+func init() {
+       const cfOSXSAVE uint32 = 1 << 27
+       const cfAVX uint32 = 1 << 28
+
+       leaf0()
+       leaf1()
+
+       enabledAVX := false
+       // Let's check whether the OS has set CR4.OSXSAVE[bit 18],
+       // which enables the XGETBV instruction.
+       if hasFeature(cfOSXSAVE) {
+               eax, _ := xgetbv_low(0)
+               // Let's check that XCR0[2:1] = '11b',
+               // i.e. XMM state and YMM state are enabled by the OS.
+               enabledAVX = (eax & 0x6) == 0x6
+       }
+
+       isIntelBridgeFamily := (processorVersionInfo == 0x206A0 ||
+               processorVersionInfo == 0x206D0 ||
+               processorVersionInfo == 0x306A0 ||
+               processorVersionInfo == 0x306E0) &&
+               isIntel()
+
+       useRepMovs = !(hasFeature(cfAVX) && enabledAVX) || isIntelBridgeFamily
+}
+
+func leaf0() {
+       eax, ebx, ecx, edx := cpuid_low(0, 0)
+       maxInputValue = eax
+       int32ToBytes(ebx, vendorStringBytes[0:4])
+       int32ToBytes(edx, vendorStringBytes[4:8])
+       int32ToBytes(ecx, vendorStringBytes[8:12])
+}
+
+func leaf1() {
+       if maxInputValue < 1 {
+               return
+       }
+       eax, _, ecx, _ := cpuid_low(1, 0)
+       // Let's mask out the stepping and reserved fields.
+       processorVersionInfo = eax & 0x0FFF3FF0
+       featureFlags = ecx
+}
+
+func int32ToBytes(arg uint32, buffer []byte) {
+       buffer[3] = byte(arg >> 24)
+       buffer[2] = byte(arg >> 16)
+       buffer[1] = byte(arg >> 8)
+       buffer[0] = byte(arg)
+}
+
+func isIntel() bool {
+       intelSignature := [12]byte{'G', 'e', 'n', 'u', 'i', 'n', 'e', 'I', 'n', 't', 'e', 'l'}
+       return vendorStringBytes == intelSignature
+}
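
The init function above gates the AVX path on two independent checks: CPUID.1:ECX must report both OSXSAVE (bit 27) and AVX (bit 28), and XGETBV must show XCR0[2:1] = 11b, meaning the OS saves XMM and YMM state on context switches. A minimal standalone Go sketch of how the two gates combine (illustrative only; avxUsable and the sample register values are not part of the patch):

package main

import "fmt"

const (
	cfOSXSAVE uint32 = 1 << 27 // CPUID.1:ECX - OS supports XSAVE/XGETBV
	cfAVX     uint32 = 1 << 28 // CPUID.1:ECX - CPU implements AVX
)

// avxUsable mirrors the decision in init(): the CPU must implement AVX
// and the OS must have enabled both XMM and YMM state saving.
func avxUsable(ecx, xcr0 uint32) bool {
	if ecx&cfOSXSAVE == 0 {
		return false // XGETBV would fault, so XCR0 cannot even be read
	}
	return ecx&cfAVX != 0 && xcr0&0x6 == 0x6
}

func main() {
	fmt.Println(avxUsable(cfOSXSAVE|cfAVX, 0x7)) // true: AVX usable
	fmt.Println(avxUsable(cfAVX, 0x7))           // false: no OSXSAVE
	fmt.Println(avxUsable(cfOSXSAVE|cfAVX, 0x1)) // false: OS didn't enable YMM state
}
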
diff --git a/src/runtime/cpuidlow_amd64.s b/src/runtime/cpuidlow_amd64.s
new file mode 100644 (file)
index 0000000..64316c9
--- /dev/null
@@ -0,0 +1,22 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid_low(SB), 4, $0-24
+    MOVL    arg1+0(FP), AX
+    MOVL    arg2+4(FP), CX
+    CPUID
+    MOVL AX, eax+8(FP)
+    MOVL BX, ebx+12(FP)
+    MOVL CX, ecx+16(FP)
+    MOVL DX, edx+20(FP)
+    RET
+// func xgetbv_low(arg1 uint32) (eax, edx uint32)
+TEXT ·xgetbv_low(SB), 4, $0-16
+    MOVL arg1+0(FP), CX
+    // XGETBV (emitted as raw bytes: the assembler has no mnemonic for it)
+    BYTE $0x0F; BYTE $0x01; BYTE $0xD0
+    MOVL AX,eax+8(FP)
+    MOVL DX,edx+12(FP)
+    RET
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index 5d23ce3e6c44de84269acb2594b6533ada8dcf5e..464f5fdc1b48b549030f885cb203291c92b2890c 100644 (file)
@@ -64,6 +64,9 @@ tail:
        JBE     move_129through256
        // TODO: use branch table and BSR to make this just a single dispatch
 
+       TESTB   $1, runtime·useRepMovs(SB)
+       JZ      avxUnaligned
+
 /*
  * check and set for backwards
  */
@@ -108,7 +111,6 @@ back:
        ADDQ    BX, CX
        CMPQ    CX, DI
        JLS     forward
-       
 /*
  * whole thing backwards has
  * adjusted addresses
@@ -273,3 +275,242 @@ move_256through2048:
        LEAQ    256(DI), DI
        JGE     move_256through2048
        JMP     tail
+
+avxUnaligned:
+       // There are two implementations of the move algorithm.
+       // The first one, for non-overlapping memory regions, uses forward copying.
+       // The second one, for overlapping regions, uses backward copying.
+       MOVQ    DI, CX
+       SUBQ    SI, CX
+       // Now CX holds the distance between SRC and DEST.
+       CMPQ    CX, BX
+       // If the distance is less than the region length, the regions overlap.
+       JC      copy_backward
+
+       // A non-temporal copy is better for big sizes.
+       CMPQ    BX, $0x100000
+       JAE     gobble_big_data_fwd
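
The JC above folds the overlap test into a single unsigned comparison: when the destination precedes the source, DI-SI wraps to a huge value, so "distance < length" holds exactly when the destination starts inside the source and a forward copy would clobber bytes before reading them. A Go sketch of the same trick (needBackward is an illustrative name, not part of the patch):

package main

import "fmt"

// needBackward reports whether dst lies inside [src, src+n): in unsigned
// arithmetic, dst-src is smaller than n only when src <= dst < src+n,
// and only then must the copy run backward.
func needBackward(dst, src, n uintptr) bool {
	return dst-src < n
}

func main() {
	fmt.Println(needBackward(100, 50, 40)) // false: disjoint regions
	fmt.Println(needBackward(80, 50, 40))  // true: dst starts inside src
	fmt.Println(needBackward(50, 80, 40))  // false: dst < src, forward is safe
}
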
+
+       // Memory layout on the source side
+       // SI                                       CX
+       // |<---------BX before correction--------->|
+       // |       |<--BX corrected-->|             |
+       // |       |                  |<--- AX  --->|
+       // |<-R11->|                  |<-128 bytes->|
+       // +----------------------------------------+
+       // | Head  | Body             | Tail        |
+       // +-------+------------------+-------------+
+       // ^       ^                  ^
+       // |       |                  |
+       // Save head into Y4          Save tail into X5..X12
+       //         |
+       //         SI+R11, where R11 = ((DI & -32) + 32) - DI
+       // Algorithm:
+       // 1. Unaligned save of the tail's 128 bytes
+       // 2. Unaligned save of the head's 32  bytes
+       // 3. Destination-aligned copying of body (128 bytes per iteration)
+       // 4. Put head on the new place
+       // 5. Put the tail on the new place
+       // For small sizes it is important to keep the processor's pipeline
+       // busy, since the cost of copying the unaligned regions is comparable
+       // to the cost of the main loop, so the steps are slightly interleaved.
+       // There is a cleaner implementation of this algorithm for bigger
+       // sizes, where the cost of copying the unaligned parts is negligible;
+       // see it after the gobble_big_data_fwd label.
+       LEAQ    (SI)(BX*1), CX
+       MOVQ    DI, R10
+       // CX points to the end of the buffer, so we step back from it using negative offsets.
+       MOVOU   -0x80(CX), X5
+       MOVOU   -0x70(CX), X6
+       MOVQ    $0x80, AX
+       // Align destination address
+       ANDQ    $-32, DI
+       ADDQ    $32, DI
+       // Continue tail saving.
+       MOVOU   -0x60(CX), X7
+       MOVOU   -0x50(CX), X8
+       // Make R11 delta between aligned and unaligned destination addresses.
+       MOVQ    DI, R11
+       SUBQ    R10, R11
+       // Continue tail saving.
+       MOVOU   -0x40(CX), X9
+       MOVOU   -0x30(CX), X10
+       // Adjust the bytes-to-copy count, since the unaligned parts are handled separately.
+       SUBQ    R11, BX
+       // Continue tail saving.
+       MOVOU   -0x20(CX), X11
+       MOVOU   -0x10(CX), X12
+       // The tail will be put in its place after the main body is copied.
+       // Now for the unaligned head.
+       VMOVDQU (SI), Y4
+       // Adjust source address to point past head.
+       ADDQ    R11, SI
+       SUBQ    AX, BX
+       // The destination-aligned body copy starts here.
+gobble_128_loop:
+       VMOVDQU (SI), Y0
+       VMOVDQU 0x20(SI), Y1
+       VMOVDQU 0x40(SI), Y2
+       VMOVDQU 0x60(SI), Y3
+       ADDQ    AX, SI
+       VMOVDQA Y0, (DI)
+       VMOVDQA Y1, 0x20(DI)
+       VMOVDQA Y2, 0x40(DI)
+       VMOVDQA Y3, 0x60(DI)
+       ADDQ    AX, DI
+       SUBQ    AX, BX
+       JA      gobble_128_loop
+       // Now we can store unaligned parts.
+       ADDQ    AX, BX
+       ADDQ    DI, BX
+       VMOVDQU Y4, (R10)
+       VZEROUPPER
+       MOVOU   X5, -0x80(BX)
+       MOVOU   X6, -0x70(BX)
+       MOVOU   X7, -0x60(BX)
+       MOVOU   X8, -0x50(BX)
+       MOVOU   X9, -0x40(BX)
+       MOVOU   X10, -0x30(BX)
+       MOVOU   X11, -0x20(BX)
+       MOVOU   X12, -0x10(BX)
+       RET
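
Rendered in Go for clarity, the head/body/tail scheme above looks roughly like the sketch below (forwardHeadBodyTail is illustrative and assumes non-overlapping buffers of at least 160 bytes; the real code additionally rounds the body's destination up to a 32-byte boundary, which plain slices cannot express):

package main

import (
	"bytes"
	"fmt"
)

// forwardHeadBodyTail sketches steps 1-5 of the algorithm described above.
func forwardHeadBodyTail(dst, src []byte) {
	n := len(src)
	var tail [128]byte
	var head [32]byte
	copy(tail[:], src[n-128:]) // 1. unaligned save of the tail's 128 bytes
	copy(head[:], src[:32])    // 2. unaligned save of the head's 32 bytes
	// 3. copy the body; the asm does this 128 bytes per iteration through
	//    a 32-byte-aligned destination pointer.
	copy(dst[32:n-128], src[32:n-128])
	copy(dst[:32], head[:])    // 4. put the head in its place
	copy(dst[n-128:], tail[:]) // 5. put the tail in its place
}

func main() {
	src := bytes.Repeat([]byte{1, 2, 3, 4, 5}, 64) // 320 bytes
	dst := make([]byte, len(src))
	forwardHeadBodyTail(dst, src)
	fmt.Println(bytes.Equal(dst, src)) // true
}
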
+
+gobble_big_data_fwd:
+       // Forward copying for big regions, using non-temporal move instructions.
+       // The details of this algorithm are commented above, in the small-size path.
+       LEAQ    (SI)(BX*1), CX
+       MOVOU   -0x80(SI)(BX*1), X5
+       MOVOU   -0x70(CX), X6
+       MOVOU   -0x60(CX), X7
+       MOVOU   -0x50(CX), X8
+       MOVOU   -0x40(CX), X9
+       MOVOU   -0x30(CX), X10
+       MOVOU   -0x20(CX), X11
+       MOVOU   -0x10(CX), X12
+       VMOVDQU (SI), Y4
+       MOVQ    DI, R8
+       ANDQ    $-32, DI
+       ADDQ    $32, DI
+       MOVQ    DI, R10
+       SUBQ    R8, R10
+       SUBQ    R10, BX
+       ADDQ    R10, SI
+       LEAQ    (DI)(BX*1), CX
+       SUBQ    $0x80, BX
+gobble_mem_fwd_loop:
+       PREFETCHNTA 0x1C0(SI)
+       PREFETCHNTA 0x280(SI)
+       // The prefetch offsets were chosen empirically.
+       // The approach to prefetch usage follows 7.6.6 of [1].
+       // [1] 64-ia-32-architectures-optimization-manual.pdf
+       // http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+       VMOVDQU (SI), Y0
+       VMOVDQU 0x20(SI), Y1
+       VMOVDQU 0x40(SI), Y2
+       VMOVDQU 0x60(SI), Y3
+       ADDQ    $0x80, SI
+       VMOVNTDQ Y0, (DI)
+       VMOVNTDQ Y1, 0x20(DI)
+       VMOVNTDQ Y2, 0x40(DI)
+       VMOVNTDQ Y3, 0x60(DI)
+       ADDQ    $0x80, DI
+       SUBQ    $0x80, BX
+       JA              gobble_mem_fwd_loop
+       // NT instructions don't follow the normal cache-coherency rules, so
+       // we need an SFENCE here to make the copied data promptly visible.
+       SFENCE
+       VMOVDQU Y4, (R8)
+       VZEROUPPER
+       MOVOU   X5, -0x80(CX)
+       MOVOU   X6, -0x70(CX)
+       MOVOU   X7, -0x60(CX)
+       MOVOU   X8, -0x50(CX)
+       MOVOU   X9, -0x40(CX)
+       MOVOU   X10, -0x30(CX)
+       MOVOU   X11, -0x20(CX)
+       MOVOU   X12, -0x10(CX)
+       RET
+
+copy_backward:
+       MOVQ    DI, AX
+       // Backward copying is about the same as the forward one.
+       // First we load the unaligned 128 bytes from the beginning of the region.
+       MOVOU   (SI), X5
+       MOVOU   0x10(SI), X6
+       ADDQ    BX, DI
+       MOVOU   0x20(SI), X7
+       MOVOU   0x30(SI), X8
+       LEAQ    -0x20(DI), R10
+       MOVQ    DI, R11
+       MOVOU   0x40(SI), X9
+       MOVOU   0x50(SI), X10
+       ANDQ    $0x1F, R11
+       MOVOU   0x60(SI), X11
+       MOVOU   0x70(SI), X12
+       XORQ    R11, DI
+       // Let's point SI to the end of the region
+       ADDQ    BX, SI
+       // and load the unaligned head into Y4.
+       VMOVDQU -0x20(SI), Y4
+       SUBQ    R11, SI
+       SUBQ    R11, BX
+       // If there is enough data for non-temporal moves, go to the special loop.
+       CMPQ    BX, $0x100000
+       JA              gobble_big_data_bwd
+       SUBQ    $0x80, BX
+gobble_mem_bwd_loop:
+       VMOVDQU -0x20(SI), Y0
+       VMOVDQU -0x40(SI), Y1
+       VMOVDQU -0x60(SI), Y2
+       VMOVDQU -0x80(SI), Y3
+       SUBQ    $0x80, SI
+       VMOVDQA Y0, -0x20(DI)
+       VMOVDQA Y1, -0x40(DI)
+       VMOVDQA Y2, -0x60(DI)
+       VMOVDQA Y3, -0x80(DI)
+       SUBQ    $0x80, DI
+       SUBQ    $0x80, BX
+       JA              gobble_mem_bwd_loop
+       // Let's store the unaligned parts.
+       VMOVDQU Y4, (R10)
+       VZEROUPPER
+       MOVOU   X5, (AX)
+       MOVOU   X6, 0x10(AX)
+       MOVOU   X7, 0x20(AX)
+       MOVOU   X8, 0x30(AX)
+       MOVOU   X9, 0x40(AX)
+       MOVOU   X10, 0x50(AX)
+       MOVOU   X11, 0x60(AX)
+       MOVOU   X12, 0x70(AX)
+       RET
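
The ANDQ $0x1F, R11 / XORQ R11, DI pair above rounds DI down to a 32-byte boundary: R11 keeps only the low five bits of DI, and XORing them off is equivalent to masking them, since x ^ (x & 31) == x &^ 31. A quick illustrative check in Go:

package main

import "fmt"

func main() {
	for _, x := range []uintptr{64, 65, 95, 96, 100} {
		low := x & 0x1F                     // the bits ANDQ keeps in R11
		fmt.Println(x, x^low, x^low == x&^31) // rounded-down value; always true
	}
}
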
+
+gobble_big_data_bwd:
+       SUBQ    $0x80, BX
+gobble_big_mem_bwd_loop:
+       PREFETCHNTA -0x1C0(SI)
+       PREFETCHNTA -0x280(SI)
+       VMOVDQU -0x20(SI), Y0
+       VMOVDQU -0x40(SI), Y1
+       VMOVDQU -0x60(SI), Y2
+       VMOVDQU -0x80(SI), Y3
+       SUBQ    $0x80, SI
+       VMOVNTDQ        Y0, -0x20(DI)
+       VMOVNTDQ        Y1, -0x40(DI)
+       VMOVNTDQ        Y2, -0x60(DI)
+       VMOVNTDQ        Y3, -0x80(DI)
+       SUBQ    $0x80, DI
+       SUBQ    $0x80, BX
+       JA      gobble_big_mem_bwd_loop
+       SFENCE
+       VMOVDQU Y4, (R10)
+       VZEROUPPER
+       MOVOU   X5, (AX)
+       MOVOU   X6, 0x10(AX)
+       MOVOU   X7, 0x20(AX)
+       MOVOU   X8, 0x30(AX)
+       MOVOU   X9, 0x40(AX)
+       MOVOU   X10, 0x50(AX)
+       MOVOU   X11, 0x60(AX)
+       MOVOU   X12, 0x70(AX)
+       RET
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
old mode 100644 (file)
new mode 100755 (executable)
index 2124cb9..080ca28
@@ -5,7 +5,9 @@
 package runtime_test
 
 import (
+       "crypto/rand"
        "fmt"
+       "internal/race"
        . "runtime"
        "testing"
 )
@@ -82,6 +84,108 @@ func TestMemmoveAlias(t *testing.T) {
        }
 }
 
+func TestMemmoveLarge0x180000(t *testing.T) {
+       if race.Enabled {
+               t.Skip("skipping large memmove test under race detector")
+       }
+       testSize(t, 0x180000)
+}
+
+func TestMemmoveOverlapLarge0x120000(t *testing.T) {
+       if race.Enabled {
+               t.Skip("skipping large memmove test under race detector")
+       }
+       testOverlap(t, 0x120000)
+}
+
+func testSize(t *testing.T, size int) {
+       src := make([]byte, size)
+       dst := make([]byte, size)
+       _, _ = rand.Read(src)
+       _, _ = rand.Read(dst)
+
+       ref := make([]byte, size)
+       copyref(ref, dst)
+
+       for n := size - 50; n > 1; n >>= 1 {
+               for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
+                       for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
+                               copy(dst[y:y+n], src[x:x+n])
+                               copyref(ref[y:y+n], src[x:x+n])
+                               p := cmpb(dst, ref)
+                               if p >= 0 {
+                                       t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, dst[p], ref[p])
+                               }
+                       }
+               }
+       }
+}
+
+func testOverlap(t *testing.T, size int) {
+       src := make([]byte, size)
+       test := make([]byte, size)
+       ref := make([]byte, size)
+       _, _ = rand.Read(src)
+
+       for n := size - 50; n > 1; n >>= 1 {
+               for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
+                       for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
+                               // Reset input
+                               copyref(test, src)
+                               copyref(ref, src)
+                               copy(test[y:y+n], test[x:x+n])
+                               if y <= x {
+                                       copyref(ref[y:y+n], ref[x:x+n])
+                               } else {
+                                       copybw(ref[y:y+n], ref[x:x+n])
+                               }
+                               p := cmpb(test, ref)
+                               if p >= 0 {
+                                       t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, test[p], ref[p])
+                               }
+                       }
+               }
+       }
+
+}
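
testOverlap picks the reference copy direction from the offsets because, within a single buffer, a naive loop running in the wrong direction reads bytes it has already overwritten. A small illustration (not part of the test file):

package main

import "fmt"

func main() {
	a := []byte{1, 2, 3, 4, 5}
	// Move a[0:3] into a[1:4] with a naive forward loop: each source
	// byte is overwritten before the next iteration reads it.
	for i := 0; i < 3; i++ {
		a[1+i] = a[i]
	}
	fmt.Println(a) // [1 1 1 1 5]; a correct memmove gives [1 1 2 3 5]
}
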
+
+// Forward copy.
+func copyref(dst, src []byte) {
+       for i, v := range src {
+               dst[i] = v
+       }
+}
+
+// Backward copy.
+func copybw(dst, src []byte) {
+       if len(src) == 0 {
+               return
+       }
+       for i := len(src) - 1; i >= 0; i-- {
+               dst[i] = src[i]
+       }
+}
+
+// matchLen returns the offset of the first difference between a and b,
+// or max if the first max bytes match.
+func matchLen(a, b []byte, max int) int {
+       a = a[:max]
+       b = b[:max]
+       for i, av := range a {
+               if b[i] != av {
+                       return i
+               }
+       }
+       return max
+}
+
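+// cmpb returns the offset of the first differing byte, or -1 if a matches b over len(a).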
+func cmpb(a, b []byte) int {
+       l := matchLen(a, b, len(a))
+       if l == len(a) {
+               return -1
+       }
+       return l
+}
+
 func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) {
        for _, n := range sizes {
                b.Run(fmt.Sprint(n), func(b *testing.B) {