Use AVX for memmove if it is available, e.g. on 4th generation Intel(TM) Core(TM) processors.
(collected on an Intel Xeon E5-2609 v3 @ 1.90GHz)
name old speed new speed delta
Memmove/1-6 158MB/s ± 0% 172MB/s ± 0% +9.09% (p=0.000 n=16+16)
Memmove/2-6 316MB/s ± 0% 345MB/s ± 0% +9.09% (p=0.000 n=18+16)
Memmove/3-6 517MB/s ± 0% 517MB/s ± 0% ~ (p=0.445 n=16+16)
Memmove/4-6 687MB/s ± 1% 690MB/s ± 0% +0.35% (p=0.000 n=20+17)
Memmove/5-6 729MB/s ± 0% 729MB/s ± 0% +0.01% (p=0.000 n=16+18)
Memmove/6-6 875MB/s ± 0% 875MB/s ± 0% +0.01% (p=0.000 n=18+18)
Memmove/7-6 1.02GB/s ± 0% 1.02GB/s ± 1% ~ (p=0.139 n=19+20)
Memmove/8-6 1.26GB/s ± 0% 1.26GB/s ± 0% +0.00% (p=0.000 n=18+18)
Memmove/9-6 1.42GB/s ± 0% 1.42GB/s ± 0% +0.00% (p=0.000 n=17+18)
Memmove/10-6 1.58GB/s ± 0% 1.58GB/s ± 0% +0.00% (p=0.000 n=19+19)
Memmove/11-6 1.74GB/s ± 0% 1.74GB/s ± 0% +0.00% (p=0.001 n=18+17)
Memmove/12-6 1.90GB/s ± 0% 1.90GB/s ± 0% +0.00% (p=0.000 n=19+19)
Memmove/13-6 2.05GB/s ± 0% 2.05GB/s ± 0% +0.00% (p=0.000 n=18+19)
Memmove/14-6 2.21GB/s ± 0% 2.21GB/s ± 0% +0.00% (p=0.000 n=16+20)
Memmove/15-6 2.37GB/s ± 0% 2.37GB/s ± 0% +0.00% (p=0.004 n=19+20)
Memmove/16-6 2.53GB/s ± 0% 2.53GB/s ± 0% +0.00% (p=0.000 n=16+16)
Memmove/32-6 4.67GB/s ± 0% 4.67GB/s ± 0% +0.00% (p=0.000 n=17+17)
Memmove/64-6 8.67GB/s ± 0% 8.64GB/s ± 0% -0.33% (p=0.000 n=18+17)
Memmove/128-6 12.6GB/s ± 0% 11.6GB/s ± 0% -8.05% (p=0.000 n=16+19)
Memmove/256-6 16.3GB/s ± 0% 16.6GB/s ± 0% +1.66% (p=0.000 n=20+18)
Memmove/512-6 21.5GB/s ± 0% 24.4GB/s ± 0% +13.35% (p=0.000 n=18+17)
Memmove/1024-6 24.7GB/s ± 0% 33.7GB/s ± 0% +36.12% (p=0.000 n=18+18)
Memmove/2048-6 27.3GB/s ± 0% 43.3GB/s ± 0% +58.77% (p=0.000 n=19+17)
Memmove/4096-6 37.5GB/s ± 0% 50.5GB/s ± 0% +34.56% (p=0.000 n=19+19)
MemmoveUnalignedDst/1-6 135MB/s ± 0% 146MB/s ± 0% +7.69% (p=0.000 n=16+14)
MemmoveUnalignedDst/2-6 271MB/s ± 0% 292MB/s ± 0% +7.69% (p=0.000 n=18+18)
MemmoveUnalignedDst/3-6 438MB/s ± 0% 438MB/s ± 0% ~ (p=0.352 n=16+19)
MemmoveUnalignedDst/4-6 584MB/s ± 0% 584MB/s ± 0% ~ (p=0.876 n=17+17)
MemmoveUnalignedDst/5-6 631MB/s ± 1% 632MB/s ± 0% +0.25% (p=0.000 n=20+17)
MemmoveUnalignedDst/6-6 759MB/s ± 0% 759MB/s ± 0% +0.00% (p=0.000 n=19+16)
MemmoveUnalignedDst/7-6 885MB/s ± 0% 883MB/s ± 1% ~ (p=0.647 n=18+20)
MemmoveUnalignedDst/8-6 1.08GB/s ± 0% 1.08GB/s ± 0% +0.00% (p=0.035 n=19+18)
MemmoveUnalignedDst/9-6 1.22GB/s ± 0% 1.22GB/s ± 0% ~ (p=0.251 n=18+17)
MemmoveUnalignedDst/10-6 1.35GB/s ± 0% 1.35GB/s ± 0% ~ (p=0.327 n=17+18)
MemmoveUnalignedDst/11-6 1.49GB/s ± 0% 1.49GB/s ± 0% ~ (p=0.531 n=18+19)
MemmoveUnalignedDst/12-6 1.63GB/s ± 0% 1.63GB/s ± 0% ~ (p=0.886 n=19+18)
MemmoveUnalignedDst/13-6 1.76GB/s ± 0% 1.76GB/s ± 1% -0.24% (p=0.006 n=18+20)
MemmoveUnalignedDst/14-6 1.90GB/s ± 0% 1.90GB/s ± 0% ~ (p=0.818 n=20+19)
MemmoveUnalignedDst/15-6 2.03GB/s ± 0% 2.03GB/s ± 0% ~ (p=0.294 n=17+16)
MemmoveUnalignedDst/16-6 2.17GB/s ± 0% 2.17GB/s ± 0% ~ (p=0.602 n=16+18)
MemmoveUnalignedDst/32-6 4.05GB/s ± 0% 4.05GB/s ± 0% +0.00% (p=0.010 n=18+17)
MemmoveUnalignedDst/64-6 7.59GB/s ± 0% 7.59GB/s ± 0% +0.00% (p=0.022 n=18+16)
MemmoveUnalignedDst/128-6 11.1GB/s ± 0% 11.4GB/s ± 0% +2.79% (p=0.000 n=18+17)
MemmoveUnalignedDst/256-6 16.4GB/s ± 0% 16.7GB/s ± 0% +1.59% (p=0.000 n=20+17)
MemmoveUnalignedDst/512-6 15.7GB/s ± 0% 21.3GB/s ± 0% +35.87% (p=0.000 n=18+20)
MemmoveUnalignedDst/1024-6 16.0GB/s ±20% 31.5GB/s ± 0% +96.93% (p=0.000 n=20+14)
MemmoveUnalignedDst/2048-6 19.6GB/s ± 0% 42.1GB/s ± 0% +115.16% (p=0.000 n=17+18)
MemmoveUnalignedDst/4096-6 6.41GB/s ± 0% 33.18GB/s ± 0% +417.56% (p=0.000 n=17+18)
MemmoveUnalignedSrc/1-6 171MB/s ± 0% 166MB/s ± 0% -3.33% (p=0.000 n=19+16)
MemmoveUnalignedSrc/2-6 343MB/s ± 0% 342MB/s ± 1% -0.41% (p=0.000 n=17+20)
MemmoveUnalignedSrc/3-6 508MB/s ± 0% 493MB/s ± 1% -2.90% (p=0.000 n=17+17)
MemmoveUnalignedSrc/4-6 677MB/s ± 0% 660MB/s ± 2% -2.55% (p=0.000 n=17+20)
MemmoveUnalignedSrc/5-6 790MB/s ± 0% 790MB/s ± 0% ~ (p=0.139 n=17+17)
MemmoveUnalignedSrc/6-6 948MB/s ± 0% 946MB/s ± 1% ~ (p=0.330 n=17+19)
MemmoveUnalignedSrc/7-6 1.11GB/s ± 0% 1.11GB/s ± 0% -0.05% (p=0.026 n=17+17)
MemmoveUnalignedSrc/8-6 1.38GB/s ± 0% 1.38GB/s ± 0% ~ (p=0.091 n=18+16)
MemmoveUnalignedSrc/9-6 1.42GB/s ± 0% 1.40GB/s ± 1% -1.04% (p=0.000 n=19+20)
MemmoveUnalignedSrc/10-6 1.58GB/s ± 0% 1.56GB/s ± 1% -1.15% (p=0.000 n=18+19)
MemmoveUnalignedSrc/11-6 1.73GB/s ± 0% 1.71GB/s ± 1% -1.30% (p=0.000 n=20+20)
MemmoveUnalignedSrc/12-6 1.89GB/s ± 0% 1.87GB/s ± 1% -1.18% (p=0.000 n=17+20)
MemmoveUnalignedSrc/13-6 2.05GB/s ± 0% 2.02GB/s ± 1% -1.18% (p=0.000 n=17+20)
MemmoveUnalignedSrc/14-6 2.21GB/s ± 0% 2.18GB/s ± 1% -1.14% (p=0.000 n=17+20)
MemmoveUnalignedSrc/15-6 2.36GB/s ± 0% 2.34GB/s ± 1% -1.04% (p=0.000 n=17+20)
MemmoveUnalignedSrc/16-6 2.52GB/s ± 0% 2.49GB/s ± 1% -1.26% (p=0.000 n=19+20)
MemmoveUnalignedSrc/32-6 4.82GB/s ± 0% 4.61GB/s ± 0% -4.40% (p=0.000 n=19+20)
MemmoveUnalignedSrc/64-6 5.03GB/s ± 4% 7.97GB/s ± 0% +58.55% (p=0.000 n=20+16)
MemmoveUnalignedSrc/128-6 11.1GB/s ± 0% 11.2GB/s ± 0% +0.52% (p=0.000 n=17+18)
MemmoveUnalignedSrc/256-6 16.5GB/s ± 0% 16.4GB/s ± 0% -0.10% (p=0.000 n=20+18)
MemmoveUnalignedSrc/512-6 21.0GB/s ± 0% 22.1GB/s ± 0% +5.48% (p=0.000 n=14+17)
MemmoveUnalignedSrc/1024-6 24.9GB/s ± 0% 31.9GB/s ± 0% +28.20% (p=0.000 n=19+20)
MemmoveUnalignedSrc/2048-6 23.3GB/s ± 0% 33.8GB/s ± 0% +45.22% (p=0.000 n=17+19)
MemmoveUnalignedSrc/4096-6 37.3GB/s ± 0% 42.7GB/s ± 0% +14.30% (p=0.000 n=17+17)
Change-Id: Id66aa3e499ccfb117cb99d623ef326b50d057b64
Reviewed-on: https://go-review.googlesource.com/29590
Run-TryBot: Denis Nagorny <denis.nagorny@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
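
The numbers above are benchstat-style comparisons of the runtime
Memmove benchmarks. A minimal sketch of the shape of such a benchmark
(the name is illustrative, not the exact runtime code):

	package runtime_test

	import "testing"

	// BenchmarkMemmoveSketch reports copy throughput for a fixed size,
	// which is what the MB/s and GB/s columns above measure.
	func BenchmarkMemmoveSketch(b *testing.B) {
		const n = 1024
		x := make([]byte, n)
		y := make([]byte, n)
		b.SetBytes(int64(n))
		for i := 0; i < b.N; i++ {
			copy(x, y)
		}
	}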
--- /dev/null
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+var vendorStringBytes [12]byte
+var maxInputValue uint32
+var featureFlags uint32
+var processorVersionInfo uint32
+
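+// useRepMovs is also read from assembly: memmove_amd64.s tests it with
+// TESTB $1, runtime·useRepMovs(SB) to choose between REP MOVS and the
+// AVX copy loops.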
+var useRepMovs = true
+
+func hasFeature(feature uint32) bool {
+ return (featureFlags & feature) != 0
+}
+
+func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32) // implemented in cpuidlow_amd64.s
+func xgetbv_low(arg1 uint32) (eax, edx uint32) // implemented in cpuidlow_amd64.s
+
+func init() {
+ const cfOSXSAVE uint32 = 1 << 27
+ const cfAVX uint32 = 1 << 28
+
+ leaf0()
+ leaf1()
+
+ enabledAVX := false
+	// Check whether the OS has set CR4.OSXSAVE[bit 18],
+	// which enables the XGETBV instruction.
+ if hasFeature(cfOSXSAVE) {
+ eax, _ := xgetbv_low(0)
+		// Check that XCR0[2:1] == 11b, i.e. that both XMM and
+		// YMM state are enabled by the OS.
+ enabledAVX = (eax & 0x6) == 0x6
+ }
+
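+	// The masked values below correspond, by Intel's family/model
+	// encoding, to Sandy Bridge (0x206A0, 0x206D0) and Ivy Bridge
+	// (0x306A0, 0x306E0) parts, on which REP MOVS remains the better
+	// choice; the model names are our reading of the encodings, not
+	// part of the original change.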
+ isIntelBridgeFamily := (processorVersionInfo == 0x206A0 ||
+ processorVersionInfo == 0x206D0 ||
+ processorVersionInfo == 0x306A0 ||
+ processorVersionInfo == 0x306E0) &&
+ isIntel()
+
+ useRepMovs = !(hasFeature(cfAVX) && enabledAVX) || isIntelBridgeFamily
+}
+
+func leaf0() {
+ eax, ebx, ecx, edx := cpuid_low(0, 0)
+ maxInputValue = eax
+ int32ToBytes(ebx, vendorStringBytes[0:4])
+ int32ToBytes(edx, vendorStringBytes[4:8])
+ int32ToBytes(ecx, vendorStringBytes[8:12])
+}
+
+func leaf1() {
+ if maxInputValue < 1 {
+ return
+ }
+ eax, _, ecx, _ := cpuid_low(1, 0)
+ // Let's remove stepping and reserved fields
+ processorVersionInfo = eax & 0x0FFF3FF0
+ featureFlags = ecx
+}
+
+func int32ToBytes(arg uint32, buffer []byte) {
+ buffer[3] = byte(arg >> 24)
+ buffer[2] = byte(arg >> 16)
+ buffer[1] = byte(arg >> 8)
+ buffer[0] = byte(arg)
+}
+
+func isIntel() bool {
+ intelSignature := [12]byte{'G', 'e', 'n', 'u', 'i', 'n', 'e', 'I', 'n', 't', 'e', 'l'}
+ return vendorStringBytes == intelSignature
+}
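
As a gloss on the (eax & 0x6) == 0x6 check in init above: XGETBV with
ECX=0 returns XCR0, whose bits 1 and 2 report whether the OS saves XMM
and YMM state on context switches. A standalone sketch of the same
check (constant and function names are ours; bit layout per the Intel
SDM):

	package main

	import "fmt"

	// XCR0 component bits (Intel SDM vol. 1, XSAVE feature set).
	const (
		xcr0SSE = 1 << 1 // XMM state is saved/restored by the OS
		xcr0AVX = 1 << 2 // YMM state is saved/restored by the OS
	)

	// osSupportsAVX reports whether an XCR0 value (the EAX result of
	// xgetbv_low(0)) has XCR0[2:1] == 11b, i.e. both XMM and YMM state
	// are OS-enabled.
	func osSupportsAVX(xcr0 uint32) bool {
		return xcr0&(xcr0SSE|xcr0AVX) == (xcr0SSE | xcr0AVX)
	}

	func main() {
		fmt.Println(osSupportsAVX(0x7)) // true: x87, XMM and YMM enabled
		fmt.Println(osSupportsAVX(0x3)) // false: YMM state not enabled
	}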
--- /dev/null
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid_low(SB), 4, $0-24 // flag 4 = NOSPLIT
+ MOVL arg1+0(FP), AX
+ MOVL arg2+4(FP), CX
+ CPUID
+ MOVL AX, eax+8(FP)
+ MOVL BX, ebx+12(FP)
+ MOVL CX, ecx+16(FP)
+ MOVL DX, edx+20(FP)
+	RET
+
+// func xgetbv_low(arg1 uint32) (eax, edx uint32)
+TEXT ·xgetbv_low(SB), 4, $0-16
+ MOVL arg1+0(FP), CX
+	// XGETBV is hand-encoded; the assembler has no mnemonic for it.
+ BYTE $0x0F; BYTE $0x01; BYTE $0xD0
+	MOVL AX, eax+8(FP)
+	MOVL DX, edx+12(FP)
+ RET
JBE move_129through256
// TODO: use branch table and BSR to make this just a single dispatch
+ TESTB $1, runtime·useRepMovs(SB)
+ JZ avxUnaligned
+
/*
* check and set for backwards
*/
ADDQ BX, CX
CMPQ CX, DI
JLS forward
-
/*
* whole thing backwards has
* adjusted addresses
LEAQ 256(DI), DI
JGE move_256through2048
JMP tail
+
+avxUnaligned:
+	// There are two implementations of the move algorithm.
+	// The first, for non-overlapping regions, copies forward.
+	// The second, for overlapping regions, copies backward.
+	MOVQ DI, CX
+	SUBQ SI, CX
+	// Now CX holds the (unsigned) distance from SRC to DST.
+	CMPQ CX, BX
+	// If the distance is less than the region length, the regions
+	// overlap with DST above SRC, so copy backward. The comparison is
+	// unsigned: when DST is below SRC, DI-SI wraps to a huge value, the
+	// forward path is taken, and that is safe for this overlap direction.
+	JC copy_backward
+
+	// For big sizes (1MB and up) a non-temporal copy is better: such
+	// data is unlikely to stay in cache, and non-temporal stores avoid
+	// evicting lines that are still useful.
+	CMPQ BX, $0x100000
+	JAE gobble_big_data_fwd
+
+ // Memory layout on the source side
+ // SI CX
+ // |<---------BX before correction--------->|
+ // | |<--BX corrected-->| |
+ // | | |<--- AX --->|
+ // |<-R11->| |<-128 bytes->|
+ // +----------------------------------------+
+ // | Head | Body | Tail |
+ // +-------+------------------+-------------+
+ // ^ ^ ^
+ // | | |
+ // Save head into Y4 Save tail into X5..X12
+ // |
+ // SI+R11, where R11 = ((DI & -32) + 32) - DI
+ // Algorithm:
+ // 1. Unaligned save of the tail's 128 bytes
+ // 2. Unaligned save of the head's 32 bytes
+ // 3. Destination-aligned copying of body (128 bytes per iteration)
+	// 4. Store the head at its destination
+	// 5. Store the tail at its destination
+	// For small sizes the cost of copying the unaligned head and tail
+	// is comparable to the cost of the main loop, so it is important to
+	// keep the processor's pipeline busy; that is why the steps below
+	// are interleaved.
+	// A cleaner implementation of the same algorithm, for bigger sizes
+	// where the unaligned-part cost is negligible, follows the
+	// gobble_big_data_fwd label.
+ LEAQ (SI)(BX*1), CX
+ MOVQ DI, R10
+	// CX points one past the end of the source, so the tail is read
+	// with negative offsets.
+ MOVOU -0x80(CX), X5
+ MOVOU -0x70(CX), X6
+ MOVQ $0x80, AX
+ // Align destination address
+ ANDQ $-32, DI
+ ADDQ $32, DI
+ // Continue tail saving.
+ MOVOU -0x60(CX), X7
+ MOVOU -0x50(CX), X8
+	// Set R11 to the delta between the aligned and the original
+	// destination addresses.
+ MOVQ DI, R11
+ SUBQ R10, R11
+ // Continue tail saving.
+ MOVOU -0x40(CX), X9
+ MOVOU -0x30(CX), X10
+	// Reduce the byte count by the head size, which is handled separately.
+ SUBQ R11, BX
+ // Continue tail saving.
+ MOVOU -0x20(CX), X11
+ MOVOU -0x10(CX), X12
+	// The tail will be stored in place after the body copy.
+	// Now save the unaligned head.
+ VMOVDQU (SI), Y4
+ // Adjust source address to point past head.
+ ADDQ R11, SI
+ SUBQ AX, BX
+	// The aligned body copy: unaligned loads, aligned stores.
+gobble_128_loop:
+ VMOVDQU (SI), Y0
+ VMOVDQU 0x20(SI), Y1
+ VMOVDQU 0x40(SI), Y2
+ VMOVDQU 0x60(SI), Y3
+ ADDQ AX, SI
+ VMOVDQA Y0, (DI)
+ VMOVDQA Y1, 0x20(DI)
+ VMOVDQA Y2, 0x40(DI)
+ VMOVDQA Y3, 0x60(DI)
+ ADDQ AX, DI
+ SUBQ AX, BX
+ JA gobble_128_loop
+ // Now we can store unaligned parts.
+ ADDQ AX, BX
+ ADDQ DI, BX
+ VMOVDQU Y4, (R10)
+ VZEROUPPER
+ MOVOU X5, -0x80(BX)
+ MOVOU X6, -0x70(BX)
+ MOVOU X7, -0x60(BX)
+ MOVOU X8, -0x50(BX)
+ MOVOU X9, -0x40(BX)
+ MOVOU X10, -0x30(BX)
+ MOVOU X11, -0x20(BX)
+ MOVOU X12, -0x10(BX)
+ RET
+
+gobble_big_data_fwd:
+	// Forward copying for big regions, using non-temporal stores.
+	// The algorithm is the same as the small-size version above; see
+	// the comments there.
+ LEAQ (SI)(BX*1), CX
+ MOVOU -0x80(SI)(BX*1), X5
+ MOVOU -0x70(CX), X6
+ MOVOU -0x60(CX), X7
+ MOVOU -0x50(CX), X8
+ MOVOU -0x40(CX), X9
+ MOVOU -0x30(CX), X10
+ MOVOU -0x20(CX), X11
+ MOVOU -0x10(CX), X12
+ VMOVDQU (SI), Y4
+ MOVQ DI, R8
+ ANDQ $-32, DI
+ ADDQ $32, DI
+ MOVQ DI, R10
+ SUBQ R8, R10
+ SUBQ R10, BX
+ ADDQ R10, SI
+ LEAQ (DI)(BX*1), CX
+ SUBQ $0x80, BX
+gobble_mem_fwd_loop:
+ PREFETCHNTA 0x1C0(SI)
+ PREFETCHNTA 0x280(SI)
+	// The prefetch distances were chosen empirically (0x1C0 and 0x280
+	// bytes are 3.5 and 5 iterations of this 128-byte loop ahead).
+	// Prefetch usage follows section 7.6.6 of [1].
+	// [1] 64-ia-32-architectures-optimization-manual.pdf
+	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+ VMOVDQU (SI), Y0
+ VMOVDQU 0x20(SI), Y1
+ VMOVDQU 0x40(SI), Y2
+ VMOVDQU 0x60(SI), Y3
+ ADDQ $0x80, SI
+ VMOVNTDQ Y0, (DI)
+ VMOVNTDQ Y1, 0x20(DI)
+ VMOVNTDQ Y2, 0x40(DI)
+ VMOVNTDQ Y3, 0x60(DI)
+ ADDQ $0x80, DI
+ SUBQ $0x80, BX
+ JA gobble_mem_fwd_loop
+	// Non-temporal stores are weakly ordered and bypass the normal
+	// cache-coherency rules. An SFENCE is needed to make the copied
+	// data globally visible before returning.
+ SFENCE
+ VMOVDQU Y4, (R8)
+ VZEROUPPER
+ MOVOU X5, -0x80(CX)
+ MOVOU X6, -0x70(CX)
+ MOVOU X7, -0x60(CX)
+ MOVOU X8, -0x50(CX)
+ MOVOU X9, -0x40(CX)
+ MOVOU X10, -0x30(CX)
+ MOVOU X11, -0x20(CX)
+ MOVOU X12, -0x10(CX)
+ RET
+
+copy_backward:
+ MOVQ DI, AX
+	// Backward copying mirrors the forward path.
+	// First save the unaligned start of the region (it is written last
+	// here) into X5..X12.
+ MOVOU (SI), X5
+ MOVOU 0x10(SI), X6
+ ADDQ BX, DI
+ MOVOU 0x20(SI), X7
+ MOVOU 0x30(SI), X8
+ LEAQ -0x20(DI), R10
+ MOVQ DI, R11
+ MOVOU 0x40(SI), X9
+ MOVOU 0x50(SI), X10
+ ANDQ $0x1F, R11
+ MOVOU 0x60(SI), X11
+ MOVOU 0x70(SI), X12
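+	// R11 now holds DI&0x1F, so the XOR below clears exactly those
+	// bits, aligning DI down to a 32-byte boundary.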
+ XORQ R11, DI
+	// Point SI to the end of the region
+	ADDQ BX, SI
+	// and save the unaligned last 32 bytes into Y4.
+ VMOVDQU -0x20(SI), Y4
+ SUBQ R11, SI
+ SUBQ R11, BX
+	// If there is enough data for non-temporal moves, use the dedicated loop.
+ CMPQ BX, $0x100000
+ JA gobble_big_data_bwd
+ SUBQ $0x80, BX
+gobble_mem_bwd_loop:
+ VMOVDQU -0x20(SI), Y0
+ VMOVDQU -0x40(SI), Y1
+ VMOVDQU -0x60(SI), Y2
+ VMOVDQU -0x80(SI), Y3
+ SUBQ $0x80, SI
+ VMOVDQA Y0, -0x20(DI)
+ VMOVDQA Y1, -0x40(DI)
+ VMOVDQA Y2, -0x60(DI)
+ VMOVDQA Y3, -0x80(DI)
+ SUBQ $0x80, DI
+ SUBQ $0x80, BX
+ JA gobble_mem_bwd_loop
+	// Store the saved unaligned parts.
+ VMOVDQU Y4, (R10)
+ VZEROUPPER
+ MOVOU X5, (AX)
+ MOVOU X6, 0x10(AX)
+ MOVOU X7, 0x20(AX)
+ MOVOU X8, 0x30(AX)
+ MOVOU X9, 0x40(AX)
+ MOVOU X10, 0x50(AX)
+ MOVOU X11, 0x60(AX)
+ MOVOU X12, 0x70(AX)
+ RET
+
+gobble_big_data_bwd:
+ SUBQ $0x80, BX
+gobble_big_mem_bwd_loop:
+ PREFETCHNTA -0x1C0(SI)
+ PREFETCHNTA -0x280(SI)
+ VMOVDQU -0x20(SI), Y0
+ VMOVDQU -0x40(SI), Y1
+ VMOVDQU -0x60(SI), Y2
+ VMOVDQU -0x80(SI), Y3
+ SUBQ $0x80, SI
+ VMOVNTDQ Y0, -0x20(DI)
+ VMOVNTDQ Y1, -0x40(DI)
+ VMOVNTDQ Y2, -0x60(DI)
+ VMOVNTDQ Y3, -0x80(DI)
+ SUBQ $0x80, DI
+ SUBQ $0x80, BX
+ JA gobble_big_mem_bwd_loop
+ SFENCE
+ VMOVDQU Y4, (R10)
+ VZEROUPPER
+ MOVOU X5, (AX)
+ MOVOU X6, 0x10(AX)
+ MOVOU X7, 0x20(AX)
+ MOVOU X8, 0x30(AX)
+ MOVOU X9, 0x40(AX)
+ MOVOU X10, 0x50(AX)
+ MOVOU X11, 0x60(AX)
+ MOVOU X12, 0x70(AX)
+ RET
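
For readers who would rather not trace the assembly, here is a minimal
Go sketch of the forward head/body/tail strategy (names are ours; the
32-byte destination alignment, the YMM registers, and the non-temporal
path for >=1MB copies are elided):

	package main

	import (
		"bytes"
		"fmt"
	)

	// headBodyTailCopy copies non-overlapping src to dst: it saves the
	// unaligned 128-byte tail and 32-byte head, copies the body in
	// 128-byte chunks (the last chunk may reach into the tail region,
	// which step 5 then rewrites), and finally stores head and tail.
	func headBodyTailCopy(dst, src []byte) {
		n := len(src)
		if n < 160 {
			copy(dst, src)
			return
		}
		var head [32]byte
		var tail [128]byte
		copy(tail[:], src[n-128:]) // 1. unaligned save of the tail
		copy(head[:], src[:32])    // 2. unaligned save of the head
		for off := 32; off < n-128; off += 128 { // 3. body copy
			copy(dst[off:off+128], src[off:off+128])
		}
		copy(dst[:32], head[:])    // 4. store the head
		copy(dst[n-128:], tail[:]) // 5. store the tail
	}

	func main() {
		src := make([]byte, 1000)
		for i := range src {
			src[i] = byte(i)
		}
		dst := make([]byte, len(src))
		headBodyTailCopy(dst, src)
		fmt.Println(bytes.Equal(dst, src)) // true
	}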
package runtime_test

import (
+ "crypto/rand"
"fmt"
+ "internal/race"
. "runtime"
"testing"
)
}
}
+
+func TestMemmoveLarge0x180000(t *testing.T) {
+ if race.Enabled {
+ t.Skip("skipping large memmove test under race detector")
+ }
+ testSize(t, 0x180000)
+}
+
+func TestMemmoveOverlapLarge0x120000(t *testing.T) {
+ if race.Enabled {
+ t.Skip("skipping large memmove test under race detector")
+ }
+ testOverlap(t, 0x120000)
+}
+
+func testSize(t *testing.T, size int) {
+ src := make([]byte, size)
+ dst := make([]byte, size)
+ _, _ = rand.Read(src)
+ _, _ = rand.Read(dst)
+
+ ref := make([]byte, size)
+ copyref(ref, dst)
+
+ for n := size - 50; n > 1; n >>= 1 {
+ for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
+ for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
+ copy(dst[y:y+n], src[x:x+n])
+ copyref(ref[y:y+n], src[x:x+n])
+ p := cmpb(dst, ref)
+ if p >= 0 {
+ t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, dst[p], ref[p])
+ }
+ }
+ }
+ }
+}
+
+func testOverlap(t *testing.T, size int) {
+ src := make([]byte, size)
+ test := make([]byte, size)
+ ref := make([]byte, size)
+ _, _ = rand.Read(src)
+
+ for n := size - 50; n > 1; n >>= 1 {
+ for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
+ for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
+ // Reset input
+ copyref(test, src)
+ copyref(ref, src)
+ copy(test[y:y+n], test[x:x+n])
+			if y <= x {
+				// When dst starts at or before src, a forward
+				// byte-by-byte copy has memmove semantics.
+				copyref(ref[y:y+n], ref[x:x+n])
+			} else {
+				// Otherwise the reference must copy backward.
+				copybw(ref[y:y+n], ref[x:x+n])
+			}
+ p := cmpb(test, ref)
+ if p >= 0 {
+ t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, test[p], ref[p])
+ }
+ }
+ }
+ }
+}
+
+// Forward copy.
+func copyref(dst, src []byte) {
+ for i, v := range src {
+ dst[i] = v
+ }
+}
+
+// Backward copy.
+func copybw(dst, src []byte) {
+ if len(src) == 0 {
+ return
+ }
+ for i := len(src) - 1; i >= 0; i-- {
+ dst[i] = src[i]
+ }
+}
+
+// matchLen returns the length of the common prefix of a and b, at
+// most max.
+func matchLen(a, b []byte, max int) int {
+ a = a[:max]
+ b = b[:max]
+ for i, av := range a {
+ if b[i] != av {
+ return i
+ }
+ }
+ return max
+}
+
+// cmpb returns the offset of the first differing byte, or -1 if a
+// and b match over len(a).
+func cmpb(a, b []byte) int {
+ l := matchLen(a, b, len(a))
+ if l == len(a) {
+ return -1
+ }
+ return l
+}
+
func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) {
for _, n := range sizes {
b.Run(fmt.Sprint(n), func(b *testing.B) {