+++ /dev/null
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-var vendorStringBytes [12]byte
-var maxInputValue uint32
-var featureFlags uint32
-var processorVersionInfo uint32
-
-var useRepMovs bool
-
-func hasFeature(feature uint32) bool {
- return (featureFlags & feature) != 0
-}
-
-func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32) // implemented in cpuidlow_amd64.s
-func xgetbv_low(arg1 uint32) (eax, edx uint32) // implemented in cpuidlow_amd64.s
-
-func init() {
- const cfOSXSAVE uint32 = 1 << 27
- const cfAVX uint32 = 1 << 28
-
- leaf0()
- leaf1()
-
- enabledAVX := false
-	// Check whether the OS has set CR4.OSXSAVE[bit 18],
-	// which enables the XGETBV instruction.
- if hasFeature(cfOSXSAVE) {
- eax, _ := xgetbv_low(0)
-		// Check that XCR0[2:1] == '11b',
-		// i.e. both XMM and YMM state are enabled by the OS.
- enabledAVX = (eax & 0x6) == 0x6
- }
-
- isIntelBridgeFamily := (processorVersionInfo == 0x206A0 ||
- processorVersionInfo == 0x206D0 ||
- processorVersionInfo == 0x306A0 ||
- processorVersionInfo == 0x306E0) &&
- isIntel()
-
- useRepMovs = !(hasFeature(cfAVX) && enabledAVX) || isIntelBridgeFamily
-}
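
For reference, the decision above can be expressed in pure Go over the raw register values. This is a minimal illustrative sketch (avxUsable is a hypothetical helper, not part of the runtime); it assumes the ECX value comes from CPUID leaf 1 and the XCR0 value from XGETBV(0).

package main

import "fmt"

const (
	cfOSXSAVE = 1 << 27 // CPUID.1:ECX.OSXSAVE - OS has enabled XGETBV/XCR0
	cfAVX     = 1 << 28 // CPUID.1:ECX.AVX     - CPU supports AVX instructions
)

// avxUsable reports whether AVX code may be executed. Both the CPU flags and
// the OS-managed XCR0 bits must be checked: XCR0 bit 1 (XMM state) and
// bit 2 (YMM state) must both be set.
func avxUsable(cpuidECX, xcr0 uint32) bool {
	if cpuidECX&cfOSXSAVE == 0 || cpuidECX&cfAVX == 0 {
		return false
	}
	return xcr0&0x6 == 0x6 // XCR0[2:1] == 11b
}

func main() {
	fmt.Println(avxUsable(1<<27|1<<28, 0x7)) // true
	fmt.Println(avxUsable(1<<28, 0x7))       // false: OSXSAVE not set
	fmt.Println(avxUsable(1<<27|1<<28, 0x1)) // false: YMM state not enabled by the OS
}
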
-
-func leaf0() {
- eax, ebx, ecx, edx := cpuid_low(0, 0)
- maxInputValue = eax
- int32ToBytes(ebx, vendorStringBytes[0:4])
- int32ToBytes(edx, vendorStringBytes[4:8])
- int32ToBytes(ecx, vendorStringBytes[8:12])
-}
-
-func leaf1() {
- if maxInputValue < 1 {
- return
- }
- eax, _, ecx, _ := cpuid_low(1, 0)
-	// Mask off the stepping and reserved fields.
- processorVersionInfo = eax & 0x0FFF3FF0
- featureFlags = ecx
-}
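
The mask 0x0FFF3FF0 above keeps the extended family, extended model, processor type, family, and model fields of CPUID.1:EAX and clears the stepping and reserved bits. Below is a minimal sketch of decoding those fields, with a hypothetical helper name (decodeSignature) and the standard Intel extended-family/model rules assumed.

package main

import "fmt"

// decodeSignature extracts the display family and model from the (masked)
// CPUID.1:EAX value:
//   bits  3:0  stepping (already cleared by the 0x0FFF3FF0 mask)
//   bits  7:4  model
//   bits 11:8  family
//   bits 13:12 processor type
//   bits 19:16 extended model
//   bits 27:20 extended family
func decodeSignature(versionInfo uint32) (family, model uint32) {
	baseFamily := (versionInfo >> 8) & 0xF
	family = baseFamily
	model = (versionInfo >> 4) & 0xF
	if baseFamily == 0xF {
		family += (versionInfo >> 20) & 0xFF
	}
	if baseFamily == 0x6 || baseFamily == 0xF {
		model += ((versionInfo >> 16) & 0xF) << 4
	}
	return family, model
}

func main() {
	// 0x306E0 is one of the masked values used by the Bridge-family check above.
	f, m := decodeSignature(0x306E0)
	fmt.Printf("family %#x, model %#x\n", f, m) // family 0x6, model 0x3e
}
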
-
-func int32ToBytes(arg uint32, buffer []byte) {
- buffer[3] = byte(arg >> 24)
- buffer[2] = byte(arg >> 16)
- buffer[1] = byte(arg >> 8)
- buffer[0] = byte(arg)
-}
-
-func isIntel() bool {
- intelSignature := [12]byte{'G', 'e', 'n', 'u', 'i', 'n', 'e', 'I', 'n', 't', 'e', 'l'}
- return vendorStringBytes == intelSignature
-}
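
The vendor check above depends on how the 12-byte vendor string is assembled from CPUID leaf 0: EBX supplies bytes 0-3, EDX bytes 4-7, ECX bytes 8-11, each stored little-endian exactly as int32ToBytes does. A standalone sketch (vendorString is an invented name):

package main

import "fmt"

func vendorString(ebx, edx, ecx uint32) string {
	var b [12]byte
	for i, r := range []uint32{ebx, edx, ecx} {
		b[i*4+0] = byte(r)
		b[i*4+1] = byte(r >> 8)
		b[i*4+2] = byte(r >> 16)
		b[i*4+3] = byte(r >> 24)
	}
	return string(b[:])
}

func main() {
	// Register values returned by Intel CPUs for leaf 0.
	fmt.Println(vendorString(0x756E6547, 0x49656E69, 0x6C65746E)) // GenuineIntel
}
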
+++ /dev/null
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32)
-TEXT ·cpuid_low(SB), 4, $0-24
- MOVL arg1+0(FP), AX
- MOVL arg2+4(FP), CX
- CPUID
- MOVL AX, eax+8(FP)
- MOVL BX, ebx+12(FP)
- MOVL CX, ecx+16(FP)
- MOVL DX, edx+20(FP)
- RET
-// func xgetbv_low(arg1 uint32) (eax, edx uint32)
-TEXT ·xgetbv_low(SB), 4, $0-16
- MOVL arg1+0(FP), CX
- // XGETBV
- BYTE $0x0F; BYTE $0x01; BYTE $0xD0
- MOVL AX,eax+8(FP)
- MOVL DX,edx+12(FP)
- RET
JBE move_129through256
// TODO: use branch table and BSR to make this just a single dispatch
- TESTB $1, runtime·useRepMovs(SB)
- JZ avxUnaligned
-
/*
* check and set for backwards
*/
ADDQ BX, CX
CMPQ CX, DI
JLS forward
+
/*
* whole thing backwards has
* adjusted addresses
LEAQ 256(DI), DI
JGE move_256through2048
JMP tail
-
-avxUnaligned:
-	// There are two implementations of the move algorithm.
-	// The first, for non-overlapping memory regions, copies forward.
-	// The second, for overlapping regions, copies backward.
- MOVQ DI, CX
- SUBQ SI, CX
-	// Now CX contains the distance between SRC and DEST.
- CMPQ CX, BX
-	// If the distance is less than the region length, the regions overlap and we must copy backward.
- JC copy_backward
-
-	// A non-temporal copy is better for large sizes.
- CMPQ BX, $0x100000
- JAE gobble_big_data_fwd
-
-	// Memory layout on the source side
-	// SI                                       CX
-	// |<---------BX before correction--------->|
-	// |       |<--BX corrected-->|             |
-	// |       |                  |<--- AX  --->|
-	// |<-R11->|                  |<-128 bytes->|
-	// +----------------------------------------+
-	// | Head  | Body             | Tail        |
-	// +-------+------------------+-------------+
-	// ^       ^                  ^
-	// |       |                  |
-	// Save head into Y4          Save tail into X5..X12
-	//         |
-	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
-	// Algorithm:
-	// 1. Unaligned save of the tail's 128 bytes
-	// 2. Unaligned save of the head's 32 bytes
-	// 3. Destination-aligned copying of the body (128 bytes per iteration)
-	// 4. Store the head at its new location
-	// 5. Store the tail at its new location
-	// For small sizes the cost of copying the unaligned head and tail is
-	// comparable to the cost of the main loop, so the unaligned loads and
-	// stores are interleaved with the setup code to keep the pipeline busy.
-	// A cleaner implementation of the same algorithm, used for larger sizes
-	// where the cost of the unaligned parts is negligible, follows the
-	// gobble_big_data_fwd label. A pure-Go sketch of this head/body/tail
-	// scheme appears after this routine.
- LEAQ (SI)(BX*1), CX
- MOVQ DI, R10
-	// CX points to the end of the source buffer, so the tail is loaded with negative offsets.
- MOVOU -0x80(CX), X5
- MOVOU -0x70(CX), X6
- MOVQ $0x80, AX
- // Align destination address
- ANDQ $-32, DI
- ADDQ $32, DI
- // Continue tail saving.
- MOVOU -0x60(CX), X7
- MOVOU -0x50(CX), X8
-	// Make R11 the delta between the aligned and unaligned destination addresses.
- MOVQ DI, R11
- SUBQ R10, R11
- // Continue tail saving.
- MOVOU -0x40(CX), X9
- MOVOU -0x30(CX), X10
-	// Adjust the byte count: the R11-byte unaligned head is handled separately.
- SUBQ R11, BX
- // Continue tail saving.
- MOVOU -0x20(CX), X11
- MOVOU -0x10(CX), X12
-	// The tail will be stored in its place after the main body is copied.
-	// Now save the unaligned head.
- VMOVDQU (SI), Y4
- // Adjust source address to point past head.
- ADDQ R11, SI
- SUBQ AX, BX
-	// Destination-aligned copying of the body (128 bytes per iteration).
-gobble_128_loop:
- VMOVDQU (SI), Y0
- VMOVDQU 0x20(SI), Y1
- VMOVDQU 0x40(SI), Y2
- VMOVDQU 0x60(SI), Y3
- ADDQ AX, SI
- VMOVDQA Y0, (DI)
- VMOVDQA Y1, 0x20(DI)
- VMOVDQA Y2, 0x40(DI)
- VMOVDQA Y3, 0x60(DI)
- ADDQ AX, DI
- SUBQ AX, BX
- JA gobble_128_loop
- // Now we can store unaligned parts.
- ADDQ AX, BX
- ADDQ DI, BX
- VMOVDQU Y4, (R10)
- VZEROUPPER
- MOVOU X5, -0x80(BX)
- MOVOU X6, -0x70(BX)
- MOVOU X7, -0x60(BX)
- MOVOU X8, -0x50(BX)
- MOVOU X9, -0x40(BX)
- MOVOU X10, -0x30(BX)
- MOVOU X11, -0x20(BX)
- MOVOU X12, -0x10(BX)
- RET
-
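
The head/body/tail scheme above can be outlined in pure Go over byte slices. This is only a sketch of the algorithm for the non-overlapping forward case (copyHeadBodyTail and its constants are illustrative, not the assembly):

package main

import (
	"bytes"
	"fmt"
)

// copyHeadBodyTail saves the unaligned tail and head first, copies the body in
// large fixed-size chunks, then stores the saved head and tail. alignment
// stands in for the 32-byte destination alignment and chunk for the 128-byte
// loop stride used by the assembly.
func copyHeadBodyTail(dst, src []byte) {
	const alignment, chunk = 32, 128
	n := len(src)
	if n < alignment+chunk {
		copy(dst, src) // the real code dispatches small sizes elsewhere
		return
	}
	// 1-2. Save the last chunk bytes (tail) and the first alignment bytes (head).
	var tail, head [chunk]byte
	copy(tail[:], src[n-chunk:])
	copy(head[:alignment], src[:alignment])

	// 3. Copy the body in chunk-sized steps, starting past the head. In the
	// assembly the start offset is chosen so the destination is 32-byte
	// aligned; here the head length simply stands in for that offset.
	for off := alignment; off+chunk <= n; off += chunk {
		copy(dst[off:off+chunk], src[off:off+chunk])
	}

	// 4-5. Store the head and tail, covering whatever the body loop left ragged.
	copy(dst[:alignment], head[:alignment])
	copy(dst[n-chunk:], tail[:])
}

func main() {
	src := make([]byte, 1000)
	for i := range src {
		src[i] = byte(i)
	}
	dst := make([]byte, len(src))
	copyHeadBodyTail(dst, src)
	fmt.Println(bytes.Equal(dst, src)) // true
}
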
-gobble_big_data_fwd:
-	// Forward copying for large regions.
-	// It uses non-temporal move instructions.
-	// The algorithm is the same as the small-size path commented above.
- LEAQ (SI)(BX*1), CX
- MOVOU -0x80(SI)(BX*1), X5
- MOVOU -0x70(CX), X6
- MOVOU -0x60(CX), X7
- MOVOU -0x50(CX), X8
- MOVOU -0x40(CX), X9
- MOVOU -0x30(CX), X10
- MOVOU -0x20(CX), X11
- MOVOU -0x10(CX), X12
- VMOVDQU (SI), Y4
- MOVQ DI, R8
- ANDQ $-32, DI
- ADDQ $32, DI
- MOVQ DI, R10
- SUBQ R8, R10
- SUBQ R10, BX
- ADDQ R10, SI
- LEAQ (DI)(BX*1), CX
- SUBQ $0x80, BX
-gobble_mem_fwd_loop:
- PREFETCHNTA 0x1C0(SI)
- PREFETCHNTA 0x280(SI)
-	// Prefetch distances were chosen empirically, following the approach to
-	// prefetch usage described in section 7.6.6 of [1].
- // [1] 64-ia-32-architectures-optimization-manual.pdf
- // http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
- VMOVDQU (SI), Y0
- VMOVDQU 0x20(SI), Y1
- VMOVDQU 0x40(SI), Y2
- VMOVDQU 0x60(SI), Y3
- ADDQ $0x80, SI
- VMOVNTDQ Y0, (DI)
- VMOVNTDQ Y1, 0x20(DI)
- VMOVNTDQ Y2, 0x40(DI)
- VMOVNTDQ Y3, 0x60(DI)
- ADDQ $0x80, DI
- SUBQ $0x80, BX
- JA gobble_mem_fwd_loop
-	// Non-temporal stores don't follow the normal cache-coherency rules, so an
-	// SFENCE is needed here to make the copied data visible in a timely manner.
- SFENCE
- VMOVDQU Y4, (R8)
- VZEROUPPER
- MOVOU X5, -0x80(CX)
- MOVOU X6, -0x70(CX)
- MOVOU X7, -0x60(CX)
- MOVOU X8, -0x50(CX)
- MOVOU X9, -0x40(CX)
- MOVOU X10, -0x30(CX)
- MOVOU X11, -0x20(CX)
- MOVOU X12, -0x10(CX)
- RET
-
-copy_backward:
- MOVQ DI, AX
-	// Backward copying mirrors the forward path.
-	// First, save the unaligned tail: the 128 bytes at the start of the
-	// region, which are stored last.
- MOVOU (SI), X5
- MOVOU 0x10(SI), X6
- ADDQ BX, DI
- MOVOU 0x20(SI), X7
- MOVOU 0x30(SI), X8
- LEAQ -0x20(DI), R10
- MOVQ DI, R11
- MOVOU 0x40(SI), X9
- MOVOU 0x50(SI), X10
- ANDQ $0x1F, R11
- MOVOU 0x60(SI), X11
- MOVOU 0x70(SI), X12
- XORQ R11, DI
-	// Point SI at the end of the region
- ADDQ BX, SI
-	// and load the unaligned head into Y4.
- VMOVDQU -0x20(SI), Y4
- SUBQ R11, SI
- SUBQ R11, BX
-	// If there is enough data for non-temporal moves, use the non-temporal loop.
- CMPQ BX, $0x100000
- JA gobble_big_data_bwd
- SUBQ $0x80, BX
-gobble_mem_bwd_loop:
- VMOVDQU -0x20(SI), Y0
- VMOVDQU -0x40(SI), Y1
- VMOVDQU -0x60(SI), Y2
- VMOVDQU -0x80(SI), Y3
- SUBQ $0x80, SI
- VMOVDQA Y0, -0x20(DI)
- VMOVDQA Y1, -0x40(DI)
- VMOVDQA Y2, -0x60(DI)
- VMOVDQA Y3, -0x80(DI)
- SUBQ $0x80, DI
- SUBQ $0x80, BX
- JA gobble_mem_bwd_loop
-	// Store the unaligned head and tail.
- VMOVDQU Y4, (R10)
- VZEROUPPER
- MOVOU X5, (AX)
- MOVOU X6, 0x10(AX)
- MOVOU X7, 0x20(AX)
- MOVOU X8, 0x30(AX)
- MOVOU X9, 0x40(AX)
- MOVOU X10, 0x50(AX)
- MOVOU X11, 0x60(AX)
- MOVOU X12, 0x70(AX)
- RET
-
-gobble_big_data_bwd:
- SUBQ $0x80, BX
-gobble_big_mem_bwd_loop:
- PREFETCHNTA -0x1C0(SI)
- PREFETCHNTA -0x280(SI)
- VMOVDQU -0x20(SI), Y0
- VMOVDQU -0x40(SI), Y1
- VMOVDQU -0x60(SI), Y2
- VMOVDQU -0x80(SI), Y3
- SUBQ $0x80, SI
- VMOVNTDQ Y0, -0x20(DI)
- VMOVNTDQ Y1, -0x40(DI)
- VMOVNTDQ Y2, -0x60(DI)
- VMOVNTDQ Y3, -0x80(DI)
- SUBQ $0x80, DI
- SUBQ $0x80, BX
- JA gobble_big_mem_bwd_loop
- SFENCE
- VMOVDQU Y4, (R10)
- VZEROUPPER
- MOVOU X5, (AX)
- MOVOU X6, 0x10(AX)
- MOVOU X7, 0x20(AX)
- MOVOU X8, 0x30(AX)
- MOVOU X9, 0x40(AX)
- MOVOU X10, 0x50(AX)
- MOVOU X11, 0x60(AX)
- MOVOU X12, 0x70(AX)
- RET
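
The copy_backward dispatch above relies on a single unsigned comparison: if the destination address minus the source address, taken modulo 2^64, is smaller than the length, a forward copy would overwrite source bytes before reading them. A minimal Go sketch of the same test (needsBackwardCopy is an invented name):

package main

import (
	"fmt"
	"unsafe"
)

// needsBackwardCopy reports whether copying n bytes from src to dst must run
// back to front. The subtraction wraps around when dst < src, so one unsigned
// comparison covers both "dst before src" (forward copy is safe) and "dst
// inside src" (must copy backward), mirroring the SUBQ/CMPQ/JC sequence above.
func needsBackwardCopy(dst, src unsafe.Pointer, n uintptr) bool {
	return uintptr(dst)-uintptr(src) < n
}

func main() {
	buf := make([]byte, 16)
	p := func(i int) unsafe.Pointer { return unsafe.Pointer(&buf[i]) }
	fmt.Println(needsBackwardCopy(p(4), p(0), 8)) // true: dst overlaps the source's tail
	fmt.Println(needsBackwardCopy(p(0), p(4), 8)) // false: forward copy is safe
}
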
package runtime_test
import (
- "crypto/rand"
"fmt"
- "internal/race"
. "runtime"
"testing"
)
}
}
-func TestMemmoveLarge0x180000(t *testing.T) {
- if race.Enabled {
- t.Skip("skipping large memmove test under race detector")
- }
- testSize(t, 0x180000)
-}
-
-func TestMemmoveOverlapLarge0x120000(t *testing.T) {
- if race.Enabled {
- t.Skip("skipping large memmove test under race detector")
- }
- testOverlap(t, 0x120000)
-}
-
-func testSize(t *testing.T, size int) {
- src := make([]byte, size)
- dst := make([]byte, size)
- _, _ = rand.Read(src)
- _, _ = rand.Read(dst)
-
- ref := make([]byte, size)
- copyref(ref, dst)
-
- for n := size - 50; n > 1; n >>= 1 {
- for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
- for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
- copy(dst[y:y+n], src[x:x+n])
- copyref(ref[y:y+n], src[x:x+n])
- p := cmpb(dst, ref)
- if p >= 0 {
- t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, dst[p], ref[p])
- }
- }
- }
- }
-}
-
-func testOverlap(t *testing.T, size int) {
- src := make([]byte, size)
- test := make([]byte, size)
- ref := make([]byte, size)
- _, _ = rand.Read(src)
-
- for n := size - 50; n > 1; n >>= 1 {
- for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
- for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
- // Reset input
- copyref(test, src)
- copyref(ref, src)
- copy(test[y:y+n], test[x:x+n])
- if y <= x {
- copyref(ref[y:y+n], ref[x:x+n])
- } else {
- copybw(ref[y:y+n], ref[x:x+n])
- }
- p := cmpb(test, ref)
- if p >= 0 {
- t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, test[p], ref[p])
- }
- }
- }
- }
-
-}
-
-// Forward copy.
-func copyref(dst, src []byte) {
- for i, v := range src {
- dst[i] = v
- }
-}
-
-// Backward copy.
-func copybw(dst, src []byte) {
- if len(src) == 0 {
- return
- }
- for i := len(src) - 1; i >= 0; i-- {
- dst[i] = src[i]
- }
-}
-
-// matchLen returns the length of the common prefix of a and b, at most max
-// (i.e. the offset of the first difference, or max if there is none).
-func matchLen(a, b []byte, max int) int {
- a = a[:max]
- b = b[:max]
- for i, av := range a {
- if b[i] != av {
- return i
- }
- }
- return max
-}
-
-// cmpb returns the index of the first byte at which a and b differ,
-// or -1 if b matches a over len(a).
-func cmpb(a, b []byte) int {
- l := matchLen(a, b, len(a))
- if l == len(a) {
- return -1
- }
- return l
-}
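
The direction choice in testOverlap (copyref when y <= x, copybw otherwise) matters because a naive forward copy re-reads bytes it has already overwritten when the destination starts after the source in the same buffer. A small standalone illustration (naiveForward and naiveBackward mirror copyref and copybw):

package main

import "fmt"

func naiveForward(dst, src []byte) {
	for i := range src {
		dst[i] = src[i]
	}
}

func naiveBackward(dst, src []byte) {
	for i := len(src) - 1; i >= 0; i-- {
		dst[i] = src[i]
	}
}

func main() {
	// Shift "abcdef" right by two within the same buffer.
	s := []byte("abcdef??")
	naiveForward(s[2:8], s[0:6])
	fmt.Println(string(s)) // "abababab": corrupted, source bytes were clobbered

	s = []byte("abcdef??")
	naiveBackward(s[2:8], s[0:6])
	fmt.Println(string(s)) // "ababcdef": correct overlapping move
}
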
-
func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) {
for _, n := range sizes {
b.Run(fmt.Sprint(n), func(b *testing.B) {