From c25e5c86b2da8117b2d5c934b368ecbcf8e2efd5 Mon Sep 17 00:00:00 2001
From: David Chase
Date: Thu, 24 Jul 2025 10:31:46 -0400
Subject: [PATCH] [dev.simd] cmd/compile: generated code for K-mask-register
 slice load/stores plus slice-part load, store and test for a single type.

Generated by arch/internal/simdgen CL 690315

Change-Id: I58052728b544c4a772a2870ac68f3c832813e1ea
Reviewed-on: https://go-review.googlesource.com/c/go/+/690336
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Junyang Shao
---
 .../compile/internal/ssagen/simdintrinsics.go |  28 +++
 src/simd/slicepart_amd64.go                   |  45 ++++
 src/simd/slicepart_test.go                    |  47 ++++
 src/simd/types_amd64.go                       | 232 ++++++++++++++++++
 4 files changed, 352 insertions(+)

diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index dddfab5b71..a30144cbd1 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -2148,26 +2148,54 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Float32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
 	addF(simdPackage, "LoadMaskedFloat32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
 	addF(simdPackage, "Float32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedFloat32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Float32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
 	addF(simdPackage, "LoadMaskedFloat64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
 	addF(simdPackage, "Float64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
 	addF(simdPackage, "LoadMaskedFloat64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
 	addF(simdPackage, "Float64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedFloat64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Float64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedInt8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
+	addF(simdPackage, "Int8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
+	addF(simdPackage, "LoadMaskedInt16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
+	addF(simdPackage, "Int16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
 	addF(simdPackage, "LoadMaskedInt32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
 	addF(simdPackage, "Int32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
 	addF(simdPackage, "LoadMaskedInt32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
 	addF(simdPackage, "Int32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedInt32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Int32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
 	addF(simdPackage, "LoadMaskedInt64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
 	addF(simdPackage, "Int64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
 	addF(simdPackage, "LoadMaskedInt64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
 	addF(simdPackage, "Int64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedInt64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Int64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedUint8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
+	addF(simdPackage, "Uint8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
+	addF(simdPackage, "LoadMaskedUint16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
+	addF(simdPackage, "Uint16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
 	addF(simdPackage, "LoadMaskedUint32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
 	addF(simdPackage, "Uint32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
 	addF(simdPackage, "LoadMaskedUint32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
 	addF(simdPackage, "Uint32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedUint32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Uint32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
 	addF(simdPackage, "LoadMaskedUint64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
 	addF(simdPackage, "Uint64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
 	addF(simdPackage, "LoadMaskedUint64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
 	addF(simdPackage, "Uint64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedUint64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Uint64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedMask8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
+	addF(simdPackage, "Mask8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
+	addF(simdPackage, "LoadMaskedMask16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
+	addF(simdPackage, "Mask16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
+	addF(simdPackage, "LoadMaskedMask32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Mask32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedMask64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Mask64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
 	addF(simdPackage, "Mask8x16.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int8x16.AsMask8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
diff --git a/src/simd/slicepart_amd64.go b/src/simd/slicepart_amd64.go
index 00025775be..3fcfc6255b 100644
--- a/src/simd/slicepart_amd64.go
+++ b/src/simd/slicepart_amd64.go
@@ -419,6 +419,24 @@ func paInt64x4(s []int64) *[4]int64 {
 	return (*[4]int64)(unsafe.Pointer(&s[0]))
 }
 
+// For 512-bit masked loads/stores
+
+func paInt64x8(s []int64) *[8]int64 {
+	return (*[8]int64)(unsafe.Pointer(&s[0]))
+}
+
+func paInt32x16(s []int32) *[16]int32 {
+	return (*[16]int32)(unsafe.Pointer(&s[0]))
+}
+
+func paInt16x32(s []int16) *[32]int16 {
+	return (*[32]int16)(unsafe.Pointer(&s[0]))
+}
+
+func paInt8x64(s []int8) *[64]int8 {
+	return (*[64]int8)(unsafe.Pointer(&s[0]))
+}
+
 /* 32 and 64-bit slice-part loads for AVX2 (128 and 256 bit) */
 
 // LoadInt32x4SlicePart loads a Int32x4 from the slice s.
@@ -742,3 +760,30 @@ func (x Float64x4) StoreSlicePart(s []float64) {
 	t := unsafe.Slice((*int64)(unsafe.Pointer(&s[0])), len(s))
 	x.AsInt64x4().StoreSlicePart(t)
 }
+
+func LoadInt64x8SlicePart(s []int64) Int64x8 {
+	l := len(s)
+	if l >= 8 {
+		return LoadInt64x8Slice(s)
+	}
+	if l == 0 {
+		var x Int64x8
+		return x
+	}
+
+	mask := Mask64x8FromBits(0xff >> (8 - l))
+	return LoadMaskedInt64x8(paInt64x8(s), mask)
+}
+
+func (x Int64x8) StoreSlicePart(s []int64) {
+	l := len(s)
+	if l >= 8 {
+		x.StoreSlice(s)
+		return
+	}
+	if l == 0 {
+		return
+	}
+	mask := Mask64x8FromBits(0xff >> (8 - l))
+	x.StoreMasked(paInt64x8(s), mask)
+}
diff --git a/src/simd/slicepart_test.go b/src/simd/slicepart_test.go
index cfdb7581d9..c9492bea1b 100644
--- a/src/simd/slicepart_test.go
+++ b/src/simd/slicepart_test.go
@@ -341,3 +341,50 @@ func TestSlicePartFloat32(t *testing.T) {
 		}
 	}
 }
+
+// 512-bit load and store
+
+func TestSlicePartInt64(t *testing.T) {
+	if !simd.HasAVX512() {
+		t.Skip("Test requires HasAVX512, not available on this hardware")
+		return
+	}
+
+	L := 8
+	c := []int64{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
+	a := c[:L+1]
+	for i := range a {
+		// Test the load first
+		// e is a partial slice.
+		e := a[i:]
+		v := simd.LoadInt64x8SlicePart(e)
+		// d contains what v ought to contain
+		d := make([]int64, L)
+		for j := 0; j < len(e) && j < len(d); j++ {
+			d[j] = e[j]
+		}
+
+		b := make([]int64, L)
+		v.StoreSlice(b)
+		// test the load
+		checkSlicesLogInput(t, b, d, func() { t.Helper(); t.Logf("Len(e)=%d", len(e)) })
+
+		// Test the store
+		f := make([]int64, L+1)
+		for i := range f {
+			f[i] = 99
+		}
+
+		v.StoreSlicePart(f[:len(e)])
+		if len(e) < len(b) {
+			checkSlices(t, f, b[:len(e)])
+		} else {
+			checkSlices(t, f, b)
+		}
+		for i := len(e); i < len(f); i++ {
+			if f[i] != 99 {
+				t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
+			}
+		}
+	}
+}
diff --git a/src/simd/types_amd64.go b/src/simd/types_amd64.go
index 252da021e2..ac8cf3c210 100644
--- a/src/simd/types_amd64.go
+++ b/src/simd/types_amd64.go
@@ -31,12 +31,16 @@ func (x Float32x4) Store(y *[4]float32)
 
 // LoadMaskedFloat32x4 loads a Float32x4 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedFloat32x4(y *[4]float32, mask Mask32x4) Float32x4
 
 // StoreMasked stores a Float32x4 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func (x Float32x4) StoreMasked(y *[4]float32, mask Mask32x4)
@@ -62,12 +66,16 @@ func (x Float64x2) Store(y *[2]float64)
 
 // LoadMaskedFloat64x2 loads a Float64x2 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedFloat64x2(y *[2]float64, mask Mask64x2) Float64x2
 
 // StoreMasked stores a Float64x2 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func (x Float64x2) StoreMasked(y *[2]float64, mask Mask64x2)
@@ -131,12 +139,16 @@ func (x Int32x4) Store(y *[4]int32)
 
 // LoadMaskedInt32x4 loads a Int32x4 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedInt32x4(y *[4]int32, mask Mask32x4) Int32x4
 
 // StoreMasked stores a Int32x4 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func (x Int32x4) StoreMasked(y *[4]int32, mask Mask32x4)
@@ -162,12 +174,16 @@ func (x Int64x2) Store(y *[2]int64)
 
 // LoadMaskedInt64x2 loads a Int64x2 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedInt64x2(y *[2]int64, mask Mask64x2) Int64x2
 
 // StoreMasked stores a Int64x2 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func (x Int64x2) StoreMasked(y *[2]int64, mask Mask64x2)
@@ -231,12 +247,16 @@ func (x Uint32x4) Store(y *[4]uint32)
 
 // LoadMaskedUint32x4 loads a Uint32x4 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedUint32x4(y *[4]uint32, mask Mask32x4) Uint32x4
 
 // StoreMasked stores a Uint32x4 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func (x Uint32x4) StoreMasked(y *[4]uint32, mask Mask32x4)
@@ -262,12 +282,16 @@ func (x Uint64x2) Store(y *[2]uint64)
 
 // LoadMaskedUint64x2 loads a Uint64x2 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedUint64x2(y *[2]uint64, mask Mask64x2) Uint64x2
 
 // StoreMasked stores a Uint64x2 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func (x Uint64x2) StoreMasked(y *[2]uint64, mask Mask64x2)
@@ -295,6 +319,8 @@ func (x Mask8x16) StoreToBits(y *uint64)
 
 // Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 16 bits of y are used.
+//
+// Asm: KMOVB, CPU Feature: AVX512
 func Mask8x16FromBits(y uint16) Mask8x16
 
 // Mask16x8 is a 128-bit SIMD vector of 8 int16
@@ -321,6 +347,8 @@ func (x Mask16x8) StoreToBits(y *uint64)
 
 // Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 8 bits of y are used.
+//
+// Asm: KMOVW, CPU Feature: AVX512
 func Mask16x8FromBits(y uint8) Mask16x8
 
 // Mask32x4 is a 128-bit SIMD vector of 4 int32
@@ -347,6 +375,8 @@ func (x Mask32x4) StoreToBits(y *uint64)
 
 // Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 4 bits of y are used.
+//
+// Asm: KMOVD, CPU Feature: AVX512
 func Mask32x4FromBits(y uint8) Mask32x4
 
 // Mask64x2 is a 128-bit SIMD vector of 2 int64
@@ -373,6 +403,8 @@ func (x Mask64x2) StoreToBits(y *uint64)
 
 // Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 2 bits of y are used.
+//
+// Asm: KMOVQ, CPU Feature: AVX512
 func Mask64x2FromBits(y uint8) Mask64x2
 
 // v256 is a tag type that tells the compiler that this is really 256-bit SIMD
@@ -402,12 +434,16 @@ func (x Float32x8) Store(y *[8]float32)
 
 // LoadMaskedFloat32x8 loads a Float32x8 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedFloat32x8(y *[8]float32, mask Mask32x8) Float32x8
 
 // StoreMasked stores a Float32x8 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func (x Float32x8) StoreMasked(y *[8]float32, mask Mask32x8)
@@ -433,12 +469,16 @@ func (x Float64x4) Store(y *[4]float64)
 
 // LoadMaskedFloat64x4 loads a Float64x4 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedFloat64x4(y *[4]float64, mask Mask64x4) Float64x4
 
 // StoreMasked stores a Float64x4 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func (x Float64x4) StoreMasked(y *[4]float64, mask Mask64x4)
@@ -502,12 +542,16 @@ func (x Int32x8) Store(y *[8]int32)
 
 // LoadMaskedInt32x8 loads a Int32x8 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedInt32x8(y *[8]int32, mask Mask32x8) Int32x8
 
 // StoreMasked stores a Int32x8 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func (x Int32x8) StoreMasked(y *[8]int32, mask Mask32x8)
@@ -533,12 +577,16 @@ func (x Int64x4) Store(y *[4]int64)
 
 // LoadMaskedInt64x4 loads a Int64x4 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedInt64x4(y *[4]int64, mask Mask64x4) Int64x4
 
 // StoreMasked stores a Int64x4 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func (x Int64x4) StoreMasked(y *[4]int64, mask Mask64x4)
@@ -602,12 +650,16 @@ func (x Uint32x8) Store(y *[8]uint32)
 
 // LoadMaskedUint32x8 loads a Uint32x8 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedUint32x8(y *[8]uint32, mask Mask32x8) Uint32x8
 
 // StoreMasked stores a Uint32x8 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
 //go:noescape
 func (x Uint32x8) StoreMasked(y *[8]uint32, mask Mask32x8)
@@ -633,12 +685,16 @@ func (x Uint64x4) Store(y *[4]uint64)
 
 // LoadMaskedUint64x4 loads a Uint64x4 from an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func LoadMaskedUint64x4(y *[4]uint64, mask Mask64x4) Uint64x4
 
 // StoreMasked stores a Uint64x4 to an array,
 // at those elements enabled by mask
 //
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
 //go:noescape
 func (x Uint64x4) StoreMasked(y *[4]uint64, mask Mask64x4)
@@ -666,6 +722,8 @@ func (x Mask8x32) StoreToBits(y *uint64)
 
 // Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 32 bits of y are used.
+//
+// Asm: KMOVB, CPU Feature: AVX512
 func Mask8x32FromBits(y uint32) Mask8x32
 
 // Mask16x16 is a 256-bit SIMD vector of 16 int16
@@ -692,6 +750,8 @@ func (x Mask16x16) StoreToBits(y *uint64)
 
 // Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 16 bits of y are used.
+//
+// Asm: KMOVW, CPU Feature: AVX512
 func Mask16x16FromBits(y uint16) Mask16x16
 
 // Mask32x8 is a 256-bit SIMD vector of 8 int32
@@ -718,6 +778,8 @@ func (x Mask32x8) StoreToBits(y *uint64)
 
 // Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 8 bits of y are used.
+//
+// Asm: KMOVD, CPU Feature: AVX512
 func Mask32x8FromBits(y uint8) Mask32x8
 
 // Mask64x4 is a 256-bit SIMD vector of 4 int64
@@ -744,6 +806,8 @@ func (x Mask64x4) StoreToBits(y *uint64)
 
 // Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 4 bits of y are used.
+//
+// Asm: KMOVQ, CPU Feature: AVX512
 func Mask64x4FromBits(y uint8) Mask64x4
 
 // v512 is a tag type that tells the compiler that this is really 512-bit SIMD
@@ -770,6 +834,22 @@ func LoadFloat32x16(y *[16]float32) Float32x16
 //go:noescape
 func (x Float32x16) Store(y *[16]float32)
 
+// LoadMaskedFloat32x16 loads a Float32x16 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedFloat32x16(y *[16]float32, mask Mask32x16) Float32x16
+
+// StoreMasked stores a Float32x16 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+//
+//go:noescape
+func (x Float32x16) StoreMasked(y *[16]float32, mask Mask32x16)
+
 // Float64x8 is a 512-bit SIMD vector of 8 float64
 type Float64x8 struct {
 	float64x8 v512
 }
@@ -789,6 +869,22 @@ func LoadFloat64x8(y *[8]float64) Float64x8
 //go:noescape
 func (x Float64x8) Store(y *[8]float64)
 
+// LoadMaskedFloat64x8 loads a Float64x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedFloat64x8(y *[8]float64, mask Mask64x8) Float64x8
+
+// StoreMasked stores a Float64x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+//
+//go:noescape
+func (x Float64x8) StoreMasked(y *[8]float64, mask Mask64x8)
+
 // Int8x64 is a 512-bit SIMD vector of 64 int8
 type Int8x64 struct {
 	int8x64 v512
 }
@@ -808,6 +904,22 @@ func LoadInt8x64(y *[64]int8) Int8x64
 //go:noescape
 func (x Int8x64) Store(y *[64]int8)
 
+// LoadMaskedInt8x64 loads a Int8x64 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt8x64(y *[64]int8, mask Mask8x64) Int8x64
+
+// StoreMasked stores a Int8x64 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int8x64) StoreMasked(y *[64]int8, mask Mask8x64)
+
 // Int16x32 is a 512-bit SIMD vector of 32 int16
 type Int16x32 struct {
 	int16x32 v512
 }
@@ -827,6 +939,22 @@ func LoadInt16x32(y *[32]int16) Int16x32
 //go:noescape
 func (x Int16x32) Store(y *[32]int16)
 
+// LoadMaskedInt16x32 loads a Int16x32 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt16x32(y *[32]int16, mask Mask16x32) Int16x32
+
+// StoreMasked stores a Int16x32 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int16x32) StoreMasked(y *[32]int16, mask Mask16x32)
+
 // Int32x16 is a 512-bit SIMD vector of 16 int32
 type Int32x16 struct {
 	int32x16 v512
 }
@@ -846,6 +974,22 @@ func LoadInt32x16(y *[16]int32) Int32x16
 //go:noescape
 func (x Int32x16) Store(y *[16]int32)
 
+// LoadMaskedInt32x16 loads a Int32x16 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt32x16(y *[16]int32, mask Mask32x16) Int32x16
+
+// StoreMasked stores a Int32x16 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int32x16) StoreMasked(y *[16]int32, mask Mask32x16)
+
 // Int64x8 is a 512-bit SIMD vector of 8 int64
 type Int64x8 struct {
 	int64x8 v512
 }
@@ -865,6 +1009,22 @@ func LoadInt64x8(y *[8]int64) Int64x8
 //go:noescape
 func (x Int64x8) Store(y *[8]int64)
 
+// LoadMaskedInt64x8 loads a Int64x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt64x8(y *[8]int64, mask Mask64x8) Int64x8
+
+// StoreMasked stores a Int64x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int64x8) StoreMasked(y *[8]int64, mask Mask64x8)
+
 // Uint8x64 is a 512-bit SIMD vector of 64 uint8
 type Uint8x64 struct {
 	uint8x64 v512
 }
@@ -884,6 +1044,22 @@ func LoadUint8x64(y *[64]uint8) Uint8x64
 //go:noescape
 func (x Uint8x64) Store(y *[64]uint8)
 
+// LoadMaskedUint8x64 loads a Uint8x64 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint8x64(y *[64]uint8, mask Mask8x64) Uint8x64
+
+// StoreMasked stores a Uint8x64 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint8x64) StoreMasked(y *[64]uint8, mask Mask8x64)
+
 // Uint16x32 is a 512-bit SIMD vector of 32 uint16
 type Uint16x32 struct {
 	uint16x32 v512
 }
@@ -903,6 +1079,22 @@ func LoadUint16x32(y *[32]uint16) Uint16x32
 //go:noescape
 func (x Uint16x32) Store(y *[32]uint16)
 
+// LoadMaskedUint16x32 loads a Uint16x32 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint16x32(y *[32]uint16, mask Mask16x32) Uint16x32
+
+// StoreMasked stores a Uint16x32 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint16x32) StoreMasked(y *[32]uint16, mask Mask16x32)
+
 // Uint32x16 is a 512-bit SIMD vector of 16 uint32
 type Uint32x16 struct {
 	uint32x16 v512
 }
@@ -922,6 +1114,22 @@ func LoadUint32x16(y *[16]uint32) Uint32x16
 //go:noescape
 func (x Uint32x16) Store(y *[16]uint32)
 
+// LoadMaskedUint32x16 loads a Uint32x16 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint32x16(y *[16]uint32, mask Mask32x16) Uint32x16
+
+// StoreMasked stores a Uint32x16 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint32x16) StoreMasked(y *[16]uint32, mask Mask32x16)
+
 // Uint64x8 is a 512-bit SIMD vector of 8 uint64
 type Uint64x8 struct {
 	uint64x8 v512
 }
@@ -941,6 +1149,22 @@ func LoadUint64x8(y *[8]uint64) Uint64x8
 //go:noescape
 func (x Uint64x8) Store(y *[8]uint64)
 
+// LoadMaskedUint64x8 loads a Uint64x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint64x8(y *[8]uint64, mask Mask64x8) Uint64x8
+
+// StoreMasked stores a Uint64x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint64x8) StoreMasked(y *[8]uint64, mask Mask64x8)
+
 // Mask8x64 is a 512-bit SIMD vector of 64 int8
 type Mask8x64 struct {
 	int8x64 v512
 }
@@ -965,6 +1189,8 @@ func (x Mask8x64) StoreToBits(y *uint64)
 
 // Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 64 bits of y are used.
+//
+// Asm: KMOVB, CPU Feature: AVX512
 func Mask8x64FromBits(y uint64) Mask8x64
 
 // Mask16x32 is a 512-bit SIMD vector of 32 int16
@@ -991,6 +1217,8 @@ func (x Mask16x32) StoreToBits(y *uint64)
 
 // Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 32 bits of y are used.
+//
+// Asm: KMOVW, CPU Feature: AVX512
 func Mask16x32FromBits(y uint32) Mask16x32
 
 // Mask32x16 is a 512-bit SIMD vector of 16 int32
@@ -1017,6 +1245,8 @@ func (x Mask32x16) StoreToBits(y *uint64)
 
 // Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 16 bits of y are used.
+//
+// Asm: KMOVD, CPU Feature: AVX512
 func Mask32x16FromBits(y uint16) Mask32x16
 
 // Mask64x8 is a 512-bit SIMD vector of 8 int64
@@ -1043,4 +1273,6 @@ func (x Mask64x8) StoreToBits(y *uint64)
 
 // Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 8 bits of y are used.
+//
+// Asm: KMOVQ, CPU Feature: AVX512
 func Mask64x8FromBits(y uint8) Mask64x8
-- 
2.52.0
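
A minimal usage sketch of the slice-part API this CL adds (not part of the patch itself): it assumes a toolchain built from the dev.simd branch with the simd package enabled (e.g. GOEXPERIMENT=simd). LoadInt64x8SlicePart, StoreSlicePart, HasAVX512, and Mask64x8FromBits are the functions introduced or referenced above.

package main

import (
	"fmt"
	"simd"
)

func main() {
	if !simd.HasAVX512() {
		fmt.Println("AVX512 not available; the 512-bit slice-part ops require it")
		return
	}

	// A short slice: only the low 5 of the 8 int64 lanes exist.
	s := []int64{10, 20, 30, 40, 50}

	// Internally this builds the K-register mask 0xff >> (8 - 5) = 0b00011111,
	// so lanes 0-4 are loaded and lanes 5-7 come back as zero.
	v := simd.LoadInt64x8SlicePart(s)

	// StoreSlicePart writes only len(dst) lanes; memory past the end of
	// dst is left untouched, which is what TestSlicePartInt64 verifies.
	dst := make([]int64, len(s))
	v.StoreSlicePart(dst)

	fmt.Println(dst) // [10 20 30 40 50]
}

Routing partial loads and stores through the K mask registers is what keeps a short slice safe: the masked-off lanes generate no memory access, so nothing is read or written past the end of the backing array, unlike a full 64-byte VMOVDQU access.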