floats: []int{32},
}
-var avx512MaskedLoadShapes = &shapes{
+var avx512Shapes = &shapes{ // 512-bit vectors with int/uint elements of 8..64 bits and float32/float64 elements
vecs: []int{512},
ints: []int{8, 16, 32, 64},
uints: []int{8, 16, 32, 64},
floats: []int{32, 64},
}
+var avx2Shapes = &shapes{ // 128- and 256-bit vectors with int/uint elements of 8..64 bits and float32/float64 elements
+ vecs: []int{128, 256},
+ ints: []int{8, 16, 32, 64},
+ uints: []int{8, 16, 32, 64},
+ floats: []int{32, 64},
+}
+
var avx2MaskedLoadShapes = &shapes{
vecs: []int{128, 256},
ints: []int{32, 64},
uints: []int{8, 16},
}
-var unaryFlaky = &shapes{
+var unaryFlaky = &shapes{ // float32/float64 at 128, 256 and 512 bits; for tests that support flaky equality
vecs: []int{128, 256, 512},
floats: []int{32, 64},
}
-var ternaryFlaky = &shapes{
+var ternaryFlaky = &shapes{ // float32 only, at 128, 256 and 512 bits; for tests that support flaky equality
vecs: []int{128, 256, 512},
floats: []int{32},
}
BaseType := strings.ToUpper(baseType[:1]) + baseType[1:]
eType := fmt.Sprintf("%s%d", baseType, width)
wxc := fmt.Sprintf("%dx%d", width, count)
+ bxc := fmt.Sprintf("%dx%d", 8, count*(width/8))
vType := fmt.Sprintf("%s%s", BaseType, wxc)
aOrAn := "a"
if strings.Contains("aeiou", baseType[:1]) {
Width int // the bit width of the element type, e.g. 32
Count int // the number of elements, e.g. 4
WxC string // the width-by-count string, e.g., "32x4"
+ BxC string // as if bytes (width 8), with count Count*(Width/8), e.g., "8x16"
+ Base string // the capitalized Base Type of the vector, e.g., "Float"
Type string // the element type, e.g. "float32"
OxFF string // a mask for the lowest 'count' bits
}{
Width: width,
Count: count,
WxC: wxc,
+ BxC: bxc,
+ Base: BaseType,
Type: eType,
OxFF: oxFF,
})
}
`)
-var avx512MaskedLoadSlicePartTemplate = shapedTemplateOf(avx512MaskedLoadShapes, "avx 512 load slice part", `
+var avx512MaskedLoadSlicePartTemplate = shapedTemplateOf(avx512Shapes, "avx 512 load slice part", `
// Load{{.Vec}}SlicePart loads a {{.Vec}} from the slice s.
// If s has fewer than {{.Count}} elements, the remaining elements of the vector are filled with zeroes.
// If s has {{.Count}} or more elements, the function is equivalent to Load{{.Vec}}Slice.
var x {{.Vec}}
return x
}
-
mask := Mask{{.WxC}}FromBits({{.OxFF}} >> ({{.Count}} - l))
return LoadMasked{{.Vec}}(pa{{.Vec}}(s), mask)
}
}
`)
+// avx2MaskedTemplate emits Masked and Merge methods for every shape in avx2Shapes.
+// Masked ANDs x with the Int{{WxC}} form of the mask. Merge calls blend on the
+// Int{{BxC}} (byte-element) form of the operands and mask, converting to and from
+// that form unless the vector is already an Int vector with WxC equal to BxC.
+var avx2MaskedTemplate = shapedTemplateOf(avx2Shapes, "avx2 .Masked methods", `
+// Masked returns x but with elements zeroed where mask is false.
+func (x {{.Vec}}) Masked(mask Mask{{.WxC}}) {{.Vec}} {
+ im := mask.AsInt{{.WxC}}()
+{{- if eq .Base "Int" }}
+ return im.And(x)
+{{- else}}
+ return x.AsInt{{.WxC}}().And(im).As{{.Vec}}()
+{{- end -}}
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x {{.Vec}}) Merge(y {{.Vec}}, mask Mask{{.WxC}}) {{.Vec}} {
+{{- if eq .BxC .WxC }}
+ im := mask.AsInt{{.BxC}}()
+{{- else}}
+ im := mask.AsInt{{.WxC}}().AsInt{{.BxC}}()
+{{- end -}}
+{{- if and (eq .Base "Int") (eq .BxC .WxC) }}
+ return y.blend(x, im)
+{{- else}}
+ ix := x.AsInt{{.BxC}}()
+ iy := y.AsInt{{.BxC}}()
+ return iy.blend(ix, im).As{{.Vec}}()
+{{- end -}}
+}
+`)
+
+// avx512MaskedTemplate emits Masked and Merge methods for every shape in avx512Shapes.
+// Masked ANDs x with the Int{{WxC}} form of the mask. Merge hands the mask to
+// blendMasked unchanged; non-Int vectors are converted to the Int vector of the
+// same WxC first and converted back afterwards.
+//
+// TODO perhaps write these in ways that work better on AVX512
+var avx512MaskedTemplate = shapedTemplateOf(avx512Shapes, "avx512 .Masked methods", `
+// Masked returns x but with elements zeroed where mask is false.
+func (x {{.Vec}}) Masked(mask Mask{{.WxC}}) {{.Vec}} {
+ im := mask.AsInt{{.WxC}}()
+{{- if eq .Base "Int" }}
+ return im.And(x)
+{{- else}}
+ return x.AsInt{{.WxC}}().And(im).As{{.Vec}}()
+{{- end -}}
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x {{.Vec}}) Merge(y {{.Vec}}, mask Mask{{.WxC}}) {{.Vec}} {
+{{- if eq .Base "Int" }}
+ return y.blendMasked(x, mask)
+{{- else}}
+ ix := x.AsInt{{.WxC}}()
+ iy := y.AsInt{{.WxC}}()
+ return iy.blendMasked(ix, mask).As{{.Vec}}()
+{{- end -}}
+}
+`)
+
func main() {
sl := flag.String("sl", "slice_amd64.go", "file name for slice operations")
ush := flag.String("ush", "unsafe_helpers.go", "file name for unsafe helpers")
flag.Parse()
if *sl != "" {
- one(*sl, prologue, sliceTemplate, avx512MaskedLoadSlicePartTemplate, avx2MaskedLoadSlicePartTemplate, avx2SmallLoadSlicePartTemplate)
+ one(*sl, prologue,
+ sliceTemplate,
+ avx512MaskedLoadSlicePartTemplate,
+ avx2MaskedLoadSlicePartTemplate,
+ avx2SmallLoadSlicePartTemplate,
+ avx2MaskedTemplate,
+ avx512MaskedTemplate,
+ )
}
if *ush != "" {
one(*ush, unsafePrologue, unsafePATemplate)
var x Int8x64
return x
}
-
mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
return LoadMaskedInt8x64(paInt8x64(s), mask)
}
var x Int16x32
return x
}
-
mask := Mask16x32FromBits(0xffffffff >> (32 - l))
return LoadMaskedInt16x32(paInt16x32(s), mask)
}
var x Int32x16
return x
}
-
mask := Mask32x16FromBits(0xffff >> (16 - l))
return LoadMaskedInt32x16(paInt32x16(s), mask)
}
var x Int64x8
return x
}
-
mask := Mask64x8FromBits(0xff >> (8 - l))
return LoadMaskedInt64x8(paInt64x8(s), mask)
}
var x Uint8x64
return x
}
-
mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
return LoadMaskedUint8x64(paUint8x64(s), mask)
}
var x Uint16x32
return x
}
-
mask := Mask16x32FromBits(0xffffffff >> (32 - l))
return LoadMaskedUint16x32(paUint16x32(s), mask)
}
var x Uint32x16
return x
}
-
mask := Mask32x16FromBits(0xffff >> (16 - l))
return LoadMaskedUint32x16(paUint32x16(s), mask)
}
var x Uint64x8
return x
}
-
mask := Mask64x8FromBits(0xff >> (8 - l))
return LoadMaskedUint64x8(paUint64x8(s), mask)
}
var x Float32x16
return x
}
-
mask := Mask32x16FromBits(0xffff >> (16 - l))
return LoadMaskedFloat32x16(paFloat32x16(s), mask)
}
var x Float64x8
return x
}
-
mask := Mask64x8FromBits(0xff >> (8 - l))
return LoadMaskedFloat64x8(paFloat64x8(s), mask)
}
t := unsafe.Slice((*int16)(unsafe.Pointer(&s[0])), len(s))
x.AsInt16x16().StoreSlicePart(t)
}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int8x16) Masked(mask Mask8x16) Int8x16 {
+ im := mask.AsInt8x16() // mask converted to Int8x16 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int8x16) Merge(y Int8x16, mask Mask8x16) Int8x16 {
+ im := mask.AsInt8x16()
+ return y.blend(x, im) // note operand order: blend is invoked on y, with x and the converted mask as arguments
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int16x8) Masked(mask Mask16x8) Int16x8 {
+ im := mask.AsInt16x8() // mask converted to Int16x8 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int16x8) Merge(y Int16x8, mask Mask16x8) Int16x8 {
+ im := mask.AsInt16x8().AsInt8x16() // mask converted to Int8x16 to match ix and iy
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsInt16x8() // blend on the Int8x16 forms, then back to Int16x8
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int32x4) Masked(mask Mask32x4) Int32x4 {
+ im := mask.AsInt32x4() // mask converted to Int32x4 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int32x4) Merge(y Int32x4, mask Mask32x4) Int32x4 {
+ im := mask.AsInt32x4().AsInt8x16() // mask converted to Int8x16 to match ix and iy
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsInt32x4() // blend on the Int8x16 forms, then back to Int32x4
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int64x2) Masked(mask Mask64x2) Int64x2 {
+ im := mask.AsInt64x2() // mask converted to Int64x2 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int64x2) Merge(y Int64x2, mask Mask64x2) Int64x2 {
+ im := mask.AsInt64x2().AsInt8x16() // mask converted to Int8x16 to match ix and iy
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsInt64x2() // blend on the Int8x16 forms, then back to Int64x2
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint8x16) Masked(mask Mask8x16) Uint8x16 {
+ im := mask.AsInt8x16()
+ return x.AsInt8x16().And(im).AsUint8x16() // And on the Int8x16 form of x, then back to Uint8x16
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint8x16) Merge(y Uint8x16, mask Mask8x16) Uint8x16 {
+ im := mask.AsInt8x16() // mask converted to Int8x16 to match ix and iy
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsUint8x16() // blend on the Int8x16 forms, then back to Uint8x16
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint16x8) Masked(mask Mask16x8) Uint16x8 {
+ im := mask.AsInt16x8()
+ return x.AsInt16x8().And(im).AsUint16x8() // And on the Int16x8 form of x, then back to Uint16x8
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint16x8) Merge(y Uint16x8, mask Mask16x8) Uint16x8 {
+ im := mask.AsInt16x8().AsInt8x16() // mask converted to Int8x16 to match ix and iy
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsUint16x8() // blend on the Int8x16 forms, then back to Uint16x8
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint32x4) Masked(mask Mask32x4) Uint32x4 {
+ im := mask.AsInt32x4()
+ return x.AsInt32x4().And(im).AsUint32x4() // And on the Int32x4 form of x, then back to Uint32x4
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint32x4) Merge(y Uint32x4, mask Mask32x4) Uint32x4 {
+ im := mask.AsInt32x4().AsInt8x16() // mask converted to Int8x16 to match ix and iy
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsUint32x4() // blend on the Int8x16 forms, then back to Uint32x4
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint64x2) Masked(mask Mask64x2) Uint64x2 {
+ im := mask.AsInt64x2()
+ return x.AsInt64x2().And(im).AsUint64x2() // And on the Int64x2 form of x, then back to Uint64x2
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint64x2) Merge(y Uint64x2, mask Mask64x2) Uint64x2 {
+ im := mask.AsInt64x2().AsInt8x16() // mask converted to Int8x16 to match ix and iy
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsUint64x2() // blend on the Int8x16 forms, then back to Uint64x2
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float32x4) Masked(mask Mask32x4) Float32x4 {
+ im := mask.AsInt32x4()
+ return x.AsInt32x4().And(im).AsFloat32x4() // And on the Int32x4 form of x, then back to Float32x4
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float32x4) Merge(y Float32x4, mask Mask32x4) Float32x4 {
+ im := mask.AsInt32x4().AsInt8x16() // mask converted to Int8x16 to match ix and iy
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsFloat32x4() // blend on the Int8x16 forms, then back to Float32x4
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float64x2) Masked(mask Mask64x2) Float64x2 {
+ im := mask.AsInt64x2()
+ return x.AsInt64x2().And(im).AsFloat64x2() // And on the Int64x2 form of x, then back to Float64x2
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float64x2) Merge(y Float64x2, mask Mask64x2) Float64x2 {
+ im := mask.AsInt64x2().AsInt8x16() // mask converted to Int8x16 to match ix and iy
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsFloat64x2() // blend on the Int8x16 forms, then back to Float64x2
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int8x32) Masked(mask Mask8x32) Int8x32 {
+ im := mask.AsInt8x32() // mask converted to Int8x32 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int8x32) Merge(y Int8x32, mask Mask8x32) Int8x32 {
+ im := mask.AsInt8x32()
+ return y.blend(x, im) // note operand order: blend is invoked on y, with x and the converted mask as arguments
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int16x16) Masked(mask Mask16x16) Int16x16 {
+ im := mask.AsInt16x16() // mask converted to Int16x16 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int16x16) Merge(y Int16x16, mask Mask16x16) Int16x16 {
+ im := mask.AsInt16x16().AsInt8x32() // mask converted to Int8x32 to match ix and iy
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsInt16x16() // blend on the Int8x32 forms, then back to Int16x16
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int32x8) Masked(mask Mask32x8) Int32x8 {
+ im := mask.AsInt32x8() // mask converted to Int32x8 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int32x8) Merge(y Int32x8, mask Mask32x8) Int32x8 {
+ im := mask.AsInt32x8().AsInt8x32() // mask converted to Int8x32 to match ix and iy
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsInt32x8() // blend on the Int8x32 forms, then back to Int32x8
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int64x4) Masked(mask Mask64x4) Int64x4 {
+ im := mask.AsInt64x4() // mask converted to Int64x4 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int64x4) Merge(y Int64x4, mask Mask64x4) Int64x4 {
+ im := mask.AsInt64x4().AsInt8x32() // mask converted to Int8x32 to match ix and iy
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsInt64x4() // blend on the Int8x32 forms, then back to Int64x4
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint8x32) Masked(mask Mask8x32) Uint8x32 {
+ im := mask.AsInt8x32()
+ return x.AsInt8x32().And(im).AsUint8x32() // And on the Int8x32 form of x, then back to Uint8x32
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint8x32) Merge(y Uint8x32, mask Mask8x32) Uint8x32 {
+ im := mask.AsInt8x32() // mask converted to Int8x32 to match ix and iy
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsUint8x32() // blend on the Int8x32 forms, then back to Uint8x32
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint16x16) Masked(mask Mask16x16) Uint16x16 {
+ im := mask.AsInt16x16()
+ return x.AsInt16x16().And(im).AsUint16x16() // And on the Int16x16 form of x, then back to Uint16x16
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint16x16) Merge(y Uint16x16, mask Mask16x16) Uint16x16 {
+ im := mask.AsInt16x16().AsInt8x32() // mask converted to Int8x32 to match ix and iy
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsUint16x16() // blend on the Int8x32 forms, then back to Uint16x16
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint32x8) Masked(mask Mask32x8) Uint32x8 {
+ im := mask.AsInt32x8()
+ return x.AsInt32x8().And(im).AsUint32x8() // And on the Int32x8 form of x, then back to Uint32x8
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint32x8) Merge(y Uint32x8, mask Mask32x8) Uint32x8 {
+ im := mask.AsInt32x8().AsInt8x32() // mask converted to Int8x32 to match ix and iy
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsUint32x8() // blend on the Int8x32 forms, then back to Uint32x8
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint64x4) Masked(mask Mask64x4) Uint64x4 {
+ im := mask.AsInt64x4()
+ return x.AsInt64x4().And(im).AsUint64x4() // And on the Int64x4 form of x, then back to Uint64x4
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint64x4) Merge(y Uint64x4, mask Mask64x4) Uint64x4 {
+ im := mask.AsInt64x4().AsInt8x32() // mask converted to Int8x32 to match ix and iy
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsUint64x4() // blend on the Int8x32 forms, then back to Uint64x4
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float32x8) Masked(mask Mask32x8) Float32x8 {
+ im := mask.AsInt32x8()
+ return x.AsInt32x8().And(im).AsFloat32x8() // And on the Int32x8 form of x, then back to Float32x8
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float32x8) Merge(y Float32x8, mask Mask32x8) Float32x8 {
+ im := mask.AsInt32x8().AsInt8x32() // mask converted to Int8x32 to match ix and iy
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsFloat32x8() // blend on the Int8x32 forms, then back to Float32x8
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float64x4) Masked(mask Mask64x4) Float64x4 {
+ im := mask.AsInt64x4()
+ return x.AsInt64x4().And(im).AsFloat64x4() // And on the Int64x4 form of x, then back to Float64x4
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float64x4) Merge(y Float64x4, mask Mask64x4) Float64x4 {
+ im := mask.AsInt64x4().AsInt8x32() // mask converted to Int8x32 to match ix and iy
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsFloat64x4() // blend on the Int8x32 forms, then back to Float64x4
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int8x64) Masked(mask Mask8x64) Int8x64 {
+ im := mask.AsInt8x64() // mask converted to Int8x64 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int8x64) Merge(y Int8x64, mask Mask8x64) Int8x64 {
+ return y.blendMasked(x, mask) // mask is passed to blendMasked as-is; blendMasked is invoked on y with x as argument
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int16x32) Masked(mask Mask16x32) Int16x32 {
+ im := mask.AsInt16x32() // mask converted to Int16x32 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int16x32) Merge(y Int16x32, mask Mask16x32) Int16x32 {
+ return y.blendMasked(x, mask) // mask is passed to blendMasked as-is; blendMasked is invoked on y with x as argument
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int32x16) Masked(mask Mask32x16) Int32x16 {
+ im := mask.AsInt32x16() // mask converted to Int32x16 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int32x16) Merge(y Int32x16, mask Mask32x16) Int32x16 {
+ return y.blendMasked(x, mask) // mask is passed to blendMasked as-is; blendMasked is invoked on y with x as argument
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int64x8) Masked(mask Mask64x8) Int64x8 {
+ im := mask.AsInt64x8() // mask converted to Int64x8 so it can be ANDed with x
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int64x8) Merge(y Int64x8, mask Mask64x8) Int64x8 {
+ return y.blendMasked(x, mask) // mask is passed to blendMasked as-is; blendMasked is invoked on y with x as argument
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint8x64) Masked(mask Mask8x64) Uint8x64 {
+ im := mask.AsInt8x64()
+ return x.AsInt8x64().And(im).AsUint8x64() // And on the Int8x64 form of x, then back to Uint8x64
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint8x64) Merge(y Uint8x64, mask Mask8x64) Uint8x64 {
+ ix := x.AsInt8x64()
+ iy := y.AsInt8x64()
+ return iy.blendMasked(ix, mask).AsUint8x64() // blendMasked on the Int8x64 forms with the mask as-is, then back to Uint8x64
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint16x32) Masked(mask Mask16x32) Uint16x32 {
+ im := mask.AsInt16x32()
+ return x.AsInt16x32().And(im).AsUint16x32() // And on the Int16x32 form of x, then back to Uint16x32
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint16x32) Merge(y Uint16x32, mask Mask16x32) Uint16x32 {
+ ix := x.AsInt16x32()
+ iy := y.AsInt16x32()
+ return iy.blendMasked(ix, mask).AsUint16x32() // blendMasked on the Int16x32 forms with the mask as-is, then back to Uint16x32
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint32x16) Masked(mask Mask32x16) Uint32x16 {
+ im := mask.AsInt32x16()
+ return x.AsInt32x16().And(im).AsUint32x16() // And on the Int32x16 form of x, then back to Uint32x16
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint32x16) Merge(y Uint32x16, mask Mask32x16) Uint32x16 {
+ ix := x.AsInt32x16()
+ iy := y.AsInt32x16()
+ return iy.blendMasked(ix, mask).AsUint32x16() // blendMasked on the Int32x16 forms with the mask as-is, then back to Uint32x16
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint64x8) Masked(mask Mask64x8) Uint64x8 {
+ im := mask.AsInt64x8()
+ return x.AsInt64x8().And(im).AsUint64x8() // And on the Int64x8 form of x, then back to Uint64x8
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint64x8) Merge(y Uint64x8, mask Mask64x8) Uint64x8 {
+ ix := x.AsInt64x8()
+ iy := y.AsInt64x8()
+ return iy.blendMasked(ix, mask).AsUint64x8() // blendMasked on the Int64x8 forms with the mask as-is, then back to Uint64x8
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float32x16) Masked(mask Mask32x16) Float32x16 {
+ im := mask.AsInt32x16()
+ return x.AsInt32x16().And(im).AsFloat32x16() // And on the Int32x16 form of x, then back to Float32x16
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float32x16) Merge(y Float32x16, mask Mask32x16) Float32x16 {
+ ix := x.AsInt32x16()
+ iy := y.AsInt32x16()
+ return iy.blendMasked(ix, mask).AsFloat32x16() // blendMasked on the Int32x16 forms with the mask as-is, then back to Float32x16
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float64x8) Masked(mask Mask64x8) Float64x8 {
+ im := mask.AsInt64x8()
+ return x.AsInt64x8().And(im).AsFloat64x8() // And on the Int64x8 form of x, then back to Float64x8
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float64x8) Merge(y Float64x8, mask Mask64x8) Float64x8 {
+ ix := x.AsInt64x8()
+ iy := y.AsInt64x8()
+ return iy.blendMasked(ix, mask).AsFloat64x8() // blendMasked on the Int64x8 forms with the mask as-is, then back to Float64x8
+}