From 2e71cf1a2a6f289cb0d5e1acaca472394d95600e Mon Sep 17 00:00:00 2001
From: Junyang Shao
Date: Fri, 10 Oct 2025 17:42:59 +0000
Subject: [PATCH] [dev.simd] cmd/compile, simd: remove mask load and stores

We already have the mask-to-bits and bits-to-mask conversions; the mask
load and store API is inconsistent with them, and mask loads and stores
can just be hidden behind peepholes. So this CL removes them; the next
CL will add the peepholes.

Change-Id: Ifa7d23fb52bb0efd1785935ead4d703927f16d2b
Reviewed-on: https://go-review.googlesource.com/c/go/+/710915
Reviewed-by: Cherry Mui
LUCI-TryBot-Result: Go LUCI
---
 src/cmd/compile/internal/ssa/_gen/AMD64.rules  |  35 --
 .../compile/internal/ssa/_gen/genericOps.go    |  27 +-
 src/cmd/compile/internal/ssa/opGen.go          | 156 ------
 src/cmd/compile/internal/ssa/rewriteAMD64.go   | 492 ------
 src/cmd/compile/internal/ssagen/intrinsics.go  |  34 --
 .../compile/internal/ssagen/simdintrinsics.go  |  24 -
 src/simd/_gen/simdgen/gen_simdIntrinsics.go    |   2 -
 src/simd/_gen/simdgen/gen_simdTypes.go         |  20 -
 src/simd/internal/simd_test/simd_test.go       |  33 --
 src/simd/types_amd64.go                        | 192 -------
 10 files changed, 1 insertion(+), 1014 deletions(-)

diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 3689c12411..2b44871960 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1641,41 +1641,6 @@
 
 // SIMD lowering rules
 
-// Mask loads
-(LoadMask8x16 ptr mem) => (VPMOVMToVec8x16 (KMOVQload ptr mem))
-(LoadMask8x32 ptr mem) => (VPMOVMToVec8x32 (KMOVQload ptr mem))
-(LoadMask8x64 ptr mem) => (VPMOVMToVec8x64 (KMOVQload ptr mem))
-
-(LoadMask16x8 ptr mem) => (VPMOVMToVec16x8 (KMOVQload ptr mem))
-(LoadMask16x16 ptr mem) => (VPMOVMToVec16x16 (KMOVQload ptr mem))
-(LoadMask16x32 ptr mem) => (VPMOVMToVec16x32 (KMOVQload ptr mem))
-
-(LoadMask32x4 ptr mem) => (VPMOVMToVec32x4 (KMOVQload ptr mem))
-(LoadMask32x8 ptr mem) => (VPMOVMToVec32x8 (KMOVQload ptr mem))
-(LoadMask32x16 ptr mem) => (VPMOVMToVec32x16 (KMOVQload ptr mem))
-
-(LoadMask64x2 ptr mem) => (VPMOVMToVec64x2 (KMOVQload ptr mem))
-(LoadMask64x4 ptr mem) => (VPMOVMToVec64x4 (KMOVQload ptr mem))
-(LoadMask64x8 ptr mem) => (VPMOVMToVec64x8 (KMOVQload ptr mem))
-
-(StoreMask8x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x16ToM val) mem)
-(StoreMask8x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x32ToM val) mem)
-(StoreMask8x64 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x64ToM val) mem)
-
-(StoreMask16x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x8ToM val) mem)
-(StoreMask16x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x16ToM val) mem)
-(StoreMask16x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x32ToM val) mem)
-
-(StoreMask32x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x4ToM val) mem)
-(StoreMask32x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x8ToM val) mem)
-(StoreMask32x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x16ToM val) mem)
-
-(StoreMask64x2 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x2ToM val) mem)
-(StoreMask64x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x4ToM val) mem)
-(StoreMask64x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x8ToM val) mem)
-
-// TODO is this correct? Should we just do it all from 64-bits?
- // Mask conversions // integers to masks (Cvt16toMask8x16 x) => (VPMOVMToVec8x16 (KMOVWk x)) diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go index 6b94fea819..18bd8d7fe9 100644 --- a/src/cmd/compile/internal/ssa/_gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go @@ -676,32 +676,7 @@ var genericOps = []opData{ {name: "PrefetchCacheStreamed", argLength: 2, hasSideEffects: true}, // Do non-temporal or streamed prefetch arg0 to cache. arg0=addr, arg1=memory. // SIMD - {name: "ZeroSIMD", argLength: 0}, // zero value of a vector - {name: "LoadMask8x16", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask8x32", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask8x64", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask16x8", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask16x16", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask16x32", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask32x4", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask32x8", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask32x16", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask64x2", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask64x4", argLength: 2}, // arg0 = ptr, arg1 = mem - {name: "LoadMask64x8", argLength: 2}, // arg0 = ptr, arg1 = mem - - {name: "StoreMask8x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask8x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask8x64", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask16x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask16x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask16x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask32x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask32x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask32x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask64x2", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask64x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. - {name: "StoreMask64x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. 
+ {name: "ZeroSIMD", argLength: 0}, // zero value of a vector // Convert integers to masks {name: "Cvt16toMask8x16", argLength: 1}, // arg0 = integer mask value diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index aef077bb8e..08b6bffd0e 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -5364,30 +5364,6 @@ const ( OpPrefetchCache OpPrefetchCacheStreamed OpZeroSIMD - OpLoadMask8x16 - OpLoadMask8x32 - OpLoadMask8x64 - OpLoadMask16x8 - OpLoadMask16x16 - OpLoadMask16x32 - OpLoadMask32x4 - OpLoadMask32x8 - OpLoadMask32x16 - OpLoadMask64x2 - OpLoadMask64x4 - OpLoadMask64x8 - OpStoreMask8x16 - OpStoreMask8x32 - OpStoreMask8x64 - OpStoreMask16x8 - OpStoreMask16x16 - OpStoreMask16x32 - OpStoreMask32x4 - OpStoreMask32x8 - OpStoreMask32x16 - OpStoreMask64x2 - OpStoreMask64x4 - OpStoreMask64x8 OpCvt16toMask8x16 OpCvt32toMask8x32 OpCvt64toMask8x64 @@ -75965,138 +75941,6 @@ var opcodeTable = [...]opInfo{ argLen: 0, generic: true, }, - { - name: "LoadMask8x16", - argLen: 2, - generic: true, - }, - { - name: "LoadMask8x32", - argLen: 2, - generic: true, - }, - { - name: "LoadMask8x64", - argLen: 2, - generic: true, - }, - { - name: "LoadMask16x8", - argLen: 2, - generic: true, - }, - { - name: "LoadMask16x16", - argLen: 2, - generic: true, - }, - { - name: "LoadMask16x32", - argLen: 2, - generic: true, - }, - { - name: "LoadMask32x4", - argLen: 2, - generic: true, - }, - { - name: "LoadMask32x8", - argLen: 2, - generic: true, - }, - { - name: "LoadMask32x16", - argLen: 2, - generic: true, - }, - { - name: "LoadMask64x2", - argLen: 2, - generic: true, - }, - { - name: "LoadMask64x4", - argLen: 2, - generic: true, - }, - { - name: "LoadMask64x8", - argLen: 2, - generic: true, - }, - { - name: "StoreMask8x16", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask8x32", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask8x64", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask16x8", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask16x16", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask16x32", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask32x4", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask32x8", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask32x16", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask64x2", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask64x4", - auxType: auxTyp, - argLen: 3, - generic: true, - }, - { - name: "StoreMask64x8", - auxType: auxTyp, - argLen: 3, - generic: true, - }, { name: "Cvt16toMask8x16", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 84bb4c1148..5220a0a73c 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -3769,30 +3769,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpLessUint8x64(v) case OpLoad: return rewriteValueAMD64_OpLoad(v) - case OpLoadMask16x16: - return rewriteValueAMD64_OpLoadMask16x16(v) - case OpLoadMask16x32: - return rewriteValueAMD64_OpLoadMask16x32(v) - case OpLoadMask16x8: - return rewriteValueAMD64_OpLoadMask16x8(v) - case OpLoadMask32x16: - return rewriteValueAMD64_OpLoadMask32x16(v) - case OpLoadMask32x4: - return 
rewriteValueAMD64_OpLoadMask32x4(v) - case OpLoadMask32x8: - return rewriteValueAMD64_OpLoadMask32x8(v) - case OpLoadMask64x2: - return rewriteValueAMD64_OpLoadMask64x2(v) - case OpLoadMask64x4: - return rewriteValueAMD64_OpLoadMask64x4(v) - case OpLoadMask64x8: - return rewriteValueAMD64_OpLoadMask64x8(v) - case OpLoadMask8x16: - return rewriteValueAMD64_OpLoadMask8x16(v) - case OpLoadMask8x32: - return rewriteValueAMD64_OpLoadMask8x32(v) - case OpLoadMask8x64: - return rewriteValueAMD64_OpLoadMask8x64(v) case OpLoadMasked16: return rewriteValueAMD64_OpLoadMasked16(v) case OpLoadMasked32: @@ -5636,30 +5612,6 @@ func rewriteValueAMD64(v *Value) bool { return true case OpStore: return rewriteValueAMD64_OpStore(v) - case OpStoreMask16x16: - return rewriteValueAMD64_OpStoreMask16x16(v) - case OpStoreMask16x32: - return rewriteValueAMD64_OpStoreMask16x32(v) - case OpStoreMask16x8: - return rewriteValueAMD64_OpStoreMask16x8(v) - case OpStoreMask32x16: - return rewriteValueAMD64_OpStoreMask32x16(v) - case OpStoreMask32x4: - return rewriteValueAMD64_OpStoreMask32x4(v) - case OpStoreMask32x8: - return rewriteValueAMD64_OpStoreMask32x8(v) - case OpStoreMask64x2: - return rewriteValueAMD64_OpStoreMask64x2(v) - case OpStoreMask64x4: - return rewriteValueAMD64_OpStoreMask64x4(v) - case OpStoreMask64x8: - return rewriteValueAMD64_OpStoreMask64x8(v) - case OpStoreMask8x16: - return rewriteValueAMD64_OpStoreMask8x16(v) - case OpStoreMask8x32: - return rewriteValueAMD64_OpStoreMask8x32(v) - case OpStoreMask8x64: - return rewriteValueAMD64_OpStoreMask8x64(v) case OpStoreMasked16: return rewriteValueAMD64_OpStoreMasked16(v) case OpStoreMasked32: @@ -54997,222 +54949,6 @@ func rewriteValueAMD64_OpLoad(v *Value) bool { } return false } -func rewriteValueAMD64_OpLoadMask16x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask16x16 ptr mem) - // result: (VPMOVMToVec16x16 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec16x16) - v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask16x32(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask16x32 ptr mem) - // result: (VPMOVMToVec16x32 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec16x32) - v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask16x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask16x8 ptr mem) - // result: (VPMOVMToVec16x8 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec16x8) - v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask32x16 ptr mem) - // result: (VPMOVMToVec32x16 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec32x16) - v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask32x4 ptr mem) - // 
result: (VPMOVMToVec32x4 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec32x4) - v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask32x8 ptr mem) - // result: (VPMOVMToVec32x8 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec32x8) - v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask64x2(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask64x2 ptr mem) - // result: (VPMOVMToVec64x2 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec64x2) - v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask64x4 ptr mem) - // result: (VPMOVMToVec64x4 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec64x4) - v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask64x8 ptr mem) - // result: (VPMOVMToVec64x8 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec64x8) - v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask8x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask8x16 ptr mem) - // result: (VPMOVMToVec8x16 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec8x16) - v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask8x32(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask8x32 ptr mem) - // result: (VPMOVMToVec8x32 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec8x32) - v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpLoadMask8x64(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (LoadMask8x64 ptr mem) - // result: (VPMOVMToVec8x64 (KMOVQload ptr mem)) - for { - t := v.Type - ptr := v_0 - mem := v_1 - v.reset(OpAMD64VPMOVMToVec8x64) - v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) - v0.AddArg2(ptr, mem) - v.AddArg(v0) - return true - } -} func rewriteValueAMD64_OpLoadMasked16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -59830,234 +59566,6 @@ func rewriteValueAMD64_OpStore(v *Value) bool { } return false } -func rewriteValueAMD64_OpStoreMask16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask16x16 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec16x16ToM val) 
mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask16x32 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec16x32ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask16x8 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec16x8ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask32x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask32x16 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec32x16ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask32x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask32x4 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec32x4ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask32x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask32x8 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec32x8ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask64x2(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask64x2 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec64x2ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask64x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask64x4 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec64x4ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask64x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask64x8 {t} ptr val mem) - // result: 
(KMOVQstore ptr (VPMOVVec64x8ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask8x16 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec8x16ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask8x32 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec8x32ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} -func rewriteValueAMD64_OpStoreMask8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (StoreMask8x64 {t} ptr val mem) - // result: (KMOVQstore ptr (VPMOVVec8x64ToM val) mem) - for { - t := auxToType(v.Aux) - ptr := v_0 - val := v_1 - mem := v_2 - v.reset(OpAMD64KMOVQstore) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, t) - v0.AddArg(val) - v.AddArg3(ptr, v0, mem) - return true - } -} func rewriteValueAMD64_OpStoreMasked16(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 6561cbe9a2..f663680fc4 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -2024,13 +2024,6 @@ func simdStore() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { } } -var loadMaskOpcodes = map[int]map[int]ssa.Op{ - 8: {16: ssa.OpLoadMask8x16, 32: ssa.OpLoadMask8x32, 64: ssa.OpLoadMask8x64}, - 16: {8: ssa.OpLoadMask16x8, 16: ssa.OpLoadMask16x16, 32: ssa.OpLoadMask16x32}, - 32: {4: ssa.OpLoadMask32x4, 8: ssa.OpLoadMask32x8, 16: ssa.OpLoadMask32x16}, - 64: {2: ssa.OpLoadMask64x2, 4: ssa.OpLoadMask64x4, 8: ssa.OpLoadMask64x8}, -} - var cvtVToMaskOpcodes = map[int]map[int]ssa.Op{ 8: {16: ssa.OpCvt16toMask8x16, 32: ssa.OpCvt32toMask8x32, 64: ssa.OpCvt64toMask8x64}, 16: {8: ssa.OpCvt8toMask16x8, 16: ssa.OpCvt16toMask16x16, 32: ssa.OpCvt32toMask16x32}, @@ -2045,33 +2038,6 @@ var cvtMaskToVOpcodes = map[int]map[int]ssa.Op{ 64: {2: ssa.OpCvtMask64x2to8, 4: ssa.OpCvtMask64x4to8, 8: ssa.OpCvtMask64x8to8}, } -func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - op := loadMaskOpcodes[elemBits][lanes] - if op == 0 { - panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes)) - } - return s.newValue2(op, types.TypeMask, args[0], s.mem()) - } -} - -func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - opCodes := map[int]map[int]ssa.Op{ - 8: {16: ssa.OpStoreMask8x16, 32: ssa.OpStoreMask8x32, 64: ssa.OpStoreMask8x64}, - 16: {8: ssa.OpStoreMask16x8, 16: ssa.OpStoreMask16x16, 32: ssa.OpStoreMask16x32}, - 32: {4: 
ssa.OpStoreMask32x4, 8: ssa.OpStoreMask32x8, 16: ssa.OpStoreMask32x16}, - 64: {2: ssa.OpStoreMask64x2, 4: ssa.OpStoreMask64x4, 8: ssa.OpStoreMask64x8}, - } - op := opCodes[elemBits][lanes] - if op == 0 { - panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes)) - } - s.vars[memVar] = s.newValue3A(op, types.TypeMem, types.TypeMask, args[1], args[0], s.mem()) - return nil - } -} - func simdCvtVToMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { op := cvtVToMaskOpcodes[elemBits][lanes] diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index f2e82d234c..47be7d67a4 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -1685,96 +1685,72 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int8x16.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64) - addF(simdPackage, "Mask8x16.StoreToBits", simdStoreMask(8, 16), sys.AMD64) addF(simdPackage, "Mask8x16FromBits", simdCvtVToMask(8, 16), sys.AMD64) addF(simdPackage, "Mask8x16.ToBits", simdCvtMaskToV(8, 16), sys.AMD64) addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x32.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64) - addF(simdPackage, "Mask8x32.StoreToBits", simdStoreMask(8, 32), sys.AMD64) addF(simdPackage, "Mask8x32FromBits", simdCvtVToMask(8, 32), sys.AMD64) addF(simdPackage, "Mask8x32.ToBits", simdCvtMaskToV(8, 32), sys.AMD64) addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x64.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64) - addF(simdPackage, "Mask8x64.StoreToBits", simdStoreMask(8, 64), sys.AMD64) addF(simdPackage, "Mask8x64FromBits", simdCvtVToMask(8, 64), sys.AMD64) addF(simdPackage, "Mask8x64.ToBits", simdCvtMaskToV(8, 64), sys.AMD64) addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x8.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), 
sys.AMD64) - addF(simdPackage, "Mask16x8.StoreToBits", simdStoreMask(16, 8), sys.AMD64) addF(simdPackage, "Mask16x8FromBits", simdCvtVToMask(16, 8), sys.AMD64) addF(simdPackage, "Mask16x8.ToBits", simdCvtMaskToV(16, 8), sys.AMD64) addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x16.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64) - addF(simdPackage, "Mask16x16.StoreToBits", simdStoreMask(16, 16), sys.AMD64) addF(simdPackage, "Mask16x16FromBits", simdCvtVToMask(16, 16), sys.AMD64) addF(simdPackage, "Mask16x16.ToBits", simdCvtMaskToV(16, 16), sys.AMD64) addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x32.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64) - addF(simdPackage, "Mask16x32.StoreToBits", simdStoreMask(16, 32), sys.AMD64) addF(simdPackage, "Mask16x32FromBits", simdCvtVToMask(16, 32), sys.AMD64) addF(simdPackage, "Mask16x32.ToBits", simdCvtMaskToV(16, 32), sys.AMD64) addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x4.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64) - addF(simdPackage, "Mask32x4.StoreToBits", simdStoreMask(32, 4), sys.AMD64) addF(simdPackage, "Mask32x4FromBits", simdCvtVToMask(32, 4), sys.AMD64) addF(simdPackage, "Mask32x4.ToBits", simdCvtMaskToV(32, 4), sys.AMD64) addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x8.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64) - addF(simdPackage, "Mask32x8.StoreToBits", simdStoreMask(32, 8), sys.AMD64) addF(simdPackage, "Mask32x8FromBits", simdCvtVToMask(32, 8), sys.AMD64) addF(simdPackage, "Mask32x8.ToBits", simdCvtMaskToV(32, 8), sys.AMD64) addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x16.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) 
addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64) - addF(simdPackage, "Mask32x16.StoreToBits", simdStoreMask(32, 16), sys.AMD64) addF(simdPackage, "Mask32x16FromBits", simdCvtVToMask(32, 16), sys.AMD64) addF(simdPackage, "Mask32x16.ToBits", simdCvtMaskToV(32, 16), sys.AMD64) addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x2.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64) - addF(simdPackage, "Mask64x2.StoreToBits", simdStoreMask(64, 2), sys.AMD64) addF(simdPackage, "Mask64x2FromBits", simdCvtVToMask(64, 2), sys.AMD64) addF(simdPackage, "Mask64x2.ToBits", simdCvtMaskToV(64, 2), sys.AMD64) addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x4.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64) - addF(simdPackage, "Mask64x4.StoreToBits", simdStoreMask(64, 4), sys.AMD64) addF(simdPackage, "Mask64x4FromBits", simdCvtVToMask(64, 4), sys.AMD64) addF(simdPackage, "Mask64x4.ToBits", simdCvtMaskToV(64, 4), sys.AMD64) addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x8.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64) - addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64) addF(simdPackage, "Mask64x8FromBits", simdCvtVToMask(64, 8), sys.AMD64) addF(simdPackage, "Mask64x8.ToBits", simdCvtMaskToV(64, 8), sys.AMD64) } diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go index 4b27f7ce5f..a59bd9d658 100644 --- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go +++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go @@ -80,8 +80,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "{{.VectorCounterpart}}.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "{{.Name}}.And", opLen2(ssa.OpAnd{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64) addF(simdPackage, "{{.Name}}.Or", opLen2(ssa.OpOr{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64) - addF(simdPackage, "Load{{.Name}}FromBits", simdLoadMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64) - addF(simdPackage, "{{.Name}}.StoreToBits", simdStoreMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64) addF(simdPackage, "{{.Name}}FromBits", simdCvtVToMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64) addF(simdPackage, "{{.Name}}.ToBits", simdCvtMaskToV({{.ElemBits}}, {{.Lanes}}), sys.AMD64) {{end}} diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go index f13be87f7b..d443fff16e 100644 --- a/src/simd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/_gen/simdgen/gen_simdTypes.go @@ -180,22 +180,6 @@ func Load{{.Name}}(y *[{{.Lanes}}]{{.Base}}) {{.Name}} func (x {{.Name}}) Store(y *[{{.Lanes}}]{{.Base}}) ` -const simdMaskFromBitsTemplate = ` -// Load{{.Name}}FromBits constructs a {{.Name}} from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower {{.Lanes}} bits of y are used. -// -// CPU Features: AVX512 -//go:noescape -func Load{{.Name}}FromBits(y *uint64) {{.Name}} - -// StoreToBits stores a {{.Name}} as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower {{.Lanes}} bits of y are used. -// -// CPU Features: AVX512 -//go:noescape -func (x {{.Name}}) StoreToBits(y *uint64) -` - const simdMaskFromValTemplate = ` // {{.Name}}FromBits constructs a {{.Name}} from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower {{.Lanes}} bits of y are used. 
@@ -503,7 +487,6 @@ func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer { t := templateOf(simdTypesTemplates, "types_amd64") loadStore := templateOf(simdLoadStoreTemplate, "loadstore_amd64") maskedLoadStore := templateOf(simdMaskedLoadStoreTemplate, "maskedloadstore_amd64") - maskFromBits := templateOf(simdMaskFromBitsTemplate, "maskFromBits_amd64") maskFromVal := templateOf(simdMaskFromValTemplate, "maskFromVal_amd64") buffer := new(bytes.Buffer) @@ -542,9 +525,6 @@ func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer { } } } else { - if err := maskFromBits.ExecuteTemplate(buffer, "maskFromBits_amd64", typeDef); err != nil { - panic(fmt.Errorf("failed to execute maskFromBits template for type %s: %w", typeDef.Name, err)) - } if err := maskFromVal.ExecuteTemplate(buffer, "maskFromVal_amd64", typeDef); err != nil { panic(fmt.Errorf("failed to execute maskFromVal template for type %s: %w", typeDef.Name, err)) } diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index d00fcf5dd3..2c866ad68b 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -332,39 +332,6 @@ func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) si } } -func TestBitMaskLoad(t *testing.T) { - if !simd.HasAVX512() { - t.Skip("Test requires HasAVX512, not available on this hardware") - return - } - var bits uint64 = 0b10 - results := [2]int64{} - want := [2]int64{0, 6} - m := simd.LoadMask64x2FromBits(&bits) - simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results) - for i := range 2 { - if results[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i]) - } - } -} - -func TestBitMaskStore(t *testing.T) { - if !simd.HasAVX512() { - t.Skip("Test requires HasAVX512, not available on this hardware") - return - } - var want uint64 = 0b101 - var got uint64 - x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4}) - y := simd.LoadInt32x4Slice([]int32{5, 0, 5, 0}) - m := y.Greater(x) - m.StoreToBits(&got) - if got != want { - t.Errorf("Result incorrect: want %b, got %b", want, got) - } -} - func TestBitMaskFromBits(t *testing.T) { if !simd.HasAVX512() { t.Skip("Test requires HasAVX512, not available on this hardware") diff --git a/src/simd/types_amd64.go b/src/simd/types_amd64.go index 72547c7602..0136f49f91 100644 --- a/src/simd/types_amd64.go +++ b/src/simd/types_amd64.go @@ -301,22 +301,6 @@ type Mask8x16 struct { vals [16]int8 } -// LoadMask8x16FromBits constructs a Mask8x16 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 16 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask8x16FromBits(y *uint64) Mask8x16 - -// StoreToBits stores a Mask8x16 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 16 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask8x16) StoreToBits(y *uint64) - // Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 16 bits of y are used. // @@ -335,22 +319,6 @@ type Mask16x8 struct { vals [8]int16 } -// LoadMask16x8FromBits constructs a Mask16x8 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 8 bits of y are used. 
-// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask16x8FromBits(y *uint64) Mask16x8 - -// StoreToBits stores a Mask16x8 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 8 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask16x8) StoreToBits(y *uint64) - // Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 8 bits of y are used. // @@ -369,22 +337,6 @@ type Mask32x4 struct { vals [4]int32 } -// LoadMask32x4FromBits constructs a Mask32x4 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 4 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask32x4FromBits(y *uint64) Mask32x4 - -// StoreToBits stores a Mask32x4 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 4 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask32x4) StoreToBits(y *uint64) - // Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 4 bits of y are used. // @@ -403,22 +355,6 @@ type Mask64x2 struct { vals [2]int64 } -// LoadMask64x2FromBits constructs a Mask64x2 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 2 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask64x2FromBits(y *uint64) Mask64x2 - -// StoreToBits stores a Mask64x2 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 2 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask64x2) StoreToBits(y *uint64) - // Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 2 bits of y are used. // @@ -728,22 +664,6 @@ type Mask8x32 struct { vals [32]int8 } -// LoadMask8x32FromBits constructs a Mask8x32 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 32 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask8x32FromBits(y *uint64) Mask8x32 - -// StoreToBits stores a Mask8x32 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 32 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask8x32) StoreToBits(y *uint64) - // Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 32 bits of y are used. // @@ -762,22 +682,6 @@ type Mask16x16 struct { vals [16]int16 } -// LoadMask16x16FromBits constructs a Mask16x16 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 16 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask16x16FromBits(y *uint64) Mask16x16 - -// StoreToBits stores a Mask16x16 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 16 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask16x16) StoreToBits(y *uint64) - // Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 16 bits of y are used. 
// @@ -796,22 +700,6 @@ type Mask32x8 struct { vals [8]int32 } -// LoadMask32x8FromBits constructs a Mask32x8 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 8 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask32x8FromBits(y *uint64) Mask32x8 - -// StoreToBits stores a Mask32x8 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 8 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask32x8) StoreToBits(y *uint64) - // Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 8 bits of y are used. // @@ -830,22 +718,6 @@ type Mask64x4 struct { vals [4]int64 } -// LoadMask64x4FromBits constructs a Mask64x4 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 4 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask64x4FromBits(y *uint64) Mask64x4 - -// StoreToBits stores a Mask64x4 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 4 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask64x4) StoreToBits(y *uint64) - // Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 4 bits of y are used. // @@ -1219,22 +1091,6 @@ type Mask8x64 struct { vals [64]int8 } -// LoadMask8x64FromBits constructs a Mask8x64 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 64 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask8x64FromBits(y *uint64) Mask8x64 - -// StoreToBits stores a Mask8x64 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 64 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask8x64) StoreToBits(y *uint64) - // Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 64 bits of y are used. // @@ -1253,22 +1109,6 @@ type Mask16x32 struct { vals [32]int16 } -// LoadMask16x32FromBits constructs a Mask16x32 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 32 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask16x32FromBits(y *uint64) Mask16x32 - -// StoreToBits stores a Mask16x32 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 32 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask16x32) StoreToBits(y *uint64) - // Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 32 bits of y are used. // @@ -1287,22 +1127,6 @@ type Mask32x16 struct { vals [16]int32 } -// LoadMask32x16FromBits constructs a Mask32x16 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 16 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask32x16FromBits(y *uint64) Mask32x16 - -// StoreToBits stores a Mask32x16 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 16 bits of y are used. 
-// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask32x16) StoreToBits(y *uint64) - // Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 16 bits of y are used. // @@ -1321,22 +1145,6 @@ type Mask64x8 struct { vals [8]int64 } -// LoadMask64x8FromBits constructs a Mask64x8 from a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 8 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func LoadMask64x8FromBits(y *uint64) Mask64x8 - -// StoreToBits stores a Mask64x8 as a bitmap, where 1 means set for the indexed element, 0 means unset. -// Only the lower 8 bits of y are used. -// -// CPU Features: AVX512 -// -//go:noescape -func (x Mask64x8) StoreToBits(y *uint64) - // Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 8 bits of y are used. // -- 2.52.0
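
For callers, the removal only changes how a bit pattern reaches a mask: the pointer-based LoadMaskNxMFromBits and StoreToBits functions go away, while the value-based MaskNxMFromBits and ToBits conversions remain. The Go sketch below, adapted from the two tests deleted above, shows one way the same operations could be written against the remaining API. It is an illustration, not part of the CL: the helper names are made up, the exact parameter and result types of Mask64x2FromBits and ToBits are assumptions (the hunks above only show their doc comments), and the import assumes the experimental "simd" package is enabled (GOEXPERIMENT=simd).

	package masksketch

	import "simd" // experimental package; availability assumed

	// maskedAddLow adds two 2-lane vectors but keeps only lane 1, selected by
	// the bit pattern 0b10. The removed API loaded the mask through a pointer
	// (LoadMask64x2FromBits(&bits)); the remaining API builds it from a value.
	// Callers would gate on simd.HasAVX512(), as the deleted tests did.
	func maskedAddLow(a, b []int64) [2]int64 {
		var out [2]int64
		m := simd.Mask64x2FromBits(0b10) // assumed value-based signature
		simd.LoadInt64x2Slice(a).Add(simd.LoadInt64x2Slice(b)).Masked(m).Store(&out)
		return out
	}

	// greaterBits compares two 4-lane vectors and returns the comparison mask
	// as a bitmap. The removed API wrote it through a pointer
	// (m.StoreToBits(&got)); the remaining API returns it as a value.
	func greaterBits(x, y []int32) uint64 {
		m := simd.LoadInt32x4Slice(y).Greater(simd.LoadInt32x4Slice(x))
		return uint64(m.ToBits()) // assumed: ToBits returns the lane bitmap as an integer
	}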
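On the compiler side, the point of the commit message is that the removed generic ops become redundant once loads and stores go through the bitmap form: a scalar load feeding the integer-to-mask conversion (or a scalar store fed by the mask-to-integer conversion) can later be folded into the dedicated K-register memory moves by ordinary AMD64 rewrite rules. Purely as a sketch of the shape such peepholes could take — this is not the follow-up CL, and KMOVWload and KMOVQk in particular are assumed spellings, by analogy with the KMOVWk, KMOVQload, and KMOVQstore ops that appear in the hunks above:

	(KMOVWk (MOVWload ptr mem)) => (KMOVWload ptr mem)
	(KMOVQk (MOVQload ptr mem)) => (KMOVQload ptr mem)

A symmetric rule in the store direction would fold the scalar store of a mask's bits into KMOVQstore.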