Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] simd, cmd/compile: rename some methods
author David Chase <drchase@google.com>
Fri, 1 Aug 2025 19:58:29 +0000 (15:58 -0400)
committer David Chase <drchase@google.com>
Mon, 4 Aug 2025 18:53:11 +0000 (11:53 -0700)
Generated by simdgen CL 692556.

These are the "easy" ones:

SaturatedOp -> OpSaturated
PairwiseOp -> OpPairs
OpWithPrecision -> OpScaled
DiffWithOpWithPrecision -> OpScaledResidue
MulByPowOf2 -> Scale
MulLow -> Mul

Change-Id: I036bf89c0690bcf9922c376d62cef48392942af3
Reviewed-on: https://go-review.googlesource.com/c/go/+/692357
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/cmd/compile/internal/amd64/simdssa.go
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/cmd/compile/internal/ssagen/simdintrinsics.go
src/simd/binary_test.go
src/simd/ops_amd64.go
src/simd/unary_test.go
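
Before the per-file diffs, a rough sketch of how the renames read at simd call sites. The receiver types match the simd package files listed above, but the method signatures are assumptions inferred from the generic op names in the rewrite rules, not verified API:

package sketch

import "simd" // experimental package on the dev.simd branch

// Illustrative only; exact signatures in package simd may differ.
func renamed(x, y simd.Int16x8, f, g simd.Float32x4) {
	_ = x.AddSaturated(y)      // was x.SaturatedAdd(y)
	_ = x.AddPairs(y)          // was x.PairwiseAdd(y)
	_ = x.Mul(y)               // was x.MulLow(y)
	_ = f.Scale(g)             // was f.MulByPowOf2(g)
	_ = f.CeilScaled(2)        // was f.CeilWithPrecision(2); argument form assumed
	_ = f.CeilScaledResidue(2) // was f.DiffWithCeilWithPrecision(2)
}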

src/cmd/compile/internal/amd64/simdssa.go
index 15ffbf66fa7cbba6d88a18b8b81450ac5f875ae1..76ef42576d32c31f4aba74d7cda9a8eb475c26d7 100644 (file)
@@ -80,6 +80,22 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPADDQ128,
                ssa.OpAMD64VPADDQ256,
                ssa.OpAMD64VPADDQ512,
+               ssa.OpAMD64VHADDPS128,
+               ssa.OpAMD64VHADDPS256,
+               ssa.OpAMD64VHADDPD128,
+               ssa.OpAMD64VHADDPD256,
+               ssa.OpAMD64VPHADDW128,
+               ssa.OpAMD64VPHADDW256,
+               ssa.OpAMD64VPHADDD128,
+               ssa.OpAMD64VPHADDD256,
+               ssa.OpAMD64VPHADDSW128,
+               ssa.OpAMD64VPHADDSW256,
+               ssa.OpAMD64VPADDSB128,
+               ssa.OpAMD64VPADDSB256,
+               ssa.OpAMD64VPADDSB512,
+               ssa.OpAMD64VPADDSW128,
+               ssa.OpAMD64VPADDSW256,
+               ssa.OpAMD64VPADDSW512,
                ssa.OpAMD64VADDSUBPS128,
                ssa.OpAMD64VADDSUBPS256,
                ssa.OpAMD64VADDSUBPD128,
@@ -189,12 +205,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VMULPD128,
                ssa.OpAMD64VMULPD256,
                ssa.OpAMD64VMULPD512,
-               ssa.OpAMD64VSCALEFPS128,
-               ssa.OpAMD64VSCALEFPS256,
-               ssa.OpAMD64VSCALEFPS512,
-               ssa.OpAMD64VSCALEFPD128,
-               ssa.OpAMD64VSCALEFPD256,
-               ssa.OpAMD64VSCALEFPD512,
+               ssa.OpAMD64VPMULLW128,
+               ssa.OpAMD64VPMULLW256,
+               ssa.OpAMD64VPMULLW512,
+               ssa.OpAMD64VPMULLD128,
+               ssa.OpAMD64VPMULLD256,
+               ssa.OpAMD64VPMULLD512,
+               ssa.OpAMD64VPMULLQ128,
+               ssa.OpAMD64VPMULLQ256,
+               ssa.OpAMD64VPMULLQ512,
                ssa.OpAMD64VPMULDQ128,
                ssa.OpAMD64VPMULDQ256,
                ssa.OpAMD64VPMULDQ512,
@@ -207,15 +226,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMULHUW128,
                ssa.OpAMD64VPMULHUW256,
                ssa.OpAMD64VPMULHUW512,
-               ssa.OpAMD64VPMULLW128,
-               ssa.OpAMD64VPMULLW256,
-               ssa.OpAMD64VPMULLW512,
-               ssa.OpAMD64VPMULLD128,
-               ssa.OpAMD64VPMULLD256,
-               ssa.OpAMD64VPMULLD512,
-               ssa.OpAMD64VPMULLQ128,
-               ssa.OpAMD64VPMULLQ256,
-               ssa.OpAMD64VPMULLQ512,
                ssa.OpAMD64VPOR128,
                ssa.OpAMD64VPOR256,
                ssa.OpAMD64VPORD512,
@@ -223,22 +233,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMADDWD128,
                ssa.OpAMD64VPMADDWD256,
                ssa.OpAMD64VPMADDWD512,
-               ssa.OpAMD64VHADDPS128,
-               ssa.OpAMD64VHADDPS256,
-               ssa.OpAMD64VHADDPD128,
-               ssa.OpAMD64VHADDPD256,
-               ssa.OpAMD64VPHADDW128,
-               ssa.OpAMD64VPHADDW256,
-               ssa.OpAMD64VPHADDD128,
-               ssa.OpAMD64VPHADDD256,
-               ssa.OpAMD64VHSUBPS128,
-               ssa.OpAMD64VHSUBPS256,
-               ssa.OpAMD64VHSUBPD128,
-               ssa.OpAMD64VHSUBPD256,
-               ssa.OpAMD64VPHSUBW128,
-               ssa.OpAMD64VPHSUBW256,
-               ssa.OpAMD64VPHSUBD128,
-               ssa.OpAMD64VPHSUBD256,
                ssa.OpAMD64VPERMB128,
                ssa.OpAMD64VPERMB256,
                ssa.OpAMD64VPERMB512,
@@ -265,25 +259,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPRORVQ128,
                ssa.OpAMD64VPRORVQ256,
                ssa.OpAMD64VPRORVQ512,
-               ssa.OpAMD64VPADDSB128,
-               ssa.OpAMD64VPADDSB256,
-               ssa.OpAMD64VPADDSB512,
-               ssa.OpAMD64VPADDSW128,
-               ssa.OpAMD64VPADDSW256,
-               ssa.OpAMD64VPADDSW512,
-               ssa.OpAMD64VPHADDSW128,
-               ssa.OpAMD64VPHADDSW256,
-               ssa.OpAMD64VPHSUBSW128,
-               ssa.OpAMD64VPHSUBSW256,
-               ssa.OpAMD64VPSUBSB128,
-               ssa.OpAMD64VPSUBSB256,
-               ssa.OpAMD64VPSUBSB512,
-               ssa.OpAMD64VPSUBSW128,
-               ssa.OpAMD64VPSUBSW256,
-               ssa.OpAMD64VPSUBSW512,
                ssa.OpAMD64VPMADDUBSW128,
                ssa.OpAMD64VPMADDUBSW256,
                ssa.OpAMD64VPMADDUBSW512,
+               ssa.OpAMD64VSCALEFPS128,
+               ssa.OpAMD64VSCALEFPS256,
+               ssa.OpAMD64VSCALEFPS512,
+               ssa.OpAMD64VSCALEFPD128,
+               ssa.OpAMD64VSCALEFPD256,
+               ssa.OpAMD64VSCALEFPD512,
                ssa.OpAMD64VPSLLVW128,
                ssa.OpAMD64VPSLLVW256,
                ssa.OpAMD64VPSLLVW512,
@@ -335,6 +319,22 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPSUBQ128,
                ssa.OpAMD64VPSUBQ256,
                ssa.OpAMD64VPSUBQ512,
+               ssa.OpAMD64VHSUBPS128,
+               ssa.OpAMD64VHSUBPS256,
+               ssa.OpAMD64VHSUBPD128,
+               ssa.OpAMD64VHSUBPD256,
+               ssa.OpAMD64VPHSUBW128,
+               ssa.OpAMD64VPHSUBW256,
+               ssa.OpAMD64VPHSUBD128,
+               ssa.OpAMD64VPHSUBD256,
+               ssa.OpAMD64VPHSUBSW128,
+               ssa.OpAMD64VPHSUBSW256,
+               ssa.OpAMD64VPSUBSB128,
+               ssa.OpAMD64VPSUBSB256,
+               ssa.OpAMD64VPSUBSB512,
+               ssa.OpAMD64VPSUBSW128,
+               ssa.OpAMD64VPSUBSW256,
+               ssa.OpAMD64VPSUBSW512,
                ssa.OpAMD64VPXOR128,
                ssa.OpAMD64VPXOR256,
                ssa.OpAMD64VPXORD512,
@@ -369,6 +369,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPADDQMasked128,
                ssa.OpAMD64VPADDQMasked256,
                ssa.OpAMD64VPADDQMasked512,
+               ssa.OpAMD64VPADDSBMasked128,
+               ssa.OpAMD64VPADDSBMasked256,
+               ssa.OpAMD64VPADDSBMasked512,
+               ssa.OpAMD64VPADDSWMasked128,
+               ssa.OpAMD64VPADDSWMasked256,
+               ssa.OpAMD64VPADDSWMasked512,
                ssa.OpAMD64VPANDDMasked128,
                ssa.OpAMD64VPANDDMasked256,
                ssa.OpAMD64VPANDDMasked512,
@@ -456,12 +462,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMINUQMasked128,
                ssa.OpAMD64VPMINUQMasked256,
                ssa.OpAMD64VPMINUQMasked512,
-               ssa.OpAMD64VSCALEFPSMasked128,
-               ssa.OpAMD64VSCALEFPSMasked256,
-               ssa.OpAMD64VSCALEFPSMasked512,
-               ssa.OpAMD64VSCALEFPDMasked128,
-               ssa.OpAMD64VSCALEFPDMasked256,
-               ssa.OpAMD64VSCALEFPDMasked512,
                ssa.OpAMD64VPMULDQMasked128,
                ssa.OpAMD64VPMULDQMasked256,
                ssa.OpAMD64VPMULDQMasked512,
@@ -474,6 +474,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMULHUWMasked128,
                ssa.OpAMD64VPMULHUWMasked256,
                ssa.OpAMD64VPMULHUWMasked512,
+               ssa.OpAMD64VMULPSMasked128,
+               ssa.OpAMD64VMULPSMasked256,
+               ssa.OpAMD64VMULPSMasked512,
+               ssa.OpAMD64VMULPDMasked128,
+               ssa.OpAMD64VMULPDMasked256,
+               ssa.OpAMD64VMULPDMasked512,
                ssa.OpAMD64VPMULLWMasked128,
                ssa.OpAMD64VPMULLWMasked256,
                ssa.OpAMD64VPMULLWMasked512,
@@ -483,12 +489,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMULLQMasked128,
                ssa.OpAMD64VPMULLQMasked256,
                ssa.OpAMD64VPMULLQMasked512,
-               ssa.OpAMD64VMULPSMasked128,
-               ssa.OpAMD64VMULPSMasked256,
-               ssa.OpAMD64VMULPSMasked512,
-               ssa.OpAMD64VMULPDMasked128,
-               ssa.OpAMD64VMULPDMasked256,
-               ssa.OpAMD64VMULPDMasked512,
                ssa.OpAMD64VPORDMasked128,
                ssa.OpAMD64VPORDMasked256,
                ssa.OpAMD64VPORDMasked512,
@@ -524,21 +524,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPRORVQMasked128,
                ssa.OpAMD64VPRORVQMasked256,
                ssa.OpAMD64VPRORVQMasked512,
-               ssa.OpAMD64VPADDSBMasked128,
-               ssa.OpAMD64VPADDSBMasked256,
-               ssa.OpAMD64VPADDSBMasked512,
-               ssa.OpAMD64VPADDSWMasked128,
-               ssa.OpAMD64VPADDSWMasked256,
-               ssa.OpAMD64VPADDSWMasked512,
-               ssa.OpAMD64VPSUBSBMasked128,
-               ssa.OpAMD64VPSUBSBMasked256,
-               ssa.OpAMD64VPSUBSBMasked512,
-               ssa.OpAMD64VPSUBSWMasked128,
-               ssa.OpAMD64VPSUBSWMasked256,
-               ssa.OpAMD64VPSUBSWMasked512,
                ssa.OpAMD64VPMADDUBSWMasked128,
                ssa.OpAMD64VPMADDUBSWMasked256,
                ssa.OpAMD64VPMADDUBSWMasked512,
+               ssa.OpAMD64VSCALEFPSMasked128,
+               ssa.OpAMD64VSCALEFPSMasked256,
+               ssa.OpAMD64VSCALEFPSMasked512,
+               ssa.OpAMD64VSCALEFPDMasked128,
+               ssa.OpAMD64VSCALEFPDMasked256,
+               ssa.OpAMD64VSCALEFPDMasked512,
                ssa.OpAMD64VPSLLVWMasked128,
                ssa.OpAMD64VPSLLVWMasked256,
                ssa.OpAMD64VPSLLVWMasked512,
@@ -584,6 +578,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPSUBQMasked128,
                ssa.OpAMD64VPSUBQMasked256,
                ssa.OpAMD64VPSUBQMasked512,
+               ssa.OpAMD64VPSUBSBMasked128,
+               ssa.OpAMD64VPSUBSBMasked256,
+               ssa.OpAMD64VPSUBSBMasked512,
+               ssa.OpAMD64VPSUBSWMasked128,
+               ssa.OpAMD64VPSUBSWMasked256,
+               ssa.OpAMD64VPSUBSWMasked512,
                ssa.OpAMD64VPXORDMasked128,
                ssa.OpAMD64VPXORDMasked256,
                ssa.OpAMD64VPXORDMasked512,
@@ -1085,6 +1085,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPADDQMasked128,
                ssa.OpAMD64VPADDQMasked256,
                ssa.OpAMD64VPADDQMasked512,
+               ssa.OpAMD64VPADDSBMasked128,
+               ssa.OpAMD64VPADDSBMasked256,
+               ssa.OpAMD64VPADDSBMasked512,
+               ssa.OpAMD64VPADDSWMasked128,
+               ssa.OpAMD64VPADDSWMasked256,
+               ssa.OpAMD64VPADDSWMasked512,
                ssa.OpAMD64VPANDDMasked128,
                ssa.OpAMD64VPANDDMasked256,
                ssa.OpAMD64VPANDDMasked512,
@@ -1121,6 +1127,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VRNDSCALEPDMasked128,
                ssa.OpAMD64VRNDSCALEPDMasked256,
                ssa.OpAMD64VRNDSCALEPDMasked512,
+               ssa.OpAMD64VREDUCEPSMasked128,
+               ssa.OpAMD64VREDUCEPSMasked256,
+               ssa.OpAMD64VREDUCEPSMasked512,
+               ssa.OpAMD64VREDUCEPDMasked128,
+               ssa.OpAMD64VREDUCEPDMasked256,
+               ssa.OpAMD64VREDUCEPDMasked512,
                ssa.OpAMD64VCOMPRESSPSMasked128,
                ssa.OpAMD64VCOMPRESSPSMasked256,
                ssa.OpAMD64VCOMPRESSPSMasked512,
@@ -1145,12 +1157,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VCVTPS2UDQMasked128,
                ssa.OpAMD64VCVTPS2UDQMasked256,
                ssa.OpAMD64VCVTPS2UDQMasked512,
-               ssa.OpAMD64VREDUCEPSMasked128,
-               ssa.OpAMD64VREDUCEPSMasked256,
-               ssa.OpAMD64VREDUCEPSMasked512,
-               ssa.OpAMD64VREDUCEPDMasked128,
-               ssa.OpAMD64VREDUCEPDMasked256,
-               ssa.OpAMD64VREDUCEPDMasked512,
                ssa.OpAMD64VDIVPSMasked128,
                ssa.OpAMD64VDIVPSMasked256,
                ssa.OpAMD64VDIVPSMasked512,
@@ -1244,12 +1250,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMINUQMasked128,
                ssa.OpAMD64VPMINUQMasked256,
                ssa.OpAMD64VPMINUQMasked512,
-               ssa.OpAMD64VSCALEFPSMasked128,
-               ssa.OpAMD64VSCALEFPSMasked256,
-               ssa.OpAMD64VSCALEFPSMasked512,
-               ssa.OpAMD64VSCALEFPDMasked128,
-               ssa.OpAMD64VSCALEFPDMasked256,
-               ssa.OpAMD64VSCALEFPDMasked512,
                ssa.OpAMD64VPMULDQMasked128,
                ssa.OpAMD64VPMULDQMasked256,
                ssa.OpAMD64VPMULDQMasked512,
@@ -1262,6 +1262,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMULHUWMasked128,
                ssa.OpAMD64VPMULHUWMasked256,
                ssa.OpAMD64VPMULHUWMasked512,
+               ssa.OpAMD64VMULPSMasked128,
+               ssa.OpAMD64VMULPSMasked256,
+               ssa.OpAMD64VMULPSMasked512,
+               ssa.OpAMD64VMULPDMasked128,
+               ssa.OpAMD64VMULPDMasked256,
+               ssa.OpAMD64VMULPDMasked512,
                ssa.OpAMD64VPMULLWMasked128,
                ssa.OpAMD64VPMULLWMasked256,
                ssa.OpAMD64VPMULLWMasked512,
@@ -1271,12 +1277,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMULLQMasked128,
                ssa.OpAMD64VPMULLQMasked256,
                ssa.OpAMD64VPMULLQMasked512,
-               ssa.OpAMD64VMULPSMasked128,
-               ssa.OpAMD64VMULPSMasked256,
-               ssa.OpAMD64VMULPSMasked512,
-               ssa.OpAMD64VMULPDMasked128,
-               ssa.OpAMD64VMULPDMasked256,
-               ssa.OpAMD64VMULPDMasked512,
                ssa.OpAMD64VPORDMasked128,
                ssa.OpAMD64VPORDMasked256,
                ssa.OpAMD64VPORDMasked512,
@@ -1357,24 +1357,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPDPWSSDSMasked128,
                ssa.OpAMD64VPDPWSSDSMasked256,
                ssa.OpAMD64VPDPWSSDSMasked512,
-               ssa.OpAMD64VPADDSBMasked128,
-               ssa.OpAMD64VPADDSBMasked256,
-               ssa.OpAMD64VPADDSBMasked512,
-               ssa.OpAMD64VPADDSWMasked128,
-               ssa.OpAMD64VPADDSWMasked256,
-               ssa.OpAMD64VPADDSWMasked512,
-               ssa.OpAMD64VPSUBSBMasked128,
-               ssa.OpAMD64VPSUBSBMasked256,
-               ssa.OpAMD64VPSUBSBMasked512,
-               ssa.OpAMD64VPSUBSWMasked128,
-               ssa.OpAMD64VPSUBSWMasked256,
-               ssa.OpAMD64VPSUBSWMasked512,
                ssa.OpAMD64VPMADDUBSWMasked128,
                ssa.OpAMD64VPMADDUBSWMasked256,
                ssa.OpAMD64VPMADDUBSWMasked512,
                ssa.OpAMD64VPDPBUSDSMasked128,
                ssa.OpAMD64VPDPBUSDSMasked256,
                ssa.OpAMD64VPDPBUSDSMasked512,
+               ssa.OpAMD64VSCALEFPSMasked128,
+               ssa.OpAMD64VSCALEFPSMasked256,
+               ssa.OpAMD64VSCALEFPSMasked512,
+               ssa.OpAMD64VSCALEFPDMasked128,
+               ssa.OpAMD64VSCALEFPDMasked256,
+               ssa.OpAMD64VSCALEFPDMasked512,
                ssa.OpAMD64VPSHLDWMasked128,
                ssa.OpAMD64VPSHLDWMasked256,
                ssa.OpAMD64VPSHLDWMasked512,
@@ -1489,6 +1483,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPSUBQMasked128,
                ssa.OpAMD64VPSUBQMasked256,
                ssa.OpAMD64VPSUBQMasked512,
+               ssa.OpAMD64VPSUBSBMasked128,
+               ssa.OpAMD64VPSUBSBMasked256,
+               ssa.OpAMD64VPSUBSBMasked512,
+               ssa.OpAMD64VPSUBSWMasked128,
+               ssa.OpAMD64VPSUBSWMasked256,
+               ssa.OpAMD64VPSUBSWMasked512,
                ssa.OpAMD64VPDPBUSDMasked128,
                ssa.OpAMD64VPDPBUSDMasked256,
                ssa.OpAMD64VPDPBUSDMasked512,
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 1d54cfcdbddebb656dea3786cf7fab3a2ad1bee2..060f220c7de758b3d4dee652dcd237794eb56e78 100644 (file)
 (AddMaskedUint64x2 x y mask) => (VPADDQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
 (AddMaskedUint64x4 x y mask) => (VPADDQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (AddMaskedUint64x8 x y mask) => (VPADDQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(AddPairsFloat32x4 ...) => (VHADDPS128 ...)
+(AddPairsFloat32x8 ...) => (VHADDPS256 ...)
+(AddPairsFloat64x2 ...) => (VHADDPD128 ...)
+(AddPairsFloat64x4 ...) => (VHADDPD256 ...)
+(AddPairsInt16x8 ...) => (VPHADDW128 ...)
+(AddPairsInt16x16 ...) => (VPHADDW256 ...)
+(AddPairsInt32x4 ...) => (VPHADDD128 ...)
+(AddPairsInt32x8 ...) => (VPHADDD256 ...)
+(AddPairsUint16x8 ...) => (VPHADDW128 ...)
+(AddPairsUint16x16 ...) => (VPHADDW256 ...)
+(AddPairsUint32x4 ...) => (VPHADDD128 ...)
+(AddPairsUint32x8 ...) => (VPHADDD256 ...)
+(AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
+(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...)
+(AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
+(AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
+(AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
+(AddSaturatedInt16x8 ...) => (VPADDSW128 ...)
+(AddSaturatedInt16x16 ...) => (VPADDSW256 ...)
+(AddSaturatedInt16x32 ...) => (VPADDSW512 ...)
+(AddSaturatedUint8x16 ...) => (VPADDSB128 ...)
+(AddSaturatedUint8x32 ...) => (VPADDSB256 ...)
+(AddSaturatedUint8x64 ...) => (VPADDSB512 ...)
+(AddSaturatedUint16x8 ...) => (VPADDSW128 ...)
+(AddSaturatedUint16x16 ...) => (VPADDSW256 ...)
+(AddSaturatedUint16x32 ...) => (VPADDSW512 ...)
+(AddSaturatedMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
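
All of the masked rules above share one lowering shape: the generic mask operand is converted to a hardware mask register via VPMOVVec*ToM and fed to the AVX-512 Masked opcode. As a semantic reference only (not compiler code), a scalar model of a masked saturated add over int8 lanes; the treatment of unselected lanes (zeroed here) is an explicit assumption, since this diff does not say whether the op zeroes or merges:

// Scalar reference model, not compiler code.
func addSaturatedMaskedInt8(x, y [16]int8, mask [16]bool) [16]int8 {
	var out [16]int8
	for i := range x {
		if !mask[i] {
			continue // unselected lane: left at zero (assumption)
		}
		s := int16(x[i]) + int16(y[i]) // widen so the sum cannot overflow
		if s > 127 {
			s = 127
		} else if s < -128 {
			s = -128
		}
		out[i] = int8(s)
	}
	return out
}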
 (AddSubFloat32x4 ...) => (VADDSUBPS128 ...)
 (AddSubFloat32x8 ...) => (VADDSUBPS256 ...)
 (AddSubFloat64x2 ...) => (VADDSUBPD128 ...)
 (CeilFloat32x8 x) => (VROUNDPS256 [2] x)
 (CeilFloat64x2 x) => (VROUNDPD128 [2] x)
 (CeilFloat64x4 x) => (VROUNDPD256 [2] x)
-(CeilWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+2] x)
-(CeilWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+2] x)
-(CeilWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+2] x)
-(CeilWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+2] x)
-(CeilWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+2] x)
-(CeilWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+2] x)
-(CeilWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(CeilScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+2] x)
+(CeilScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+2] x)
+(CeilScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+2] x)
+(CeilScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+2] x)
+(CeilScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+2] x)
+(CeilScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+2] x)
+(CeilScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(CeilScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x)
+(CeilScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x)
+(CeilScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x)
+(CeilScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+2] x)
+(CeilScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+2] x)
+(CeilScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+2] x)
+(CeilScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (CompressFloat32x4 x mask) => (VCOMPRESSPSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
 (CompressFloat32x8 x mask) => (VCOMPRESSPSMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
 (CompressFloat32x16 x mask) => (VCOMPRESSPSMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
 (ConvertToUint32MaskedFloat32x4 x mask) => (VCVTPS2UDQMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
 (ConvertToUint32MaskedFloat32x8 x mask) => (VCVTPS2UDQMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
 (ConvertToUint32MaskedFloat32x16 x mask) => (VCVTPS2UDQMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x)
-(DiffWithCeilWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x)
-(DiffWithCeilWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+2] x)
-(DiffWithCeilWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+1] x)
-(DiffWithFloorWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+1] x)
-(DiffWithFloorWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+1] x)
-(DiffWithFloorWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x)
-(DiffWithRoundWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x)
-(DiffWithRoundWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
-(DiffWithRoundWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+3] x)
-(DiffWithTruncWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+3] x)
-(DiffWithTruncWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+3] x)
-(DiffWithTruncWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (DivFloat32x4 ...) => (VDIVPS128 ...)
 (DivFloat32x8 ...) => (VDIVPS256 ...)
 (DivFloat32x16 ...) => (VDIVPS512 ...)
 (FloorFloat32x8 x) => (VROUNDPS256 [1] x)
 (FloorFloat64x2 x) => (VROUNDPD128 [1] x)
 (FloorFloat64x4 x) => (VROUNDPD256 [1] x)
-(FloorWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+1] x)
-(FloorWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+1] x)
-(FloorWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+1] x)
-(FloorWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+1] x)
-(FloorWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+1] x)
-(FloorWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+1] x)
-(FloorWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(FloorScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+1] x)
+(FloorScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+1] x)
+(FloorScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+1] x)
+(FloorScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+1] x)
+(FloorScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+1] x)
+(FloorScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+1] x)
+(FloorScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(FloorScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+1] x)
+(FloorScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+1] x)
+(FloorScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+1] x)
+(FloorScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+1] x)
+(FloorScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+1] x)
+(FloorScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+1] x)
+(FloorScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (FusedMultiplyAddFloat32x4 ...) => (VFMADD213PS128 ...)
 (FusedMultiplyAddFloat32x8 ...) => (VFMADD213PS256 ...)
 (FusedMultiplyAddFloat32x16 ...) => (VFMADD213PS512 ...)
 (MulFloat64x2 ...) => (VMULPD128 ...)
 (MulFloat64x4 ...) => (VMULPD256 ...)
 (MulFloat64x8 ...) => (VMULPD512 ...)
-(MulByPowOf2Float32x4 ...) => (VSCALEFPS128 ...)
-(MulByPowOf2Float32x8 ...) => (VSCALEFPS256 ...)
-(MulByPowOf2Float32x16 ...) => (VSCALEFPS512 ...)
-(MulByPowOf2Float64x2 ...) => (VSCALEFPD128 ...)
-(MulByPowOf2Float64x4 ...) => (VSCALEFPD256 ...)
-(MulByPowOf2Float64x8 ...) => (VSCALEFPD512 ...)
-(MulByPowOf2MaskedFloat32x4 x y mask) => (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat32x8 x y mask) => (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat32x16 x y mask) => (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(MulInt16x8 ...) => (VPMULLW128 ...)
+(MulInt16x16 ...) => (VPMULLW256 ...)
+(MulInt16x32 ...) => (VPMULLW512 ...)
+(MulInt32x4 ...) => (VPMULLD128 ...)
+(MulInt32x8 ...) => (VPMULLD256 ...)
+(MulInt32x16 ...) => (VPMULLD512 ...)
+(MulInt64x2 ...) => (VPMULLQ128 ...)
+(MulInt64x4 ...) => (VPMULLQ256 ...)
+(MulInt64x8 ...) => (VPMULLQ512 ...)
 (MulEvenWidenInt32x4 ...) => (VPMULDQ128 ...)
 (MulEvenWidenInt32x8 ...) => (VPMULDQ256 ...)
 (MulEvenWidenInt64x2 ...) => (VPMULDQ128 ...)
 (MulHighMaskedUint16x8 x y mask) => (VPMULHUWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
 (MulHighMaskedUint16x16 x y mask) => (VPMULHUWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (MulHighMaskedUint16x32 x y mask) => (VPMULHUWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(MulLowInt16x8 ...) => (VPMULLW128 ...)
-(MulLowInt16x16 ...) => (VPMULLW256 ...)
-(MulLowInt16x32 ...) => (VPMULLW512 ...)
-(MulLowInt32x4 ...) => (VPMULLD128 ...)
-(MulLowInt32x8 ...) => (VPMULLD256 ...)
-(MulLowInt32x16 ...) => (VPMULLD512 ...)
-(MulLowInt64x2 ...) => (VPMULLQ128 ...)
-(MulLowInt64x4 ...) => (VPMULLQ256 ...)
-(MulLowInt64x8 ...) => (VPMULLQ512 ...)
-(MulLowMaskedInt16x8 x y mask) => (VPMULLWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(MulLowMaskedInt16x16 x y mask) => (VPMULLWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(MulLowMaskedInt16x32 x y mask) => (VPMULLWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(MulLowMaskedInt32x4 x y mask) => (VPMULLDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
-(MulLowMaskedInt32x8 x y mask) => (VPMULLDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
-(MulLowMaskedInt32x16 x y mask) => (VPMULLDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
-(MulLowMaskedInt64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
-(MulLowMaskedInt64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
-(MulLowMaskedInt64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (MulMaskedFloat32x4 x y mask) => (VMULPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
 (MulMaskedFloat32x8 x y mask) => (VMULPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
 (MulMaskedFloat32x16 x y mask) => (VMULPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
 (MulMaskedFloat64x2 x y mask) => (VMULPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
 (MulMaskedFloat64x4 x y mask) => (VMULPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (MulMaskedFloat64x8 x y mask) => (VMULPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(MulMaskedInt16x8 x y mask) => (VPMULLWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(MulMaskedInt16x16 x y mask) => (VPMULLWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(MulMaskedInt16x32 x y mask) => (VPMULLWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(MulMaskedInt32x4 x y mask) => (VPMULLDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
+(MulMaskedInt32x8 x y mask) => (VPMULLDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(MulMaskedInt32x16 x y mask) => (VPMULLDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(MulMaskedInt64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
+(MulMaskedInt64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(MulMaskedInt64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (NotEqualFloat32x4 x y) => (VCMPPS128 [4] x y)
 (NotEqualFloat32x8 x y) => (VCMPPS256 [4] x y)
 (NotEqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [4] x y))
 (PairDotProdMaskedInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
 (PairDotProdMaskedInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (PairDotProdMaskedInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(PairwiseAddFloat32x4 ...) => (VHADDPS128 ...)
-(PairwiseAddFloat32x8 ...) => (VHADDPS256 ...)
-(PairwiseAddFloat64x2 ...) => (VHADDPD128 ...)
-(PairwiseAddFloat64x4 ...) => (VHADDPD256 ...)
-(PairwiseAddInt16x8 ...) => (VPHADDW128 ...)
-(PairwiseAddInt16x16 ...) => (VPHADDW256 ...)
-(PairwiseAddInt32x4 ...) => (VPHADDD128 ...)
-(PairwiseAddInt32x8 ...) => (VPHADDD256 ...)
-(PairwiseAddUint16x8 ...) => (VPHADDW128 ...)
-(PairwiseAddUint16x16 ...) => (VPHADDW256 ...)
-(PairwiseAddUint32x4 ...) => (VPHADDD128 ...)
-(PairwiseAddUint32x8 ...) => (VPHADDD256 ...)
-(PairwiseSubFloat32x4 ...) => (VHSUBPS128 ...)
-(PairwiseSubFloat32x8 ...) => (VHSUBPS256 ...)
-(PairwiseSubFloat64x2 ...) => (VHSUBPD128 ...)
-(PairwiseSubFloat64x4 ...) => (VHSUBPD256 ...)
-(PairwiseSubInt16x8 ...) => (VPHSUBW128 ...)
-(PairwiseSubInt16x16 ...) => (VPHSUBW256 ...)
-(PairwiseSubInt32x4 ...) => (VPHSUBD128 ...)
-(PairwiseSubInt32x8 ...) => (VPHSUBD256 ...)
-(PairwiseSubUint16x8 ...) => (VPHSUBW128 ...)
-(PairwiseSubUint16x16 ...) => (VPHSUBW256 ...)
-(PairwiseSubUint32x4 ...) => (VPHSUBD128 ...)
-(PairwiseSubUint32x8 ...) => (VPHSUBD256 ...)
 (PermuteFloat32x8 ...) => (VPERMPS256 ...)
 (PermuteFloat32x16 ...) => (VPERMPS512 ...)
 (PermuteFloat64x4 ...) => (VPERMPD256 ...)
 (RoundFloat32x8 x) => (VROUNDPS256 [0] x)
 (RoundFloat64x2 x) => (VROUNDPD128 [0] x)
 (RoundFloat64x4 x) => (VROUNDPD256 [0] x)
-(RoundWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x)
-(RoundWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x)
-(RoundWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x)
-(RoundWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x)
-(RoundWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x)
-(RoundWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x)
-(RoundWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(SaturatedAddInt8x16 ...) => (VPADDSB128 ...)
-(SaturatedAddInt8x32 ...) => (VPADDSB256 ...)
-(SaturatedAddInt8x64 ...) => (VPADDSB512 ...)
-(SaturatedAddInt16x8 ...) => (VPADDSW128 ...)
-(SaturatedAddInt16x16 ...) => (VPADDSW256 ...)
-(SaturatedAddInt16x32 ...) => (VPADDSW512 ...)
-(SaturatedAddUint8x16 ...) => (VPADDSB128 ...)
-(SaturatedAddUint8x32 ...) => (VPADDSB256 ...)
-(SaturatedAddUint8x64 ...) => (VPADDSB512 ...)
-(SaturatedAddUint16x8 ...) => (VPADDSW128 ...)
-(SaturatedAddUint16x16 ...) => (VPADDSW256 ...)
-(SaturatedAddUint16x32 ...) => (VPADDSW512 ...)
+(RoundScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x)
+(RoundScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x)
+(RoundScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x)
+(RoundScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x)
+(RoundScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x)
+(RoundScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x)
+(RoundScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(RoundScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x)
+(RoundScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x)
+(RoundScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x)
+(RoundScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
+(RoundScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
+(RoundScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
+(RoundScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (SaturatedAddDotProdInt32x4 ...) => (VPDPWSSDS128 ...)
 (SaturatedAddDotProdInt32x8 ...) => (VPDPWSSDS256 ...)
 (SaturatedAddDotProdInt32x16 ...) => (VPDPWSSDS512 ...)
 (SaturatedAddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
 (SaturatedAddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (SaturatedAddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(SaturatedPairwiseAddInt16x8 ...) => (VPHADDSW128 ...)
-(SaturatedPairwiseAddInt16x16 ...) => (VPHADDSW256 ...)
-(SaturatedPairwiseSubInt16x8 ...) => (VPHSUBSW128 ...)
-(SaturatedPairwiseSubInt16x16 ...) => (VPHSUBSW256 ...)
-(SaturatedSubInt8x16 ...) => (VPSUBSB128 ...)
-(SaturatedSubInt8x32 ...) => (VPSUBSB256 ...)
-(SaturatedSubInt8x64 ...) => (VPSUBSB512 ...)
-(SaturatedSubInt16x8 ...) => (VPSUBSW128 ...)
-(SaturatedSubInt16x16 ...) => (VPSUBSW256 ...)
-(SaturatedSubInt16x32 ...) => (VPSUBSW512 ...)
-(SaturatedSubUint8x16 ...) => (VPSUBSB128 ...)
-(SaturatedSubUint8x32 ...) => (VPSUBSB256 ...)
-(SaturatedSubUint8x64 ...) => (VPSUBSB512 ...)
-(SaturatedSubUint16x8 ...) => (VPSUBSW128 ...)
-(SaturatedSubUint16x16 ...) => (VPSUBSW256 ...)
-(SaturatedSubUint16x32 ...) => (VPSUBSW512 ...)
-(SaturatedSubMaskedInt8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (SaturatedUnsignedSignedPairDotProdUint8x16 ...) => (VPMADDUBSW128 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x32 ...) => (VPMADDUBSW256 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x64 ...) => (VPMADDUBSW512 ...)
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(ScaleFloat32x4 ...) => (VSCALEFPS128 ...)
+(ScaleFloat32x8 ...) => (VSCALEFPS256 ...)
+(ScaleFloat32x16 ...) => (VSCALEFPS512 ...)
+(ScaleFloat64x2 ...) => (VSCALEFPD128 ...)
+(ScaleFloat64x4 ...) => (VSCALEFPD256 ...)
+(ScaleFloat64x8 ...) => (VSCALEFPD512 ...)
+(ScaleMaskedFloat32x4 x y mask) => (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
+(ScaleMaskedFloat32x8 x y mask) => (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(ScaleMaskedFloat32x16 x y mask) => (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(ScaleMaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
+(ScaleMaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(ScaleMaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
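
Scale, the new name for MulByPowOf2, lowers to VSCALEFPS/VSCALEFPD, which per lane computes x * 2^floor(y). A scalar sketch for finite inputs; the instruction's NaN/Inf special cases are out of scope here:

package sketch

import "math"

// scale models the per-lane behavior of VSCALEF* on finite inputs.
func scale(x, y float32) float32 {
	return float32(float64(x) * math.Exp2(math.Floor(float64(y))))
}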
 (Set128Float32x8 ...) => (VINSERTF128256 ...)
 (Set128Float64x4 ...) => (VINSERTF128256 ...)
 (Set128Int8x32 ...) => (VINSERTI128256 ...)
 (SubMaskedUint64x2 x y mask) => (VPSUBQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
 (SubMaskedUint64x4 x y mask) => (VPSUBQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (SubMaskedUint64x8 x y mask) => (VPSUBQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
+(SubPairsFloat32x8 ...) => (VHSUBPS256 ...)
+(SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
+(SubPairsFloat64x4 ...) => (VHSUBPD256 ...)
+(SubPairsInt16x8 ...) => (VPHSUBW128 ...)
+(SubPairsInt16x16 ...) => (VPHSUBW256 ...)
+(SubPairsInt32x4 ...) => (VPHSUBD128 ...)
+(SubPairsInt32x8 ...) => (VPHSUBD256 ...)
+(SubPairsUint16x8 ...) => (VPHSUBW128 ...)
+(SubPairsUint16x16 ...) => (VPHSUBW256 ...)
+(SubPairsUint32x4 ...) => (VPHSUBD128 ...)
+(SubPairsUint32x8 ...) => (VPHSUBD256 ...)
+(SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
+(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...)
+(SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
+(SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
+(SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)
+(SubSaturatedInt16x8 ...) => (VPSUBSW128 ...)
+(SubSaturatedInt16x16 ...) => (VPSUBSW256 ...)
+(SubSaturatedInt16x32 ...) => (VPSUBSW512 ...)
+(SubSaturatedUint8x16 ...) => (VPSUBSB128 ...)
+(SubSaturatedUint8x32 ...) => (VPSUBSB256 ...)
+(SubSaturatedUint8x64 ...) => (VPSUBSB512 ...)
+(SubSaturatedUint16x8 ...) => (VPSUBSW128 ...)
+(SubSaturatedUint16x16 ...) => (VPSUBSW256 ...)
+(SubSaturatedUint16x32 ...) => (VPSUBSW512 ...)
+(SubSaturatedMaskedInt8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (TruncFloat32x4 x) => (VROUNDPS128 [3] x)
 (TruncFloat32x8 x) => (VROUNDPS256 [3] x)
 (TruncFloat64x2 x) => (VROUNDPD128 [3] x)
 (TruncFloat64x4 x) => (VROUNDPD256 [3] x)
-(TruncWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+3] x)
-(TruncWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+3] x)
-(TruncWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+3] x)
-(TruncWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x)
-(TruncWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x)
-(TruncWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x)
-(TruncWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(TruncScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+3] x)
+(TruncScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+3] x)
+(TruncScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+3] x)
+(TruncScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x)
+(TruncScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x)
+(TruncScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x)
+(TruncScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(TruncScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+3] x)
+(TruncScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+3] x)
+(TruncScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+3] x)
+(TruncScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+3] x)
+(TruncScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+3] x)
+(TruncScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+3] x)
+(TruncScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
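
In the Scaled/ScaledResidue rules above, the rewrite adds a small constant to the user immediate before emitting VRNDSCALE*/VREDUCE*: per the instruction reference, bits 1:0 of imm8 select the rounding mode (0 nearest, 1 down, 2 up, 3 truncate) and bits 7:4 carry the precision, so the plain [a+N] add implies the generic op's aux arrives with the mode bits clear. A sketch of the encoding:

// VRNDSCALE*/VREDUCE* imm8 layout: bits 7:4 = precision M (round to 2^-M),
// bits 1:0 = rounding mode.
const (
	roundNearest = 0 // RoundScaled* -> [a+0]
	roundDown    = 1 // FloorScaled* -> [a+1]
	roundUp      = 2 // CeilScaled*  -> [a+2]
	roundTrunc   = 3 // TruncScaled* -> [a+3]
)

// rndscaleImm assembles the immediate from a precision and a mode constant.
func rndscaleImm(precision, mode uint8) uint8 {
	return precision<<4 | mode
}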
 (UnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSD128 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSD256 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSD512 ...)
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 492a994e9363cfac38723a9cb732fc7b4d2f0c7b..ea52254413f7927e21e61ee6682cec947a780fd8 100644 (file)
@@ -81,6 +81,44 @@ func simdGenericOps() []opData {
                {name: "AddMaskedUint64x2", argLength: 3, commutative: true},
                {name: "AddMaskedUint64x4", argLength: 3, commutative: true},
                {name: "AddMaskedUint64x8", argLength: 3, commutative: true},
+               {name: "AddPairsFloat32x4", argLength: 2, commutative: false},
+               {name: "AddPairsFloat32x8", argLength: 2, commutative: false},
+               {name: "AddPairsFloat64x2", argLength: 2, commutative: false},
+               {name: "AddPairsFloat64x4", argLength: 2, commutative: false},
+               {name: "AddPairsInt16x8", argLength: 2, commutative: false},
+               {name: "AddPairsInt16x16", argLength: 2, commutative: false},
+               {name: "AddPairsInt32x4", argLength: 2, commutative: false},
+               {name: "AddPairsInt32x8", argLength: 2, commutative: false},
+               {name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false},
+               {name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false},
+               {name: "AddPairsUint16x8", argLength: 2, commutative: false},
+               {name: "AddPairsUint16x16", argLength: 2, commutative: false},
+               {name: "AddPairsUint32x4", argLength: 2, commutative: false},
+               {name: "AddPairsUint32x8", argLength: 2, commutative: false},
+               {name: "AddSaturatedInt8x16", argLength: 2, commutative: true},
+               {name: "AddSaturatedInt8x32", argLength: 2, commutative: true},
+               {name: "AddSaturatedInt8x64", argLength: 2, commutative: true},
+               {name: "AddSaturatedInt16x8", argLength: 2, commutative: true},
+               {name: "AddSaturatedInt16x16", argLength: 2, commutative: true},
+               {name: "AddSaturatedInt16x32", argLength: 2, commutative: true},
+               {name: "AddSaturatedMaskedInt8x16", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedInt8x32", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedInt8x64", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedInt16x8", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedInt16x16", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedInt16x32", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedUint8x16", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedUint8x32", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedUint8x64", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedUint16x8", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedUint16x16", argLength: 3, commutative: true},
+               {name: "AddSaturatedMaskedUint16x32", argLength: 3, commutative: true},
+               {name: "AddSaturatedUint8x16", argLength: 2, commutative: true},
+               {name: "AddSaturatedUint8x32", argLength: 2, commutative: true},
+               {name: "AddSaturatedUint8x64", argLength: 2, commutative: true},
+               {name: "AddSaturatedUint16x8", argLength: 2, commutative: true},
+               {name: "AddSaturatedUint16x16", argLength: 2, commutative: true},
+               {name: "AddSaturatedUint16x32", argLength: 2, commutative: true},
                {name: "AddSubFloat32x4", argLength: 2, commutative: false},
                {name: "AddSubFloat32x8", argLength: 2, commutative: false},
                {name: "AddSubFloat64x2", argLength: 2, commutative: false},
@@ -744,18 +782,6 @@ func simdGenericOps() []opData {
                {name: "MinUint64x2", argLength: 2, commutative: true},
                {name: "MinUint64x4", argLength: 2, commutative: true},
                {name: "MinUint64x8", argLength: 2, commutative: true},
-               {name: "MulByPowOf2Float32x4", argLength: 2, commutative: false},
-               {name: "MulByPowOf2Float32x8", argLength: 2, commutative: false},
-               {name: "MulByPowOf2Float32x16", argLength: 2, commutative: false},
-               {name: "MulByPowOf2Float64x2", argLength: 2, commutative: false},
-               {name: "MulByPowOf2Float64x4", argLength: 2, commutative: false},
-               {name: "MulByPowOf2Float64x8", argLength: 2, commutative: false},
-               {name: "MulByPowOf2MaskedFloat32x4", argLength: 3, commutative: false},
-               {name: "MulByPowOf2MaskedFloat32x8", argLength: 3, commutative: false},
-               {name: "MulByPowOf2MaskedFloat32x16", argLength: 3, commutative: false},
-               {name: "MulByPowOf2MaskedFloat64x2", argLength: 3, commutative: false},
-               {name: "MulByPowOf2MaskedFloat64x4", argLength: 3, commutative: false},
-               {name: "MulByPowOf2MaskedFloat64x8", argLength: 3, commutative: false},
                {name: "MulEvenWidenInt32x4", argLength: 2, commutative: true},
                {name: "MulEvenWidenInt32x8", argLength: 2, commutative: true},
                {name: "MulEvenWidenInt64x2", argLength: 2, commutative: true},
@@ -790,30 +816,30 @@ func simdGenericOps() []opData {
                {name: "MulHighUint16x8", argLength: 2, commutative: true},
                {name: "MulHighUint16x16", argLength: 2, commutative: true},
                {name: "MulHighUint16x32", argLength: 2, commutative: true},
-               {name: "MulLowInt16x8", argLength: 2, commutative: true},
-               {name: "MulLowInt16x16", argLength: 2, commutative: true},
-               {name: "MulLowInt16x32", argLength: 2, commutative: true},
-               {name: "MulLowInt32x4", argLength: 2, commutative: true},
-               {name: "MulLowInt32x8", argLength: 2, commutative: true},
-               {name: "MulLowInt32x16", argLength: 2, commutative: true},
-               {name: "MulLowInt64x2", argLength: 2, commutative: true},
-               {name: "MulLowInt64x4", argLength: 2, commutative: true},
-               {name: "MulLowInt64x8", argLength: 2, commutative: true},
-               {name: "MulLowMaskedInt16x8", argLength: 3, commutative: true},
-               {name: "MulLowMaskedInt16x16", argLength: 3, commutative: true},
-               {name: "MulLowMaskedInt16x32", argLength: 3, commutative: true},
-               {name: "MulLowMaskedInt32x4", argLength: 3, commutative: true},
-               {name: "MulLowMaskedInt32x8", argLength: 3, commutative: true},
-               {name: "MulLowMaskedInt32x16", argLength: 3, commutative: true},
-               {name: "MulLowMaskedInt64x2", argLength: 3, commutative: true},
-               {name: "MulLowMaskedInt64x4", argLength: 3, commutative: true},
-               {name: "MulLowMaskedInt64x8", argLength: 3, commutative: true},
+               {name: "MulInt16x8", argLength: 2, commutative: true},
+               {name: "MulInt16x16", argLength: 2, commutative: true},
+               {name: "MulInt16x32", argLength: 2, commutative: true},
+               {name: "MulInt32x4", argLength: 2, commutative: true},
+               {name: "MulInt32x8", argLength: 2, commutative: true},
+               {name: "MulInt32x16", argLength: 2, commutative: true},
+               {name: "MulInt64x2", argLength: 2, commutative: true},
+               {name: "MulInt64x4", argLength: 2, commutative: true},
+               {name: "MulInt64x8", argLength: 2, commutative: true},
                {name: "MulMaskedFloat32x4", argLength: 3, commutative: true},
                {name: "MulMaskedFloat32x8", argLength: 3, commutative: true},
                {name: "MulMaskedFloat32x16", argLength: 3, commutative: true},
                {name: "MulMaskedFloat64x2", argLength: 3, commutative: true},
                {name: "MulMaskedFloat64x4", argLength: 3, commutative: true},
                {name: "MulMaskedFloat64x8", argLength: 3, commutative: true},
+               {name: "MulMaskedInt16x8", argLength: 3, commutative: true},
+               {name: "MulMaskedInt16x16", argLength: 3, commutative: true},
+               {name: "MulMaskedInt16x32", argLength: 3, commutative: true},
+               {name: "MulMaskedInt32x4", argLength: 3, commutative: true},
+               {name: "MulMaskedInt32x8", argLength: 3, commutative: true},
+               {name: "MulMaskedInt32x16", argLength: 3, commutative: true},
+               {name: "MulMaskedInt64x2", argLength: 3, commutative: true},
+               {name: "MulMaskedInt64x4", argLength: 3, commutative: true},
+               {name: "MulMaskedInt64x8", argLength: 3, commutative: true},
                {name: "NotEqualFloat32x4", argLength: 2, commutative: true},
                {name: "NotEqualFloat32x8", argLength: 2, commutative: true},
                {name: "NotEqualFloat32x16", argLength: 2, commutative: true},
@@ -916,30 +942,6 @@ func simdGenericOps() []opData {
                {name: "PairDotProdMaskedInt16x8", argLength: 3, commutative: false},
                {name: "PairDotProdMaskedInt16x16", argLength: 3, commutative: false},
                {name: "PairDotProdMaskedInt16x32", argLength: 3, commutative: false},
-               {name: "PairwiseAddFloat32x4", argLength: 2, commutative: false},
-               {name: "PairwiseAddFloat32x8", argLength: 2, commutative: false},
-               {name: "PairwiseAddFloat64x2", argLength: 2, commutative: false},
-               {name: "PairwiseAddFloat64x4", argLength: 2, commutative: false},
-               {name: "PairwiseAddInt16x8", argLength: 2, commutative: false},
-               {name: "PairwiseAddInt16x16", argLength: 2, commutative: false},
-               {name: "PairwiseAddInt32x4", argLength: 2, commutative: false},
-               {name: "PairwiseAddInt32x8", argLength: 2, commutative: false},
-               {name: "PairwiseAddUint16x8", argLength: 2, commutative: false},
-               {name: "PairwiseAddUint16x16", argLength: 2, commutative: false},
-               {name: "PairwiseAddUint32x4", argLength: 2, commutative: false},
-               {name: "PairwiseAddUint32x8", argLength: 2, commutative: false},
-               {name: "PairwiseSubFloat32x4", argLength: 2, commutative: false},
-               {name: "PairwiseSubFloat32x8", argLength: 2, commutative: false},
-               {name: "PairwiseSubFloat64x2", argLength: 2, commutative: false},
-               {name: "PairwiseSubFloat64x4", argLength: 2, commutative: false},
-               {name: "PairwiseSubInt16x8", argLength: 2, commutative: false},
-               {name: "PairwiseSubInt16x16", argLength: 2, commutative: false},
-               {name: "PairwiseSubInt32x4", argLength: 2, commutative: false},
-               {name: "PairwiseSubInt32x8", argLength: 2, commutative: false},
-               {name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
-               {name: "PairwiseSubUint16x16", argLength: 2, commutative: false},
-               {name: "PairwiseSubUint32x4", argLength: 2, commutative: false},
-               {name: "PairwiseSubUint32x8", argLength: 2, commutative: false},
                {name: "Permute2Float32x4", argLength: 3, commutative: false},
                {name: "Permute2Float32x8", argLength: 3, commutative: false},
                {name: "Permute2Float32x16", argLength: 3, commutative: false},
@@ -1154,58 +1156,6 @@ func simdGenericOps() []opData {
                {name: "SaturatedAddDotProdMaskedInt32x4", argLength: 4, commutative: false},
                {name: "SaturatedAddDotProdMaskedInt32x8", argLength: 4, commutative: false},
                {name: "SaturatedAddDotProdMaskedInt32x16", argLength: 4, commutative: false},
-               {name: "SaturatedAddInt8x16", argLength: 2, commutative: true},
-               {name: "SaturatedAddInt8x32", argLength: 2, commutative: true},
-               {name: "SaturatedAddInt8x64", argLength: 2, commutative: true},
-               {name: "SaturatedAddInt16x8", argLength: 2, commutative: true},
-               {name: "SaturatedAddInt16x16", argLength: 2, commutative: true},
-               {name: "SaturatedAddInt16x32", argLength: 2, commutative: true},
-               {name: "SaturatedAddMaskedInt8x16", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedInt8x32", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedInt8x64", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedInt16x8", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedInt16x16", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedInt16x32", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedUint8x16", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedUint8x32", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedUint8x64", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedUint16x8", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedUint16x16", argLength: 3, commutative: true},
-               {name: "SaturatedAddMaskedUint16x32", argLength: 3, commutative: true},
-               {name: "SaturatedAddUint8x16", argLength: 2, commutative: true},
-               {name: "SaturatedAddUint8x32", argLength: 2, commutative: true},
-               {name: "SaturatedAddUint8x64", argLength: 2, commutative: true},
-               {name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
-               {name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
-               {name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
-               {name: "SaturatedPairwiseAddInt16x8", argLength: 2, commutative: false},
-               {name: "SaturatedPairwiseAddInt16x16", argLength: 2, commutative: false},
-               {name: "SaturatedPairwiseSubInt16x8", argLength: 2, commutative: false},
-               {name: "SaturatedPairwiseSubInt16x16", argLength: 2, commutative: false},
-               {name: "SaturatedSubInt8x16", argLength: 2, commutative: false},
-               {name: "SaturatedSubInt8x32", argLength: 2, commutative: false},
-               {name: "SaturatedSubInt8x64", argLength: 2, commutative: false},
-               {name: "SaturatedSubInt16x8", argLength: 2, commutative: false},
-               {name: "SaturatedSubInt16x16", argLength: 2, commutative: false},
-               {name: "SaturatedSubInt16x32", argLength: 2, commutative: false},
-               {name: "SaturatedSubMaskedInt8x16", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedInt8x32", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedInt8x64", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedInt16x8", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedInt16x16", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedInt16x32", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedUint8x16", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedUint8x32", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedUint8x64", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedUint16x8", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedUint16x16", argLength: 3, commutative: false},
-               {name: "SaturatedSubMaskedUint16x32", argLength: 3, commutative: false},
-               {name: "SaturatedSubUint8x16", argLength: 2, commutative: false},
-               {name: "SaturatedSubUint8x32", argLength: 2, commutative: false},
-               {name: "SaturatedSubUint8x64", argLength: 2, commutative: false},
-               {name: "SaturatedSubUint16x8", argLength: 2, commutative: false},
-               {name: "SaturatedSubUint16x16", argLength: 2, commutative: false},
-               {name: "SaturatedSubUint16x32", argLength: 2, commutative: false},
                {name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x16", argLength: 3, commutative: false},
                {name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x32", argLength: 3, commutative: false},
                {name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x64", argLength: 3, commutative: false},
@@ -1218,6 +1168,18 @@ func simdGenericOps() []opData {
                {name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false},
                {name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false},
                {name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false},
+               {name: "ScaleFloat32x4", argLength: 2, commutative: false},
+               {name: "ScaleFloat32x8", argLength: 2, commutative: false},
+               {name: "ScaleFloat32x16", argLength: 2, commutative: false},
+               {name: "ScaleFloat64x2", argLength: 2, commutative: false},
+               {name: "ScaleFloat64x4", argLength: 2, commutative: false},
+               {name: "ScaleFloat64x8", argLength: 2, commutative: false},
+               {name: "ScaleMaskedFloat32x4", argLength: 3, commutative: false},
+               {name: "ScaleMaskedFloat32x8", argLength: 3, commutative: false},
+               {name: "ScaleMaskedFloat32x16", argLength: 3, commutative: false},
+               {name: "ScaleMaskedFloat64x2", argLength: 3, commutative: false},
+               {name: "ScaleMaskedFloat64x4", argLength: 3, commutative: false},
+               {name: "ScaleMaskedFloat64x8", argLength: 3, commutative: false},
                {name: "ShiftAllLeftInt16x8", argLength: 2, commutative: false},
                {name: "ShiftAllLeftInt16x16", argLength: 2, commutative: false},
                {name: "ShiftAllLeftInt16x32", argLength: 2, commutative: false},
@@ -1500,6 +1462,44 @@ func simdGenericOps() []opData {
                {name: "SubMaskedUint64x2", argLength: 3, commutative: false},
                {name: "SubMaskedUint64x4", argLength: 3, commutative: false},
                {name: "SubMaskedUint64x8", argLength: 3, commutative: false},
+               {name: "SubPairsFloat32x4", argLength: 2, commutative: false},
+               {name: "SubPairsFloat32x8", argLength: 2, commutative: false},
+               {name: "SubPairsFloat64x2", argLength: 2, commutative: false},
+               {name: "SubPairsFloat64x4", argLength: 2, commutative: false},
+               {name: "SubPairsInt16x8", argLength: 2, commutative: false},
+               {name: "SubPairsInt16x16", argLength: 2, commutative: false},
+               {name: "SubPairsInt32x4", argLength: 2, commutative: false},
+               {name: "SubPairsInt32x8", argLength: 2, commutative: false},
+               {name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false},
+               {name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false},
+               {name: "SubPairsUint16x8", argLength: 2, commutative: false},
+               {name: "SubPairsUint16x16", argLength: 2, commutative: false},
+               {name: "SubPairsUint32x4", argLength: 2, commutative: false},
+               {name: "SubPairsUint32x8", argLength: 2, commutative: false},
+               {name: "SubSaturatedInt8x16", argLength: 2, commutative: false},
+               {name: "SubSaturatedInt8x32", argLength: 2, commutative: false},
+               {name: "SubSaturatedInt8x64", argLength: 2, commutative: false},
+               {name: "SubSaturatedInt16x8", argLength: 2, commutative: false},
+               {name: "SubSaturatedInt16x16", argLength: 2, commutative: false},
+               {name: "SubSaturatedInt16x32", argLength: 2, commutative: false},
+               {name: "SubSaturatedMaskedInt8x16", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedInt8x32", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedInt8x64", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedInt16x8", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedInt16x16", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedInt16x32", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedUint8x16", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedUint8x32", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedUint8x64", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedUint16x8", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedUint16x16", argLength: 3, commutative: false},
+               {name: "SubSaturatedMaskedUint16x32", argLength: 3, commutative: false},
+               {name: "SubSaturatedUint8x16", argLength: 2, commutative: false},
+               {name: "SubSaturatedUint8x32", argLength: 2, commutative: false},
+               {name: "SubSaturatedUint8x64", argLength: 2, commutative: false},
+               {name: "SubSaturatedUint16x8", argLength: 2, commutative: false},
+               {name: "SubSaturatedUint16x16", argLength: 2, commutative: false},
+               {name: "SubSaturatedUint16x32", argLength: 2, commutative: false},
                {name: "SubUint8x16", argLength: 2, commutative: false},
                {name: "SubUint8x32", argLength: 2, commutative: false},
                {name: "SubUint8x64", argLength: 2, commutative: false},
@@ -1558,78 +1558,54 @@ func simdGenericOps() []opData {
                {name: "XorUint64x2", argLength: 2, commutative: true},
                {name: "XorUint64x4", argLength: 2, commutative: true},
                {name: "XorUint64x8", argLength: 2, commutative: true},
-               {name: "CeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "CeilWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithCeilWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithFloorWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithRoundWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "DiffWithTruncWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "FloorWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "CeilScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "FloorScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
                {name: "GaloisFieldAffineTransformInverseMaskedUint8x16", argLength: 3, commutative: false, aux: "Int8"},
                {name: "GaloisFieldAffineTransformInverseMaskedUint8x32", argLength: 3, commutative: false, aux: "Int8"},
                {name: "GaloisFieldAffineTransformInverseMaskedUint8x64", argLength: 3, commutative: false, aux: "Int8"},
@@ -1708,18 +1684,30 @@ func simdGenericOps() []opData {
                {name: "RotateAllRightUint64x2", argLength: 1, commutative: false, aux: "Int8"},
                {name: "RotateAllRightUint64x4", argLength: 1, commutative: false, aux: "Int8"},
                {name: "RotateAllRightUint64x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "RoundWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "RoundScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
                {name: "Set128Float32x8", argLength: 2, commutative: false, aux: "Int8"},
                {name: "Set128Float64x4", argLength: 2, commutative: false, aux: "Int8"},
                {name: "Set128Int8x32", argLength: 2, commutative: false, aux: "Int8"},
@@ -1810,17 +1798,29 @@ func simdGenericOps() []opData {
                {name: "ShiftAllRightConcatUint64x2", argLength: 2, commutative: false, aux: "Int8"},
                {name: "ShiftAllRightConcatUint64x4", argLength: 2, commutative: false, aux: "Int8"},
                {name: "ShiftAllRightConcatUint64x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-               {name: "TruncWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "TruncScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
        }
 }
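Taken together, the opData churn above is a pure rename: SaturatedAdd*/SaturatedSub* become AddSaturated*/SubSaturated*, PairwiseAdd*/PairwiseSub* become AddPairs*/SubPairs*, MulLow* collapses into Mul*, and MulByPowOf2* becomes Scale*. A hedged sketch of how those names surface as package simd methods, assuming a dev.simd toolchain where import "simd" is available; the vector values are illustrative only:

	import "simd"

	func renamedOps(a, b simd.Int16x8, p, q simd.Int32x4, x, y simd.Float32x4) {
		_ = a.AddSaturated(b) // formerly SaturatedAdd
		_ = a.AddPairs(b)     // formerly PairwiseAdd
		_ = p.Mul(q)          // wrapping low-half multiply, formerly MulLow
		_ = x.Scale(y)        // x * 2**y elementwise, formerly MulByPowOf2
	}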
index e8a5354c00130b5df562de66f8c846db5b501d82..6dcbec2573b8823dbdbef7f8c3c341f4d2ef7016 100644 (file)
@@ -4567,6 +4567,44 @@ const (
        OpAddMaskedUint64x2
        OpAddMaskedUint64x4
        OpAddMaskedUint64x8
+       OpAddPairsFloat32x4
+       OpAddPairsFloat32x8
+       OpAddPairsFloat64x2
+       OpAddPairsFloat64x4
+       OpAddPairsInt16x8
+       OpAddPairsInt16x16
+       OpAddPairsInt32x4
+       OpAddPairsInt32x8
+       OpAddPairsSaturatedInt16x8
+       OpAddPairsSaturatedInt16x16
+       OpAddPairsUint16x8
+       OpAddPairsUint16x16
+       OpAddPairsUint32x4
+       OpAddPairsUint32x8
+       OpAddSaturatedInt8x16
+       OpAddSaturatedInt8x32
+       OpAddSaturatedInt8x64
+       OpAddSaturatedInt16x8
+       OpAddSaturatedInt16x16
+       OpAddSaturatedInt16x32
+       OpAddSaturatedMaskedInt8x16
+       OpAddSaturatedMaskedInt8x32
+       OpAddSaturatedMaskedInt8x64
+       OpAddSaturatedMaskedInt16x8
+       OpAddSaturatedMaskedInt16x16
+       OpAddSaturatedMaskedInt16x32
+       OpAddSaturatedMaskedUint8x16
+       OpAddSaturatedMaskedUint8x32
+       OpAddSaturatedMaskedUint8x64
+       OpAddSaturatedMaskedUint16x8
+       OpAddSaturatedMaskedUint16x16
+       OpAddSaturatedMaskedUint16x32
+       OpAddSaturatedUint8x16
+       OpAddSaturatedUint8x32
+       OpAddSaturatedUint8x64
+       OpAddSaturatedUint16x8
+       OpAddSaturatedUint16x16
+       OpAddSaturatedUint16x32
        OpAddSubFloat32x4
        OpAddSubFloat32x8
        OpAddSubFloat64x2
@@ -5230,18 +5268,6 @@ const (
        OpMinUint64x2
        OpMinUint64x4
        OpMinUint64x8
-       OpMulByPowOf2Float32x4
-       OpMulByPowOf2Float32x8
-       OpMulByPowOf2Float32x16
-       OpMulByPowOf2Float64x2
-       OpMulByPowOf2Float64x4
-       OpMulByPowOf2Float64x8
-       OpMulByPowOf2MaskedFloat32x4
-       OpMulByPowOf2MaskedFloat32x8
-       OpMulByPowOf2MaskedFloat32x16
-       OpMulByPowOf2MaskedFloat64x2
-       OpMulByPowOf2MaskedFloat64x4
-       OpMulByPowOf2MaskedFloat64x8
        OpMulEvenWidenInt32x4
        OpMulEvenWidenInt32x8
        OpMulEvenWidenInt64x2
@@ -5276,30 +5302,30 @@ const (
        OpMulHighUint16x8
        OpMulHighUint16x16
        OpMulHighUint16x32
-       OpMulLowInt16x8
-       OpMulLowInt16x16
-       OpMulLowInt16x32
-       OpMulLowInt32x4
-       OpMulLowInt32x8
-       OpMulLowInt32x16
-       OpMulLowInt64x2
-       OpMulLowInt64x4
-       OpMulLowInt64x8
-       OpMulLowMaskedInt16x8
-       OpMulLowMaskedInt16x16
-       OpMulLowMaskedInt16x32
-       OpMulLowMaskedInt32x4
-       OpMulLowMaskedInt32x8
-       OpMulLowMaskedInt32x16
-       OpMulLowMaskedInt64x2
-       OpMulLowMaskedInt64x4
-       OpMulLowMaskedInt64x8
+       OpMulInt16x8
+       OpMulInt16x16
+       OpMulInt16x32
+       OpMulInt32x4
+       OpMulInt32x8
+       OpMulInt32x16
+       OpMulInt64x2
+       OpMulInt64x4
+       OpMulInt64x8
        OpMulMaskedFloat32x4
        OpMulMaskedFloat32x8
        OpMulMaskedFloat32x16
        OpMulMaskedFloat64x2
        OpMulMaskedFloat64x4
        OpMulMaskedFloat64x8
+       OpMulMaskedInt16x8
+       OpMulMaskedInt16x16
+       OpMulMaskedInt16x32
+       OpMulMaskedInt32x4
+       OpMulMaskedInt32x8
+       OpMulMaskedInt32x16
+       OpMulMaskedInt64x2
+       OpMulMaskedInt64x4
+       OpMulMaskedInt64x8
        OpNotEqualFloat32x4
        OpNotEqualFloat32x8
        OpNotEqualFloat32x16
@@ -5402,30 +5428,6 @@ const (
        OpPairDotProdMaskedInt16x8
        OpPairDotProdMaskedInt16x16
        OpPairDotProdMaskedInt16x32
-       OpPairwiseAddFloat32x4
-       OpPairwiseAddFloat32x8
-       OpPairwiseAddFloat64x2
-       OpPairwiseAddFloat64x4
-       OpPairwiseAddInt16x8
-       OpPairwiseAddInt16x16
-       OpPairwiseAddInt32x4
-       OpPairwiseAddInt32x8
-       OpPairwiseAddUint16x8
-       OpPairwiseAddUint16x16
-       OpPairwiseAddUint32x4
-       OpPairwiseAddUint32x8
-       OpPairwiseSubFloat32x4
-       OpPairwiseSubFloat32x8
-       OpPairwiseSubFloat64x2
-       OpPairwiseSubFloat64x4
-       OpPairwiseSubInt16x8
-       OpPairwiseSubInt16x16
-       OpPairwiseSubInt32x4
-       OpPairwiseSubInt32x8
-       OpPairwiseSubUint16x8
-       OpPairwiseSubUint16x16
-       OpPairwiseSubUint32x4
-       OpPairwiseSubUint32x8
        OpPermute2Float32x4
        OpPermute2Float32x8
        OpPermute2Float32x16
@@ -5640,58 +5642,6 @@ const (
        OpSaturatedAddDotProdMaskedInt32x4
        OpSaturatedAddDotProdMaskedInt32x8
        OpSaturatedAddDotProdMaskedInt32x16
-       OpSaturatedAddInt8x16
-       OpSaturatedAddInt8x32
-       OpSaturatedAddInt8x64
-       OpSaturatedAddInt16x8
-       OpSaturatedAddInt16x16
-       OpSaturatedAddInt16x32
-       OpSaturatedAddMaskedInt8x16
-       OpSaturatedAddMaskedInt8x32
-       OpSaturatedAddMaskedInt8x64
-       OpSaturatedAddMaskedInt16x8
-       OpSaturatedAddMaskedInt16x16
-       OpSaturatedAddMaskedInt16x32
-       OpSaturatedAddMaskedUint8x16
-       OpSaturatedAddMaskedUint8x32
-       OpSaturatedAddMaskedUint8x64
-       OpSaturatedAddMaskedUint16x8
-       OpSaturatedAddMaskedUint16x16
-       OpSaturatedAddMaskedUint16x32
-       OpSaturatedAddUint8x16
-       OpSaturatedAddUint8x32
-       OpSaturatedAddUint8x64
-       OpSaturatedAddUint16x8
-       OpSaturatedAddUint16x16
-       OpSaturatedAddUint16x32
-       OpSaturatedPairwiseAddInt16x8
-       OpSaturatedPairwiseAddInt16x16
-       OpSaturatedPairwiseSubInt16x8
-       OpSaturatedPairwiseSubInt16x16
-       OpSaturatedSubInt8x16
-       OpSaturatedSubInt8x32
-       OpSaturatedSubInt8x64
-       OpSaturatedSubInt16x8
-       OpSaturatedSubInt16x16
-       OpSaturatedSubInt16x32
-       OpSaturatedSubMaskedInt8x16
-       OpSaturatedSubMaskedInt8x32
-       OpSaturatedSubMaskedInt8x64
-       OpSaturatedSubMaskedInt16x8
-       OpSaturatedSubMaskedInt16x16
-       OpSaturatedSubMaskedInt16x32
-       OpSaturatedSubMaskedUint8x16
-       OpSaturatedSubMaskedUint8x32
-       OpSaturatedSubMaskedUint8x64
-       OpSaturatedSubMaskedUint16x8
-       OpSaturatedSubMaskedUint16x16
-       OpSaturatedSubMaskedUint16x32
-       OpSaturatedSubUint8x16
-       OpSaturatedSubUint8x32
-       OpSaturatedSubUint8x64
-       OpSaturatedSubUint16x8
-       OpSaturatedSubUint16x16
-       OpSaturatedSubUint16x32
        OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16
        OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32
        OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64
@@ -5704,6 +5654,18 @@ const (
        OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4
        OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8
        OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16
+       OpScaleFloat32x4
+       OpScaleFloat32x8
+       OpScaleFloat32x16
+       OpScaleFloat64x2
+       OpScaleFloat64x4
+       OpScaleFloat64x8
+       OpScaleMaskedFloat32x4
+       OpScaleMaskedFloat32x8
+       OpScaleMaskedFloat32x16
+       OpScaleMaskedFloat64x2
+       OpScaleMaskedFloat64x4
+       OpScaleMaskedFloat64x8
        OpShiftAllLeftInt16x8
        OpShiftAllLeftInt16x16
        OpShiftAllLeftInt16x32
@@ -5986,6 +5948,44 @@ const (
        OpSubMaskedUint64x2
        OpSubMaskedUint64x4
        OpSubMaskedUint64x8
+       OpSubPairsFloat32x4
+       OpSubPairsFloat32x8
+       OpSubPairsFloat64x2
+       OpSubPairsFloat64x4
+       OpSubPairsInt16x8
+       OpSubPairsInt16x16
+       OpSubPairsInt32x4
+       OpSubPairsInt32x8
+       OpSubPairsSaturatedInt16x8
+       OpSubPairsSaturatedInt16x16
+       OpSubPairsUint16x8
+       OpSubPairsUint16x16
+       OpSubPairsUint32x4
+       OpSubPairsUint32x8
+       OpSubSaturatedInt8x16
+       OpSubSaturatedInt8x32
+       OpSubSaturatedInt8x64
+       OpSubSaturatedInt16x8
+       OpSubSaturatedInt16x16
+       OpSubSaturatedInt16x32
+       OpSubSaturatedMaskedInt8x16
+       OpSubSaturatedMaskedInt8x32
+       OpSubSaturatedMaskedInt8x64
+       OpSubSaturatedMaskedInt16x8
+       OpSubSaturatedMaskedInt16x16
+       OpSubSaturatedMaskedInt16x32
+       OpSubSaturatedMaskedUint8x16
+       OpSubSaturatedMaskedUint8x32
+       OpSubSaturatedMaskedUint8x64
+       OpSubSaturatedMaskedUint16x8
+       OpSubSaturatedMaskedUint16x16
+       OpSubSaturatedMaskedUint16x32
+       OpSubSaturatedUint8x16
+       OpSubSaturatedUint8x32
+       OpSubSaturatedUint8x64
+       OpSubSaturatedUint16x8
+       OpSubSaturatedUint16x16
+       OpSubSaturatedUint16x32
        OpSubUint8x16
        OpSubUint8x32
        OpSubUint8x64
@@ -6044,78 +6044,54 @@ const (
        OpXorUint64x2
        OpXorUint64x4
        OpXorUint64x8
-       OpCeilWithPrecisionFloat32x4
-       OpCeilWithPrecisionFloat32x8
-       OpCeilWithPrecisionFloat32x16
-       OpCeilWithPrecisionFloat64x2
-       OpCeilWithPrecisionFloat64x4
-       OpCeilWithPrecisionFloat64x8
-       OpCeilWithPrecisionMaskedFloat32x4
-       OpCeilWithPrecisionMaskedFloat32x8
-       OpCeilWithPrecisionMaskedFloat32x16
-       OpCeilWithPrecisionMaskedFloat64x2
-       OpCeilWithPrecisionMaskedFloat64x4
-       OpCeilWithPrecisionMaskedFloat64x8
-       OpDiffWithCeilWithPrecisionFloat32x4
-       OpDiffWithCeilWithPrecisionFloat32x8
-       OpDiffWithCeilWithPrecisionFloat32x16
-       OpDiffWithCeilWithPrecisionFloat64x2
-       OpDiffWithCeilWithPrecisionFloat64x4
-       OpDiffWithCeilWithPrecisionFloat64x8
-       OpDiffWithCeilWithPrecisionMaskedFloat32x4
-       OpDiffWithCeilWithPrecisionMaskedFloat32x8
-       OpDiffWithCeilWithPrecisionMaskedFloat32x16
-       OpDiffWithCeilWithPrecisionMaskedFloat64x2
-       OpDiffWithCeilWithPrecisionMaskedFloat64x4
-       OpDiffWithCeilWithPrecisionMaskedFloat64x8
-       OpDiffWithFloorWithPrecisionFloat32x4
-       OpDiffWithFloorWithPrecisionFloat32x8
-       OpDiffWithFloorWithPrecisionFloat32x16
-       OpDiffWithFloorWithPrecisionFloat64x2
-       OpDiffWithFloorWithPrecisionFloat64x4
-       OpDiffWithFloorWithPrecisionFloat64x8
-       OpDiffWithFloorWithPrecisionMaskedFloat32x4
-       OpDiffWithFloorWithPrecisionMaskedFloat32x8
-       OpDiffWithFloorWithPrecisionMaskedFloat32x16
-       OpDiffWithFloorWithPrecisionMaskedFloat64x2
-       OpDiffWithFloorWithPrecisionMaskedFloat64x4
-       OpDiffWithFloorWithPrecisionMaskedFloat64x8
-       OpDiffWithRoundWithPrecisionFloat32x4
-       OpDiffWithRoundWithPrecisionFloat32x8
-       OpDiffWithRoundWithPrecisionFloat32x16
-       OpDiffWithRoundWithPrecisionFloat64x2
-       OpDiffWithRoundWithPrecisionFloat64x4
-       OpDiffWithRoundWithPrecisionFloat64x8
-       OpDiffWithRoundWithPrecisionMaskedFloat32x4
-       OpDiffWithRoundWithPrecisionMaskedFloat32x8
-       OpDiffWithRoundWithPrecisionMaskedFloat32x16
-       OpDiffWithRoundWithPrecisionMaskedFloat64x2
-       OpDiffWithRoundWithPrecisionMaskedFloat64x4
-       OpDiffWithRoundWithPrecisionMaskedFloat64x8
-       OpDiffWithTruncWithPrecisionFloat32x4
-       OpDiffWithTruncWithPrecisionFloat32x8
-       OpDiffWithTruncWithPrecisionFloat32x16
-       OpDiffWithTruncWithPrecisionFloat64x2
-       OpDiffWithTruncWithPrecisionFloat64x4
-       OpDiffWithTruncWithPrecisionFloat64x8
-       OpDiffWithTruncWithPrecisionMaskedFloat32x4
-       OpDiffWithTruncWithPrecisionMaskedFloat32x8
-       OpDiffWithTruncWithPrecisionMaskedFloat32x16
-       OpDiffWithTruncWithPrecisionMaskedFloat64x2
-       OpDiffWithTruncWithPrecisionMaskedFloat64x4
-       OpDiffWithTruncWithPrecisionMaskedFloat64x8
-       OpFloorWithPrecisionFloat32x4
-       OpFloorWithPrecisionFloat32x8
-       OpFloorWithPrecisionFloat32x16
-       OpFloorWithPrecisionFloat64x2
-       OpFloorWithPrecisionFloat64x4
-       OpFloorWithPrecisionFloat64x8
-       OpFloorWithPrecisionMaskedFloat32x4
-       OpFloorWithPrecisionMaskedFloat32x8
-       OpFloorWithPrecisionMaskedFloat32x16
-       OpFloorWithPrecisionMaskedFloat64x2
-       OpFloorWithPrecisionMaskedFloat64x4
-       OpFloorWithPrecisionMaskedFloat64x8
+       OpCeilScaledFloat32x4
+       OpCeilScaledFloat32x8
+       OpCeilScaledFloat32x16
+       OpCeilScaledFloat64x2
+       OpCeilScaledFloat64x4
+       OpCeilScaledFloat64x8
+       OpCeilScaledMaskedFloat32x4
+       OpCeilScaledMaskedFloat32x8
+       OpCeilScaledMaskedFloat32x16
+       OpCeilScaledMaskedFloat64x2
+       OpCeilScaledMaskedFloat64x4
+       OpCeilScaledMaskedFloat64x8
+       OpCeilScaledResidueFloat32x4
+       OpCeilScaledResidueFloat32x8
+       OpCeilScaledResidueFloat32x16
+       OpCeilScaledResidueFloat64x2
+       OpCeilScaledResidueFloat64x4
+       OpCeilScaledResidueFloat64x8
+       OpCeilScaledResidueMaskedFloat32x4
+       OpCeilScaledResidueMaskedFloat32x8
+       OpCeilScaledResidueMaskedFloat32x16
+       OpCeilScaledResidueMaskedFloat64x2
+       OpCeilScaledResidueMaskedFloat64x4
+       OpCeilScaledResidueMaskedFloat64x8
+       OpFloorScaledFloat32x4
+       OpFloorScaledFloat32x8
+       OpFloorScaledFloat32x16
+       OpFloorScaledFloat64x2
+       OpFloorScaledFloat64x4
+       OpFloorScaledFloat64x8
+       OpFloorScaledMaskedFloat32x4
+       OpFloorScaledMaskedFloat32x8
+       OpFloorScaledMaskedFloat32x16
+       OpFloorScaledMaskedFloat64x2
+       OpFloorScaledMaskedFloat64x4
+       OpFloorScaledMaskedFloat64x8
+       OpFloorScaledResidueFloat32x4
+       OpFloorScaledResidueFloat32x8
+       OpFloorScaledResidueFloat32x16
+       OpFloorScaledResidueFloat64x2
+       OpFloorScaledResidueFloat64x4
+       OpFloorScaledResidueFloat64x8
+       OpFloorScaledResidueMaskedFloat32x4
+       OpFloorScaledResidueMaskedFloat32x8
+       OpFloorScaledResidueMaskedFloat32x16
+       OpFloorScaledResidueMaskedFloat64x2
+       OpFloorScaledResidueMaskedFloat64x4
+       OpFloorScaledResidueMaskedFloat64x8
        OpGaloisFieldAffineTransformInverseMaskedUint8x16
        OpGaloisFieldAffineTransformInverseMaskedUint8x32
        OpGaloisFieldAffineTransformInverseMaskedUint8x64
@@ -6194,18 +6170,30 @@ const (
        OpRotateAllRightUint64x2
        OpRotateAllRightUint64x4
        OpRotateAllRightUint64x8
-       OpRoundWithPrecisionFloat32x4
-       OpRoundWithPrecisionFloat32x8
-       OpRoundWithPrecisionFloat32x16
-       OpRoundWithPrecisionFloat64x2
-       OpRoundWithPrecisionFloat64x4
-       OpRoundWithPrecisionFloat64x8
-       OpRoundWithPrecisionMaskedFloat32x4
-       OpRoundWithPrecisionMaskedFloat32x8
-       OpRoundWithPrecisionMaskedFloat32x16
-       OpRoundWithPrecisionMaskedFloat64x2
-       OpRoundWithPrecisionMaskedFloat64x4
-       OpRoundWithPrecisionMaskedFloat64x8
+       OpRoundScaledFloat32x4
+       OpRoundScaledFloat32x8
+       OpRoundScaledFloat32x16
+       OpRoundScaledFloat64x2
+       OpRoundScaledFloat64x4
+       OpRoundScaledFloat64x8
+       OpRoundScaledMaskedFloat32x4
+       OpRoundScaledMaskedFloat32x8
+       OpRoundScaledMaskedFloat32x16
+       OpRoundScaledMaskedFloat64x2
+       OpRoundScaledMaskedFloat64x4
+       OpRoundScaledMaskedFloat64x8
+       OpRoundScaledResidueFloat32x4
+       OpRoundScaledResidueFloat32x8
+       OpRoundScaledResidueFloat32x16
+       OpRoundScaledResidueFloat64x2
+       OpRoundScaledResidueFloat64x4
+       OpRoundScaledResidueFloat64x8
+       OpRoundScaledResidueMaskedFloat32x4
+       OpRoundScaledResidueMaskedFloat32x8
+       OpRoundScaledResidueMaskedFloat32x16
+       OpRoundScaledResidueMaskedFloat64x2
+       OpRoundScaledResidueMaskedFloat64x4
+       OpRoundScaledResidueMaskedFloat64x8
        OpSet128Float32x8
        OpSet128Float64x4
        OpSet128Int8x32
@@ -6296,18 +6284,30 @@ const (
        OpShiftAllRightConcatUint64x2
        OpShiftAllRightConcatUint64x4
        OpShiftAllRightConcatUint64x8
-       OpTruncWithPrecisionFloat32x4
-       OpTruncWithPrecisionFloat32x8
-       OpTruncWithPrecisionFloat32x16
-       OpTruncWithPrecisionFloat64x2
-       OpTruncWithPrecisionFloat64x4
-       OpTruncWithPrecisionFloat64x8
-       OpTruncWithPrecisionMaskedFloat32x4
-       OpTruncWithPrecisionMaskedFloat32x8
-       OpTruncWithPrecisionMaskedFloat32x16
-       OpTruncWithPrecisionMaskedFloat64x2
-       OpTruncWithPrecisionMaskedFloat64x4
-       OpTruncWithPrecisionMaskedFloat64x8
+       OpTruncScaledFloat32x4
+       OpTruncScaledFloat32x8
+       OpTruncScaledFloat32x16
+       OpTruncScaledFloat64x2
+       OpTruncScaledFloat64x4
+       OpTruncScaledFloat64x8
+       OpTruncScaledMaskedFloat32x4
+       OpTruncScaledMaskedFloat32x8
+       OpTruncScaledMaskedFloat32x16
+       OpTruncScaledMaskedFloat64x2
+       OpTruncScaledMaskedFloat64x4
+       OpTruncScaledMaskedFloat64x8
+       OpTruncScaledResidueFloat32x4
+       OpTruncScaledResidueFloat32x8
+       OpTruncScaledResidueFloat32x16
+       OpTruncScaledResidueFloat64x2
+       OpTruncScaledResidueFloat64x4
+       OpTruncScaledResidueFloat64x8
+       OpTruncScaledResidueMaskedFloat32x4
+       OpTruncScaledResidueMaskedFloat32x8
+       OpTruncScaledResidueMaskedFloat32x16
+       OpTruncScaledResidueMaskedFloat64x2
+       OpTruncScaledResidueMaskedFloat64x4
+       OpTruncScaledResidueMaskedFloat64x8
 )
 
 var opcodeTable = [...]opInfo{
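(Aside: the const renames above follow a mechanical pattern: the WithPrecision suffix becomes Scaled, and a DiffWith... prefix becomes a ScaledResidue suffix, with the element-type/lane-count suffix untouched. A minimal sketch of that rewrite, inferred from the names in the diff rather than taken from the actual simdgen implementation:)

    package main

    import (
    	"fmt"
    	"strings"
    )

    // renameOp applies the two rewrites visible in the const block above:
    // "DiffWithXWithPrecision..." -> "XScaledResidue..." and
    // "XWithPrecision..." -> "XScaled...". Inferred from the diff, not
    // the generator's real code.
    func renameOp(name string) string {
    	if rest, ok := strings.CutPrefix(name, "DiffWith"); ok {
    		return strings.Replace(rest, "WithPrecision", "ScaledResidue", 1)
    	}
    	return strings.Replace(name, "WithPrecision", "Scaled", 1)
    }

    func main() {
    	fmt.Println(renameOp("RoundWithPrecisionFloat32x4"))         // RoundScaledFloat32x4
    	fmt.Println(renameOp("DiffWithTruncWithPrecisionFloat64x8")) // TruncScaledResidueFloat64x8
    }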
@@ -62123,6 +62123,220 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "AddPairsFloat32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsFloat32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsFloat64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsFloat64x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsInt16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsInt32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsInt32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsSaturatedInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsSaturatedInt16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsUint16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsUint16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsUint32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "AddPairsUint32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:        "AddSaturatedInt8x16",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedInt8x32",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedInt8x64",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedInt16x8",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedInt16x16",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedInt16x32",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedInt8x16",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedInt8x32",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedInt8x64",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedInt16x8",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedInt16x16",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedInt16x32",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedUint8x16",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedUint8x32",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedUint8x64",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedUint16x8",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedUint16x16",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedMaskedUint16x32",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedUint8x16",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedUint8x32",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedUint8x64",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedUint16x8",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedUint16x16",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "AddSaturatedUint16x32",
+               argLen:      2,
+               commutative: true,
+               generic:     true,
+       },
        {
                name:    "AddSubFloat32x4",
                argLen:  2,
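(Aside: the AddPairs entries above replace the PairwiseAdd ops removed further down; note they carry no commutative flag, since pairwise addition is not symmetric in its two vector operands. A scalar model of the 128-bit forms, assuming VHADD/VPHADD-style semantics rather than quoting the package's implementation:)

    package main

    import "fmt"

    // addPairs sums adjacent lanes: results from x fill the low half of
    // the output, results from y the high half. Models the 128-bit forms
    // only; the 256-bit instructions work per 128-bit lane.
    func addPairs(x, y []int32) []int32 {
    	out := make([]int32, 0, len(x))
    	for i := 0; i+1 < len(x); i += 2 {
    		out = append(out, x[i]+x[i+1])
    	}
    	for i := 0; i+1 < len(y); i += 2 {
    		out = append(out, y[i]+y[i+1])
    	}
    	return out
    }

    func main() {
    	x := []int32{1, 2, 3, 4}
    	y := []int32{10, 20, 30, 40}
    	fmt.Println(addPairs(x, y)) // [3 7 30 70]
    }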
@@ -65693,66 +65907,6 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
-       {
-               name:    "MulByPowOf2Float32x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2Float32x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2Float32x16",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2Float64x2",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2Float64x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2Float64x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2MaskedFloat32x4",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2MaskedFloat32x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2MaskedFloat32x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2MaskedFloat64x2",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2MaskedFloat64x4",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "MulByPowOf2MaskedFloat64x8",
-               argLen:  3,
-               generic: true,
-       },
        {
                name:        "MulEvenWidenInt32x4",
                argLen:      2,
@@ -65958,113 +66112,59 @@ var opcodeTable = [...]opInfo{
                generic:     true,
        },
        {
-               name:        "MulLowInt16x8",
+               name:        "MulInt16x8",
                argLen:      2,
                commutative: true,
                generic:     true,
        },
        {
-               name:        "MulLowInt16x16",
+               name:        "MulInt16x16",
                argLen:      2,
                commutative: true,
                generic:     true,
        },
        {
-               name:        "MulLowInt16x32",
+               name:        "MulInt16x32",
                argLen:      2,
                commutative: true,
                generic:     true,
        },
        {
-               name:        "MulLowInt32x4",
+               name:        "MulInt32x4",
                argLen:      2,
                commutative: true,
                generic:     true,
        },
        {
-               name:        "MulLowInt32x8",
+               name:        "MulInt32x8",
                argLen:      2,
                commutative: true,
                generic:     true,
        },
        {
-               name:        "MulLowInt32x16",
+               name:        "MulInt32x16",
                argLen:      2,
                commutative: true,
                generic:     true,
        },
        {
-               name:        "MulLowInt64x2",
+               name:        "MulInt64x2",
                argLen:      2,
                commutative: true,
                generic:     true,
        },
        {
-               name:        "MulLowInt64x4",
+               name:        "MulInt64x4",
                argLen:      2,
                commutative: true,
                generic:     true,
        },
        {
-               name:        "MulLowInt64x8",
+               name:        "MulInt64x8",
                argLen:      2,
                commutative: true,
                generic:     true,
        },
-       {
-               name:        "MulLowMaskedInt16x8",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "MulLowMaskedInt16x16",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "MulLowMaskedInt16x32",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "MulLowMaskedInt32x4",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "MulLowMaskedInt32x8",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "MulLowMaskedInt32x16",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "MulLowMaskedInt64x2",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "MulLowMaskedInt64x4",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "MulLowMaskedInt64x8",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
        {
                name:        "MulMaskedFloat32x4",
                argLen:      3,
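(Aside: the MulLow -> Mul rename above is sound because keeping the low half of the widened product is exactly wrapping multiplication at the element width, i.e. what plain Go * already does on fixed-width integers. A one-function demonstration:)

    package main

    import "fmt"

    // mulLow keeps the low 16 bits of the widened 32-bit product, which
    // is identical to wrapping 16-bit multiplication.
    func mulLow(a, b int16) int16 {
    	return int16(int32(a) * int32(b))
    }

    func main() {
    	a, b := int16(300), int16(300)
    	fmt.Println(mulLow(a, b), a*b) // both print 24464 (90000 mod 2^16)
    }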
@@ -66101,6 +66201,60 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:        "MulMaskedInt16x8",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "MulMaskedInt16x16",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "MulMaskedInt16x32",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "MulMaskedInt32x4",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "MulMaskedInt32x8",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "MulMaskedInt32x16",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "MulMaskedInt64x2",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "MulMaskedInt64x4",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
+       {
+               name:        "MulMaskedInt64x8",
+               argLen:      3,
+               commutative: true,
+               generic:     true,
+       },
        {
                name:        "NotEqualFloat32x4",
                argLen:      2,
@@ -66707,126 +66861,6 @@ var opcodeTable = [...]opInfo{
                argLen:  3,
                generic: true,
        },
-       {
-               name:    "PairwiseAddFloat32x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddFloat32x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddFloat64x2",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddFloat64x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddInt16x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddInt16x16",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddInt32x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddInt32x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddUint16x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddUint16x16",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddUint32x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseAddUint32x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubFloat32x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubFloat32x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubFloat64x2",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubFloat64x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubInt16x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubInt16x16",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubInt32x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubInt32x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubUint16x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubUint16x16",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubUint32x4",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "PairwiseSubUint32x8",
-               argLen:  2,
-               generic: true,
-       },
        {
                name:    "Permute2Float32x4",
                argLen:  3,
@@ -67898,349 +67932,125 @@ var opcodeTable = [...]opInfo{
                generic: true,
        },
        {
-               name:        "SaturatedAddInt8x16",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddInt8x32",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddInt8x64",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddInt16x8",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddInt16x16",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddInt16x32",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddMaskedInt8x16",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddMaskedInt8x32",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddMaskedInt8x64",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddMaskedInt16x8",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddMaskedInt16x16",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddMaskedInt16x32",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
-       },
-       {
-               name:        "SaturatedAddMaskedUint8x16",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedPairDotProdMaskedUint8x16",
+               argLen:  3,
+               generic: true,
        },
        {
-               name:        "SaturatedAddMaskedUint8x32",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedPairDotProdMaskedUint8x32",
+               argLen:  3,
+               generic: true,
        },
        {
-               name:        "SaturatedAddMaskedUint8x64",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedPairDotProdMaskedUint8x64",
+               argLen:  3,
+               generic: true,
        },
        {
-               name:        "SaturatedAddMaskedUint16x8",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedPairDotProdUint8x16",
+               argLen:  2,
+               generic: true,
        },
        {
-               name:        "SaturatedAddMaskedUint16x16",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedPairDotProdUint8x32",
+               argLen:  2,
+               generic: true,
        },
        {
-               name:        "SaturatedAddMaskedUint16x32",
-               argLen:      3,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedPairDotProdUint8x64",
+               argLen:  2,
+               generic: true,
        },
        {
-               name:        "SaturatedAddUint8x16",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4",
+               argLen:  3,
+               generic: true,
        },
        {
-               name:        "SaturatedAddUint8x32",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8",
+               argLen:  3,
+               generic: true,
        },
        {
-               name:        "SaturatedAddUint8x64",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16",
+               argLen:  3,
+               generic: true,
        },
        {
-               name:        "SaturatedAddUint16x8",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4",
+               argLen:  4,
+               generic: true,
        },
        {
-               name:        "SaturatedAddUint16x16",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8",
+               argLen:  4,
+               generic: true,
        },
        {
-               name:        "SaturatedAddUint16x32",
-               argLen:      2,
-               commutative: true,
-               generic:     true,
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16",
+               argLen:  4,
+               generic: true,
        },
        {
-               name:    "SaturatedPairwiseAddInt16x8",
+               name:    "ScaleFloat32x4",
                argLen:  2,
                generic: true,
        },
        {
-               name:    "SaturatedPairwiseAddInt16x16",
+               name:    "ScaleFloat32x8",
                argLen:  2,
                generic: true,
        },
        {
-               name:    "SaturatedPairwiseSubInt16x8",
+               name:    "ScaleFloat32x16",
                argLen:  2,
                generic: true,
        },
        {
-               name:    "SaturatedPairwiseSubInt16x16",
+               name:    "ScaleFloat64x2",
                argLen:  2,
                generic: true,
        },
        {
-               name:    "SaturatedSubInt8x16",
+               name:    "ScaleFloat64x4",
                argLen:  2,
                generic: true,
        },
        {
-               name:    "SaturatedSubInt8x32",
+               name:    "ScaleFloat64x8",
                argLen:  2,
                generic: true,
        },
        {
-               name:    "SaturatedSubInt8x64",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubInt16x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubInt16x16",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubInt16x32",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubMaskedInt8x16",
+               name:    "ScaleMaskedFloat32x4",
                argLen:  3,
                generic: true,
        },
        {
-               name:    "SaturatedSubMaskedInt8x32",
+               name:    "ScaleMaskedFloat32x8",
                argLen:  3,
                generic: true,
        },
        {
-               name:    "SaturatedSubMaskedInt8x64",
+               name:    "ScaleMaskedFloat32x16",
                argLen:  3,
                generic: true,
        },
        {
-               name:    "SaturatedSubMaskedInt16x8",
+               name:    "ScaleMaskedFloat64x2",
                argLen:  3,
                generic: true,
        },
        {
-               name:    "SaturatedSubMaskedInt16x16",
+               name:    "ScaleMaskedFloat64x4",
                argLen:  3,
                generic: true,
        },
        {
-               name:    "SaturatedSubMaskedInt16x32",
+               name:    "ScaleMaskedFloat64x8",
                argLen:  3,
                generic: true,
        },
-       {
-               name:    "SaturatedSubMaskedUint8x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubMaskedUint8x32",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubMaskedUint8x64",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubMaskedUint16x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubMaskedUint16x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubMaskedUint16x32",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubUint8x16",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubUint8x32",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubUint8x64",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubUint16x8",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubUint16x16",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedSubUint16x32",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedPairDotProdMaskedUint8x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedPairDotProdMaskedUint8x32",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedPairDotProdMaskedUint8x64",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedPairDotProdUint8x16",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedPairDotProdUint8x32",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedPairDotProdUint8x64",
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4",
-               argLen:  4,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8",
-               argLen:  4,
-               generic: true,
-       },
-       {
-               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16",
-               argLen:  4,
-               generic: true,
-       },
        {
                name:    "ShiftAllLeftInt16x8",
                argLen:  2,
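(Aside: the Scale entries above replace the MulByPowOf2 ops removed earlier in this hunk. Assuming they lower to the VSCALEF family, as the new name suggests, each element computes x * 2**floor(y). A scalar sketch under that assumption, ignoring NaN/Inf special cases:)

    package main

    import (
    	"fmt"
    	"math"
    )

    // scale models x * 2**floor(y), the usual VSCALEF semantics
    // (hypothetical scalar model; special values are not handled).
    func scale(x, y float64) float64 {
    	return math.Ldexp(x, int(math.Floor(y)))
    }

    func main() {
    	fmt.Println(scale(1.5, 3))  // 12: 1.5 * 2**3
    	fmt.Println(scale(8, -2.7)) // 1: 8 * 2**-3, since floor(-2.7) = -3
    }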
@@ -69651,6 +69461,196 @@ var opcodeTable = [...]opInfo{
                argLen:  3,
                generic: true,
        },
+       {
+               name:    "SubPairsFloat32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsFloat32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsFloat64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsFloat64x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsInt16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsInt32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsInt32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsSaturatedInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsSaturatedInt16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsUint16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsUint16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsUint32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubPairsUint32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedInt8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedInt8x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedInt8x64",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedInt16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedInt16x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedInt8x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedInt8x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedInt8x64",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedInt16x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedInt16x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedInt16x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedUint8x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedUint8x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedUint8x64",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedUint16x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedUint16x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedMaskedUint16x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedUint8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedUint8x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedUint8x64",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedUint16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedUint16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SubSaturatedUint16x32",
+               argLen:  2,
+               generic: true,
+       },
        {
                name:    "SubUint8x16",
                argLen:  2,
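(Aside: the SubSaturated entries above mirror the AddSaturated additions earlier in the table, but without the commutative flag, since subtraction is not symmetric. For the Uint forms, assuming PSUBUS-style semantics, the result clamps at zero rather than wrapping:)

    package main

    import "fmt"

    // subSaturated clamps unsigned subtraction at zero instead of
    // wrapping (scalar model of the Uint forms, assuming PSUBUS-style
    // saturation).
    func subSaturated(a, b uint8) uint8 {
    	if b > a {
    		return 0
    	}
    	return a - b
    }

    func main() {
    	fmt.Println(subSaturated(10, 3)) // 7
    	fmt.Println(subSaturated(3, 10)) // 0, clamped instead of wrapping to 249
    }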
@@ -69978,433 +69978,289 @@ var opcodeTable = [...]opInfo{
                generic:     true,
        },
        {
-               name:    "CeilWithPrecisionFloat32x4",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionFloat32x8",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionFloat32x16",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionFloat64x2",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionFloat64x4",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionFloat64x8",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionMaskedFloat32x4",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionMaskedFloat32x8",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionMaskedFloat32x16",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionMaskedFloat64x2",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionMaskedFloat64x4",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "CeilWithPrecisionMaskedFloat64x8",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionFloat32x4",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionFloat32x8",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionFloat32x16",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionFloat64x2",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionFloat64x4",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionFloat64x8",
-               auxType: auxInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionMaskedFloat32x4",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionMaskedFloat32x8",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionMaskedFloat32x16",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionMaskedFloat64x2",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionMaskedFloat64x4",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "DiffWithCeilWithPrecisionMaskedFloat64x8",
-               auxType: auxInt8,
-               argLen:  2,
-               generic: true,
-       },
-       {
-               name:    "DiffWithFloorWithPrecisionFloat32x4",
+               name:    "CeilScaledFloat32x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionFloat32x8",
+               name:    "CeilScaledFloat32x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionFloat32x16",
+               name:    "CeilScaledFloat32x16",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionFloat64x2",
+               name:    "CeilScaledFloat64x2",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionFloat64x4",
+               name:    "CeilScaledFloat64x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionFloat64x8",
+               name:    "CeilScaledFloat64x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionMaskedFloat32x4",
+               name:    "CeilScaledMaskedFloat32x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionMaskedFloat32x8",
+               name:    "CeilScaledMaskedFloat32x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionMaskedFloat32x16",
+               name:    "CeilScaledMaskedFloat32x16",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionMaskedFloat64x2",
+               name:    "CeilScaledMaskedFloat64x2",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionMaskedFloat64x4",
+               name:    "CeilScaledMaskedFloat64x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithFloorWithPrecisionMaskedFloat64x8",
+               name:    "CeilScaledMaskedFloat64x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionFloat32x4",
+               name:    "CeilScaledResidueFloat32x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionFloat32x8",
+               name:    "CeilScaledResidueFloat32x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionFloat32x16",
+               name:    "CeilScaledResidueFloat32x16",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionFloat64x2",
+               name:    "CeilScaledResidueFloat64x2",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionFloat64x4",
+               name:    "CeilScaledResidueFloat64x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionFloat64x8",
+               name:    "CeilScaledResidueFloat64x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionMaskedFloat32x4",
+               name:    "CeilScaledResidueMaskedFloat32x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionMaskedFloat32x8",
+               name:    "CeilScaledResidueMaskedFloat32x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionMaskedFloat32x16",
+               name:    "CeilScaledResidueMaskedFloat32x16",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionMaskedFloat64x2",
+               name:    "CeilScaledResidueMaskedFloat64x2",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionMaskedFloat64x4",
+               name:    "CeilScaledResidueMaskedFloat64x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithRoundWithPrecisionMaskedFloat64x8",
+               name:    "CeilScaledResidueMaskedFloat64x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionFloat32x4",
+               name:    "FloorScaledFloat32x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionFloat32x8",
+               name:    "FloorScaledFloat32x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionFloat32x16",
+               name:    "FloorScaledFloat32x16",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionFloat64x2",
+               name:    "FloorScaledFloat64x2",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionFloat64x4",
+               name:    "FloorScaledFloat64x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionFloat64x8",
+               name:    "FloorScaledFloat64x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionMaskedFloat32x4",
+               name:    "FloorScaledMaskedFloat32x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionMaskedFloat32x8",
+               name:    "FloorScaledMaskedFloat32x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionMaskedFloat32x16",
+               name:    "FloorScaledMaskedFloat32x16",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionMaskedFloat64x2",
+               name:    "FloorScaledMaskedFloat64x2",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionMaskedFloat64x4",
+               name:    "FloorScaledMaskedFloat64x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "DiffWithTruncWithPrecisionMaskedFloat64x8",
+               name:    "FloorScaledMaskedFloat64x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionFloat32x4",
+               name:    "FloorScaledResidueFloat32x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionFloat32x8",
+               name:    "FloorScaledResidueFloat32x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionFloat32x16",
+               name:    "FloorScaledResidueFloat32x16",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionFloat64x2",
+               name:    "FloorScaledResidueFloat64x2",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionFloat64x4",
+               name:    "FloorScaledResidueFloat64x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionFloat64x8",
+               name:    "FloorScaledResidueFloat64x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionMaskedFloat32x4",
+               name:    "FloorScaledResidueMaskedFloat32x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionMaskedFloat32x8",
+               name:    "FloorScaledResidueMaskedFloat32x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionMaskedFloat32x16",
+               name:    "FloorScaledResidueMaskedFloat32x16",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionMaskedFloat64x2",
+               name:    "FloorScaledResidueMaskedFloat64x2",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionMaskedFloat64x4",
+               name:    "FloorScaledResidueMaskedFloat64x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "FloorWithPrecisionMaskedFloat64x8",
+               name:    "FloorScaledResidueMaskedFloat64x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
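(Aside: each Scaled/ScaledResidue pair above shares an auxInt8 immediate. Assuming VRNDSCALE-style semantics, suggested by that immediate and the old WithPrecision names, CeilScaled rounds each element up to a multiple of 2**-prec and CeilScaledResidue is what the rounding discarded. A scalar sketch under that assumption:)

    package main

    import (
    	"fmt"
    	"math"
    )

    // ceilScaled rounds x up to a multiple of 2**-prec; ceilScaledResidue
    // returns x minus that rounded value. Hypothetical scalar model of
    // the vector ops, ignoring special values.
    func ceilScaled(x float64, prec uint8) float64 {
    	s := math.Ldexp(1, int(prec)) // 2**prec
    	return math.Ceil(x*s) / s
    }

    func ceilScaledResidue(x float64, prec uint8) float64 {
    	return x - ceilScaled(x, prec)
    }

    func main() {
    	fmt.Println(ceilScaled(1.3, 2))        // 1.5, the next multiple of 0.25
    	fmt.Println(ceilScaledResidue(1.3, 2)) // ≈ -0.2
    }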
@@ -70878,73 +70734,145 @@ var opcodeTable = [...]opInfo{
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionFloat32x4",
+               name:    "RoundScaledFloat32x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionFloat32x8",
+               name:    "RoundScaledFloat32x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionFloat32x16",
+               name:    "RoundScaledFloat32x16",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionFloat64x2",
+               name:    "RoundScaledFloat64x2",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionFloat64x4",
+               name:    "RoundScaledFloat64x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionFloat64x8",
+               name:    "RoundScaledFloat64x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionMaskedFloat32x4",
+               name:    "RoundScaledMaskedFloat32x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionMaskedFloat32x8",
+               name:    "RoundScaledMaskedFloat32x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionMaskedFloat32x16",
+               name:    "RoundScaledMaskedFloat32x16",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionMaskedFloat64x2",
+               name:    "RoundScaledMaskedFloat64x2",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionMaskedFloat64x4",
+               name:    "RoundScaledMaskedFloat64x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "RoundWithPrecisionMaskedFloat64x8",
+               name:    "RoundScaledMaskedFloat64x8",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueFloat32x4",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueFloat32x8",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueFloat32x16",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueFloat64x2",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueFloat64x4",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueFloat64x8",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueMaskedFloat32x4",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueMaskedFloat32x8",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueMaskedFloat32x16",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueMaskedFloat64x2",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueMaskedFloat64x4",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "RoundScaledResidueMaskedFloat64x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
@@ -71490,73 +71418,145 @@ var opcodeTable = [...]opInfo{
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionFloat32x4",
+               name:    "TruncScaledFloat32x4",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledFloat32x8",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledFloat32x16",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledFloat64x2",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledFloat64x4",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledFloat64x8",
+               auxType: auxInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledMaskedFloat32x4",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledMaskedFloat32x8",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledMaskedFloat32x16",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledMaskedFloat64x2",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledMaskedFloat64x4",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledMaskedFloat64x8",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "TruncScaledResidueFloat32x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionFloat32x8",
+               name:    "TruncScaledResidueFloat32x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionFloat32x16",
+               name:    "TruncScaledResidueFloat32x16",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionFloat64x2",
+               name:    "TruncScaledResidueFloat64x2",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionFloat64x4",
+               name:    "TruncScaledResidueFloat64x4",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionFloat64x8",
+               name:    "TruncScaledResidueFloat64x8",
                auxType: auxInt8,
                argLen:  1,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionMaskedFloat32x4",
+               name:    "TruncScaledResidueMaskedFloat32x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionMaskedFloat32x8",
+               name:    "TruncScaledResidueMaskedFloat32x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionMaskedFloat32x16",
+               name:    "TruncScaledResidueMaskedFloat32x16",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionMaskedFloat64x2",
+               name:    "TruncScaledResidueMaskedFloat64x2",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionMaskedFloat64x4",
+               name:    "TruncScaledResidueMaskedFloat64x4",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
        },
        {
-               name:    "TruncWithPrecisionMaskedFloat64x8",
+               name:    "TruncScaledResidueMaskedFloat64x8",
                auxType: auxInt8,
                argLen:  2,
                generic: true,
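
Each opcodeTable entry above is a machine-independent SSA op descriptor: auxType auxInt8 carries the 8-bit precision immediate of the Scaled/ScaledResidue ops, argLen counts SSA arguments (1 for the plain unary form, 2 for the Masked form, whose extra argument is the mask vector), and generic: true marks the op for lowering by the AMD64 rewrite rules below. A minimal annotated sketch of one such entry:

	{
		name:    "TruncScaledFloat32x4", // renamed from TruncWithPrecisionFloat32x4
		auxType: auxInt8,                // 8-bit precision immediate
		argLen:  1,                      // one vector arg; Masked variants take a mask as a second arg
		generic: true,                   // lowered to AMD64 ops by rewriteAMD64.go
	},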
index 82f13b43c6ee821f7dbea080a9c943ede3453bf2..a3a7ba7ed65aac36f313012be3400a27d00bed2d 100644 (file)
@@ -760,9 +760,111 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAddMaskedUint8x32(v)
        case OpAddMaskedUint8x64:
                return rewriteValueAMD64_OpAddMaskedUint8x64(v)
+       case OpAddPairsFloat32x4:
+               v.Op = OpAMD64VHADDPS128
+               return true
+       case OpAddPairsFloat32x8:
+               v.Op = OpAMD64VHADDPS256
+               return true
+       case OpAddPairsFloat64x2:
+               v.Op = OpAMD64VHADDPD128
+               return true
+       case OpAddPairsFloat64x4:
+               v.Op = OpAMD64VHADDPD256
+               return true
+       case OpAddPairsInt16x16:
+               v.Op = OpAMD64VPHADDW256
+               return true
+       case OpAddPairsInt16x8:
+               v.Op = OpAMD64VPHADDW128
+               return true
+       case OpAddPairsInt32x4:
+               v.Op = OpAMD64VPHADDD128
+               return true
+       case OpAddPairsInt32x8:
+               v.Op = OpAMD64VPHADDD256
+               return true
+       case OpAddPairsSaturatedInt16x16:
+               v.Op = OpAMD64VPHADDSW256
+               return true
+       case OpAddPairsSaturatedInt16x8:
+               v.Op = OpAMD64VPHADDSW128
+               return true
+       case OpAddPairsUint16x16:
+               v.Op = OpAMD64VPHADDW256
+               return true
+       case OpAddPairsUint16x8:
+               v.Op = OpAMD64VPHADDW128
+               return true
+       case OpAddPairsUint32x4:
+               v.Op = OpAMD64VPHADDD128
+               return true
+       case OpAddPairsUint32x8:
+               v.Op = OpAMD64VPHADDD256
+               return true
        case OpAddPtr:
                v.Op = OpAMD64ADDQ
                return true
+       case OpAddSaturatedInt16x16:
+               v.Op = OpAMD64VPADDSW256
+               return true
+       case OpAddSaturatedInt16x32:
+               v.Op = OpAMD64VPADDSW512
+               return true
+       case OpAddSaturatedInt16x8:
+               v.Op = OpAMD64VPADDSW128
+               return true
+       case OpAddSaturatedInt8x16:
+               v.Op = OpAMD64VPADDSB128
+               return true
+       case OpAddSaturatedInt8x32:
+               v.Op = OpAMD64VPADDSB256
+               return true
+       case OpAddSaturatedInt8x64:
+               v.Op = OpAMD64VPADDSB512
+               return true
+       case OpAddSaturatedMaskedInt16x16:
+               return rewriteValueAMD64_OpAddSaturatedMaskedInt16x16(v)
+       case OpAddSaturatedMaskedInt16x32:
+               return rewriteValueAMD64_OpAddSaturatedMaskedInt16x32(v)
+       case OpAddSaturatedMaskedInt16x8:
+               return rewriteValueAMD64_OpAddSaturatedMaskedInt16x8(v)
+       case OpAddSaturatedMaskedInt8x16:
+               return rewriteValueAMD64_OpAddSaturatedMaskedInt8x16(v)
+       case OpAddSaturatedMaskedInt8x32:
+               return rewriteValueAMD64_OpAddSaturatedMaskedInt8x32(v)
+       case OpAddSaturatedMaskedInt8x64:
+               return rewriteValueAMD64_OpAddSaturatedMaskedInt8x64(v)
+       case OpAddSaturatedMaskedUint16x16:
+               return rewriteValueAMD64_OpAddSaturatedMaskedUint16x16(v)
+       case OpAddSaturatedMaskedUint16x32:
+               return rewriteValueAMD64_OpAddSaturatedMaskedUint16x32(v)
+       case OpAddSaturatedMaskedUint16x8:
+               return rewriteValueAMD64_OpAddSaturatedMaskedUint16x8(v)
+       case OpAddSaturatedMaskedUint8x16:
+               return rewriteValueAMD64_OpAddSaturatedMaskedUint8x16(v)
+       case OpAddSaturatedMaskedUint8x32:
+               return rewriteValueAMD64_OpAddSaturatedMaskedUint8x32(v)
+       case OpAddSaturatedMaskedUint8x64:
+               return rewriteValueAMD64_OpAddSaturatedMaskedUint8x64(v)
+       case OpAddSaturatedUint16x16:
+               v.Op = OpAMD64VPADDSW256
+               return true
+       case OpAddSaturatedUint16x32:
+               v.Op = OpAMD64VPADDSW512
+               return true
+       case OpAddSaturatedUint16x8:
+               v.Op = OpAMD64VPADDSW128
+               return true
+       case OpAddSaturatedUint8x16:
+               v.Op = OpAMD64VPADDSB128
+               return true
+       case OpAddSaturatedUint8x32:
+               v.Op = OpAMD64VPADDSB256
+               return true
+       case OpAddSaturatedUint8x64:
+               v.Op = OpAMD64VPADDSB512
+               return true
        case OpAddSubFloat32x4:
                v.Op = OpAMD64VADDSUBPS128
                return true
@@ -1185,30 +1287,54 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpCeilFloat64x2(v)
        case OpCeilFloat64x4:
                return rewriteValueAMD64_OpCeilFloat64x4(v)
-       case OpCeilWithPrecisionFloat32x16:
-               return rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v)
-       case OpCeilWithPrecisionFloat32x4:
-               return rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v)
-       case OpCeilWithPrecisionFloat32x8:
-               return rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v)
-       case OpCeilWithPrecisionFloat64x2:
-               return rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v)
-       case OpCeilWithPrecisionFloat64x4:
-               return rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v)
-       case OpCeilWithPrecisionFloat64x8:
-               return rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v)
-       case OpCeilWithPrecisionMaskedFloat32x16:
-               return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x16(v)
-       case OpCeilWithPrecisionMaskedFloat32x4:
-               return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x4(v)
-       case OpCeilWithPrecisionMaskedFloat32x8:
-               return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x8(v)
-       case OpCeilWithPrecisionMaskedFloat64x2:
-               return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x2(v)
-       case OpCeilWithPrecisionMaskedFloat64x4:
-               return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x4(v)
-       case OpCeilWithPrecisionMaskedFloat64x8:
-               return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x8(v)
+       case OpCeilScaledFloat32x16:
+               return rewriteValueAMD64_OpCeilScaledFloat32x16(v)
+       case OpCeilScaledFloat32x4:
+               return rewriteValueAMD64_OpCeilScaledFloat32x4(v)
+       case OpCeilScaledFloat32x8:
+               return rewriteValueAMD64_OpCeilScaledFloat32x8(v)
+       case OpCeilScaledFloat64x2:
+               return rewriteValueAMD64_OpCeilScaledFloat64x2(v)
+       case OpCeilScaledFloat64x4:
+               return rewriteValueAMD64_OpCeilScaledFloat64x4(v)
+       case OpCeilScaledFloat64x8:
+               return rewriteValueAMD64_OpCeilScaledFloat64x8(v)
+       case OpCeilScaledMaskedFloat32x16:
+               return rewriteValueAMD64_OpCeilScaledMaskedFloat32x16(v)
+       case OpCeilScaledMaskedFloat32x4:
+               return rewriteValueAMD64_OpCeilScaledMaskedFloat32x4(v)
+       case OpCeilScaledMaskedFloat32x8:
+               return rewriteValueAMD64_OpCeilScaledMaskedFloat32x8(v)
+       case OpCeilScaledMaskedFloat64x2:
+               return rewriteValueAMD64_OpCeilScaledMaskedFloat64x2(v)
+       case OpCeilScaledMaskedFloat64x4:
+               return rewriteValueAMD64_OpCeilScaledMaskedFloat64x4(v)
+       case OpCeilScaledMaskedFloat64x8:
+               return rewriteValueAMD64_OpCeilScaledMaskedFloat64x8(v)
+       case OpCeilScaledResidueFloat32x16:
+               return rewriteValueAMD64_OpCeilScaledResidueFloat32x16(v)
+       case OpCeilScaledResidueFloat32x4:
+               return rewriteValueAMD64_OpCeilScaledResidueFloat32x4(v)
+       case OpCeilScaledResidueFloat32x8:
+               return rewriteValueAMD64_OpCeilScaledResidueFloat32x8(v)
+       case OpCeilScaledResidueFloat64x2:
+               return rewriteValueAMD64_OpCeilScaledResidueFloat64x2(v)
+       case OpCeilScaledResidueFloat64x4:
+               return rewriteValueAMD64_OpCeilScaledResidueFloat64x4(v)
+       case OpCeilScaledResidueFloat64x8:
+               return rewriteValueAMD64_OpCeilScaledResidueFloat64x8(v)
+       case OpCeilScaledResidueMaskedFloat32x16:
+               return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x16(v)
+       case OpCeilScaledResidueMaskedFloat32x4:
+               return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x4(v)
+       case OpCeilScaledResidueMaskedFloat32x8:
+               return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x8(v)
+       case OpCeilScaledResidueMaskedFloat64x2:
+               return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x2(v)
+       case OpCeilScaledResidueMaskedFloat64x4:
+               return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x4(v)
+       case OpCeilScaledResidueMaskedFloat64x8:
+               return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x8(v)
        case OpClosureCall:
                v.Op = OpAMD64CALLclosure
                return true
@@ -1409,102 +1535,6 @@ func rewriteValueAMD64(v *Value) bool {
        case OpCvtBoolToUint8:
                v.Op = OpCopy
                return true
-       case OpDiffWithCeilWithPrecisionFloat32x16:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x16(v)
-       case OpDiffWithCeilWithPrecisionFloat32x4:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x4(v)
-       case OpDiffWithCeilWithPrecisionFloat32x8:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x8(v)
-       case OpDiffWithCeilWithPrecisionFloat64x2:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x2(v)
-       case OpDiffWithCeilWithPrecisionFloat64x4:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x4(v)
-       case OpDiffWithCeilWithPrecisionFloat64x8:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x8(v)
-       case OpDiffWithCeilWithPrecisionMaskedFloat32x16:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x16(v)
-       case OpDiffWithCeilWithPrecisionMaskedFloat32x4:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x4(v)
-       case OpDiffWithCeilWithPrecisionMaskedFloat32x8:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x8(v)
-       case OpDiffWithCeilWithPrecisionMaskedFloat64x2:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x2(v)
-       case OpDiffWithCeilWithPrecisionMaskedFloat64x4:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x4(v)
-       case OpDiffWithCeilWithPrecisionMaskedFloat64x8:
-               return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x8(v)
-       case OpDiffWithFloorWithPrecisionFloat32x16:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x16(v)
-       case OpDiffWithFloorWithPrecisionFloat32x4:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x4(v)
-       case OpDiffWithFloorWithPrecisionFloat32x8:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x8(v)
-       case OpDiffWithFloorWithPrecisionFloat64x2:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x2(v)
-       case OpDiffWithFloorWithPrecisionFloat64x4:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x4(v)
-       case OpDiffWithFloorWithPrecisionFloat64x8:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x8(v)
-       case OpDiffWithFloorWithPrecisionMaskedFloat32x16:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x16(v)
-       case OpDiffWithFloorWithPrecisionMaskedFloat32x4:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x4(v)
-       case OpDiffWithFloorWithPrecisionMaskedFloat32x8:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x8(v)
-       case OpDiffWithFloorWithPrecisionMaskedFloat64x2:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x2(v)
-       case OpDiffWithFloorWithPrecisionMaskedFloat64x4:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x4(v)
-       case OpDiffWithFloorWithPrecisionMaskedFloat64x8:
-               return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x8(v)
-       case OpDiffWithRoundWithPrecisionFloat32x16:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x16(v)
-       case OpDiffWithRoundWithPrecisionFloat32x4:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x4(v)
-       case OpDiffWithRoundWithPrecisionFloat32x8:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x8(v)
-       case OpDiffWithRoundWithPrecisionFloat64x2:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x2(v)
-       case OpDiffWithRoundWithPrecisionFloat64x4:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x4(v)
-       case OpDiffWithRoundWithPrecisionFloat64x8:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x8(v)
-       case OpDiffWithRoundWithPrecisionMaskedFloat32x16:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x16(v)
-       case OpDiffWithRoundWithPrecisionMaskedFloat32x4:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x4(v)
-       case OpDiffWithRoundWithPrecisionMaskedFloat32x8:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x8(v)
-       case OpDiffWithRoundWithPrecisionMaskedFloat64x2:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x2(v)
-       case OpDiffWithRoundWithPrecisionMaskedFloat64x4:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x4(v)
-       case OpDiffWithRoundWithPrecisionMaskedFloat64x8:
-               return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x8(v)
-       case OpDiffWithTruncWithPrecisionFloat32x16:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x16(v)
-       case OpDiffWithTruncWithPrecisionFloat32x4:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x4(v)
-       case OpDiffWithTruncWithPrecisionFloat32x8:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x8(v)
-       case OpDiffWithTruncWithPrecisionFloat64x2:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x2(v)
-       case OpDiffWithTruncWithPrecisionFloat64x4:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x4(v)
-       case OpDiffWithTruncWithPrecisionFloat64x8:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x8(v)
-       case OpDiffWithTruncWithPrecisionMaskedFloat32x16:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x16(v)
-       case OpDiffWithTruncWithPrecisionMaskedFloat32x4:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x4(v)
-       case OpDiffWithTruncWithPrecisionMaskedFloat32x8:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x8(v)
-       case OpDiffWithTruncWithPrecisionMaskedFloat64x2:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x2(v)
-       case OpDiffWithTruncWithPrecisionMaskedFloat64x4:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x4(v)
-       case OpDiffWithTruncWithPrecisionMaskedFloat64x8:
-               return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x8(v)
        case OpDiv128u:
                v.Op = OpAMD64DIVQU2
                return true
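
The deleted DiffWith*WithPrecision cases reappear above as the *ScaledResidue* ops. As a scalar model of the intended semantics (an assumption, not stated in this CL): with precision p, the Scaled op rounds x keeping p fraction bits, and the ScaledResidue op returns what that rounding removed, so x == truncScaled(x, p) + truncScaledResidue(x, p). A hedged Go sketch:

	package main

	import (
		"fmt"
		"math"
	)

	// truncScaled models TruncScaled: truncate x, keeping p fraction bits.
	func truncScaled(x float64, p uint) float64 {
		s := math.Ldexp(1, int(p)) // 2^p
		return math.Trunc(x*s) / s
	}

	func main() {
		x, p := 3.14159, uint(3)
		r := truncScaled(x, p)
		fmt.Println(r, x-r) // 3.125 and the residue (~0.0166)
	}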
@@ -1730,30 +1760,54 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpFloorFloat64x2(v)
        case OpFloorFloat64x4:
                return rewriteValueAMD64_OpFloorFloat64x4(v)
-       case OpFloorWithPrecisionFloat32x16:
-               return rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v)
-       case OpFloorWithPrecisionFloat32x4:
-               return rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v)
-       case OpFloorWithPrecisionFloat32x8:
-               return rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v)
-       case OpFloorWithPrecisionFloat64x2:
-               return rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v)
-       case OpFloorWithPrecisionFloat64x4:
-               return rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v)
-       case OpFloorWithPrecisionFloat64x8:
-               return rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v)
-       case OpFloorWithPrecisionMaskedFloat32x16:
-               return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x16(v)
-       case OpFloorWithPrecisionMaskedFloat32x4:
-               return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x4(v)
-       case OpFloorWithPrecisionMaskedFloat32x8:
-               return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x8(v)
-       case OpFloorWithPrecisionMaskedFloat64x2:
-               return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x2(v)
-       case OpFloorWithPrecisionMaskedFloat64x4:
-               return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x4(v)
-       case OpFloorWithPrecisionMaskedFloat64x8:
-               return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x8(v)
+       case OpFloorScaledFloat32x16:
+               return rewriteValueAMD64_OpFloorScaledFloat32x16(v)
+       case OpFloorScaledFloat32x4:
+               return rewriteValueAMD64_OpFloorScaledFloat32x4(v)
+       case OpFloorScaledFloat32x8:
+               return rewriteValueAMD64_OpFloorScaledFloat32x8(v)
+       case OpFloorScaledFloat64x2:
+               return rewriteValueAMD64_OpFloorScaledFloat64x2(v)
+       case OpFloorScaledFloat64x4:
+               return rewriteValueAMD64_OpFloorScaledFloat64x4(v)
+       case OpFloorScaledFloat64x8:
+               return rewriteValueAMD64_OpFloorScaledFloat64x8(v)
+       case OpFloorScaledMaskedFloat32x16:
+               return rewriteValueAMD64_OpFloorScaledMaskedFloat32x16(v)
+       case OpFloorScaledMaskedFloat32x4:
+               return rewriteValueAMD64_OpFloorScaledMaskedFloat32x4(v)
+       case OpFloorScaledMaskedFloat32x8:
+               return rewriteValueAMD64_OpFloorScaledMaskedFloat32x8(v)
+       case OpFloorScaledMaskedFloat64x2:
+               return rewriteValueAMD64_OpFloorScaledMaskedFloat64x2(v)
+       case OpFloorScaledMaskedFloat64x4:
+               return rewriteValueAMD64_OpFloorScaledMaskedFloat64x4(v)
+       case OpFloorScaledMaskedFloat64x8:
+               return rewriteValueAMD64_OpFloorScaledMaskedFloat64x8(v)
+       case OpFloorScaledResidueFloat32x16:
+               return rewriteValueAMD64_OpFloorScaledResidueFloat32x16(v)
+       case OpFloorScaledResidueFloat32x4:
+               return rewriteValueAMD64_OpFloorScaledResidueFloat32x4(v)
+       case OpFloorScaledResidueFloat32x8:
+               return rewriteValueAMD64_OpFloorScaledResidueFloat32x8(v)
+       case OpFloorScaledResidueFloat64x2:
+               return rewriteValueAMD64_OpFloorScaledResidueFloat64x2(v)
+       case OpFloorScaledResidueFloat64x4:
+               return rewriteValueAMD64_OpFloorScaledResidueFloat64x4(v)
+       case OpFloorScaledResidueFloat64x8:
+               return rewriteValueAMD64_OpFloorScaledResidueFloat64x8(v)
+       case OpFloorScaledResidueMaskedFloat32x16:
+               return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x16(v)
+       case OpFloorScaledResidueMaskedFloat32x4:
+               return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x4(v)
+       case OpFloorScaledResidueMaskedFloat32x8:
+               return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x8(v)
+       case OpFloorScaledResidueMaskedFloat64x2:
+               return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x2(v)
+       case OpFloorScaledResidueMaskedFloat64x4:
+               return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x4(v)
+       case OpFloorScaledResidueMaskedFloat64x8:
+               return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x8(v)
        case OpFusedMultiplyAddFloat32x16:
                v.Op = OpAMD64VFMADD213PS512
                return true
@@ -2944,36 +2998,6 @@ func rewriteValueAMD64(v *Value) bool {
        case OpMul8:
                v.Op = OpAMD64MULL
                return true
-       case OpMulByPowOf2Float32x16:
-               v.Op = OpAMD64VSCALEFPS512
-               return true
-       case OpMulByPowOf2Float32x4:
-               v.Op = OpAMD64VSCALEFPS128
-               return true
-       case OpMulByPowOf2Float32x8:
-               v.Op = OpAMD64VSCALEFPS256
-               return true
-       case OpMulByPowOf2Float64x2:
-               v.Op = OpAMD64VSCALEFPD128
-               return true
-       case OpMulByPowOf2Float64x4:
-               v.Op = OpAMD64VSCALEFPD256
-               return true
-       case OpMulByPowOf2Float64x8:
-               v.Op = OpAMD64VSCALEFPD512
-               return true
-       case OpMulByPowOf2MaskedFloat32x16:
-               return rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x16(v)
-       case OpMulByPowOf2MaskedFloat32x4:
-               return rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x4(v)
-       case OpMulByPowOf2MaskedFloat32x8:
-               return rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x8(v)
-       case OpMulByPowOf2MaskedFloat64x2:
-               return rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x2(v)
-       case OpMulByPowOf2MaskedFloat64x4:
-               return rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x4(v)
-       case OpMulByPowOf2MaskedFloat64x8:
-               return rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x8(v)
        case OpMulEvenWidenInt32x4:
                v.Op = OpAMD64VPMULDQ128
                return true
@@ -3064,51 +3088,33 @@ func rewriteValueAMD64(v *Value) bool {
        case OpMulHighUint16x8:
                v.Op = OpAMD64VPMULHUW128
                return true
-       case OpMulLowInt16x16:
+       case OpMulInt16x16:
                v.Op = OpAMD64VPMULLW256
                return true
-       case OpMulLowInt16x32:
+       case OpMulInt16x32:
                v.Op = OpAMD64VPMULLW512
                return true
-       case OpMulLowInt16x8:
+       case OpMulInt16x8:
                v.Op = OpAMD64VPMULLW128
                return true
-       case OpMulLowInt32x16:
+       case OpMulInt32x16:
                v.Op = OpAMD64VPMULLD512
                return true
-       case OpMulLowInt32x4:
+       case OpMulInt32x4:
                v.Op = OpAMD64VPMULLD128
                return true
-       case OpMulLowInt32x8:
+       case OpMulInt32x8:
                v.Op = OpAMD64VPMULLD256
                return true
-       case OpMulLowInt64x2:
+       case OpMulInt64x2:
                v.Op = OpAMD64VPMULLQ128
                return true
-       case OpMulLowInt64x4:
+       case OpMulInt64x4:
                v.Op = OpAMD64VPMULLQ256
                return true
-       case OpMulLowInt64x8:
+       case OpMulInt64x8:
                v.Op = OpAMD64VPMULLQ512
                return true
-       case OpMulLowMaskedInt16x16:
-               return rewriteValueAMD64_OpMulLowMaskedInt16x16(v)
-       case OpMulLowMaskedInt16x32:
-               return rewriteValueAMD64_OpMulLowMaskedInt16x32(v)
-       case OpMulLowMaskedInt16x8:
-               return rewriteValueAMD64_OpMulLowMaskedInt16x8(v)
-       case OpMulLowMaskedInt32x16:
-               return rewriteValueAMD64_OpMulLowMaskedInt32x16(v)
-       case OpMulLowMaskedInt32x4:
-               return rewriteValueAMD64_OpMulLowMaskedInt32x4(v)
-       case OpMulLowMaskedInt32x8:
-               return rewriteValueAMD64_OpMulLowMaskedInt32x8(v)
-       case OpMulLowMaskedInt64x2:
-               return rewriteValueAMD64_OpMulLowMaskedInt64x2(v)
-       case OpMulLowMaskedInt64x4:
-               return rewriteValueAMD64_OpMulLowMaskedInt64x4(v)
-       case OpMulLowMaskedInt64x8:
-               return rewriteValueAMD64_OpMulLowMaskedInt64x8(v)
        case OpMulMaskedFloat32x16:
                return rewriteValueAMD64_OpMulMaskedFloat32x16(v)
        case OpMulMaskedFloat32x4:
@@ -3121,6 +3127,24 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpMulMaskedFloat64x4(v)
        case OpMulMaskedFloat64x8:
                return rewriteValueAMD64_OpMulMaskedFloat64x8(v)
+       case OpMulMaskedInt16x16:
+               return rewriteValueAMD64_OpMulMaskedInt16x16(v)
+       case OpMulMaskedInt16x32:
+               return rewriteValueAMD64_OpMulMaskedInt16x32(v)
+       case OpMulMaskedInt16x8:
+               return rewriteValueAMD64_OpMulMaskedInt16x8(v)
+       case OpMulMaskedInt32x16:
+               return rewriteValueAMD64_OpMulMaskedInt32x16(v)
+       case OpMulMaskedInt32x4:
+               return rewriteValueAMD64_OpMulMaskedInt32x4(v)
+       case OpMulMaskedInt32x8:
+               return rewriteValueAMD64_OpMulMaskedInt32x8(v)
+       case OpMulMaskedInt64x2:
+               return rewriteValueAMD64_OpMulMaskedInt64x2(v)
+       case OpMulMaskedInt64x4:
+               return rewriteValueAMD64_OpMulMaskedInt64x4(v)
+       case OpMulMaskedInt64x8:
+               return rewriteValueAMD64_OpMulMaskedInt64x8(v)
        case OpNeg16:
                v.Op = OpAMD64NEGL
                return true
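
The MulLow* cases above become plain Mul*, still lowering to VPMULL{W,D,Q}, which keep only the low half of each lane's widened product. A scalar illustration of the wrap-around this implies for 16-bit lanes:

	package main

	import "fmt"

	func main() {
		// Mul (formerly MulLow) keeps the low 16 bits of the product, as VPMULLW does.
		a, b := int16(300), int16(300)
		full := int32(a) * int32(b) // 90000
		fmt.Println(int16(full))    // 24464, i.e. 90000 mod 2^16
	}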
@@ -3406,78 +3430,6 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpPairDotProdMaskedInt16x32(v)
        case OpPairDotProdMaskedInt16x8:
                return rewriteValueAMD64_OpPairDotProdMaskedInt16x8(v)
-       case OpPairwiseAddFloat32x4:
-               v.Op = OpAMD64VHADDPS128
-               return true
-       case OpPairwiseAddFloat32x8:
-               v.Op = OpAMD64VHADDPS256
-               return true
-       case OpPairwiseAddFloat64x2:
-               v.Op = OpAMD64VHADDPD128
-               return true
-       case OpPairwiseAddFloat64x4:
-               v.Op = OpAMD64VHADDPD256
-               return true
-       case OpPairwiseAddInt16x16:
-               v.Op = OpAMD64VPHADDW256
-               return true
-       case OpPairwiseAddInt16x8:
-               v.Op = OpAMD64VPHADDW128
-               return true
-       case OpPairwiseAddInt32x4:
-               v.Op = OpAMD64VPHADDD128
-               return true
-       case OpPairwiseAddInt32x8:
-               v.Op = OpAMD64VPHADDD256
-               return true
-       case OpPairwiseAddUint16x16:
-               v.Op = OpAMD64VPHADDW256
-               return true
-       case OpPairwiseAddUint16x8:
-               v.Op = OpAMD64VPHADDW128
-               return true
-       case OpPairwiseAddUint32x4:
-               v.Op = OpAMD64VPHADDD128
-               return true
-       case OpPairwiseAddUint32x8:
-               v.Op = OpAMD64VPHADDD256
-               return true
-       case OpPairwiseSubFloat32x4:
-               v.Op = OpAMD64VHSUBPS128
-               return true
-       case OpPairwiseSubFloat32x8:
-               v.Op = OpAMD64VHSUBPS256
-               return true
-       case OpPairwiseSubFloat64x2:
-               v.Op = OpAMD64VHSUBPD128
-               return true
-       case OpPairwiseSubFloat64x4:
-               v.Op = OpAMD64VHSUBPD256
-               return true
-       case OpPairwiseSubInt16x16:
-               v.Op = OpAMD64VPHSUBW256
-               return true
-       case OpPairwiseSubInt16x8:
-               v.Op = OpAMD64VPHSUBW128
-               return true
-       case OpPairwiseSubInt32x4:
-               v.Op = OpAMD64VPHSUBD128
-               return true
-       case OpPairwiseSubInt32x8:
-               v.Op = OpAMD64VPHSUBD256
-               return true
-       case OpPairwiseSubUint16x16:
-               v.Op = OpAMD64VPHSUBW256
-               return true
-       case OpPairwiseSubUint16x8:
-               v.Op = OpAMD64VPHSUBW128
-               return true
-       case OpPairwiseSubUint32x4:
-               v.Op = OpAMD64VPHSUBD128
-               return true
-       case OpPairwiseSubUint32x8:
-               v.Op = OpAMD64VPHSUBD256
-               return true
        case OpPanicBounds:
                return rewriteValueAMD64_OpPanicBounds(v)
        case OpPermute2Float32x16:
@@ -4152,32 +4104,56 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpRoundFloat64x2(v)
        case OpRoundFloat64x4:
                return rewriteValueAMD64_OpRoundFloat64x4(v)
+       case OpRoundScaledFloat32x16:
+               return rewriteValueAMD64_OpRoundScaledFloat32x16(v)
+       case OpRoundScaledFloat32x4:
+               return rewriteValueAMD64_OpRoundScaledFloat32x4(v)
+       case OpRoundScaledFloat32x8:
+               return rewriteValueAMD64_OpRoundScaledFloat32x8(v)
+       case OpRoundScaledFloat64x2:
+               return rewriteValueAMD64_OpRoundScaledFloat64x2(v)
+       case OpRoundScaledFloat64x4:
+               return rewriteValueAMD64_OpRoundScaledFloat64x4(v)
+       case OpRoundScaledFloat64x8:
+               return rewriteValueAMD64_OpRoundScaledFloat64x8(v)
+       case OpRoundScaledMaskedFloat32x16:
+               return rewriteValueAMD64_OpRoundScaledMaskedFloat32x16(v)
+       case OpRoundScaledMaskedFloat32x4:
+               return rewriteValueAMD64_OpRoundScaledMaskedFloat32x4(v)
+       case OpRoundScaledMaskedFloat32x8:
+               return rewriteValueAMD64_OpRoundScaledMaskedFloat32x8(v)
+       case OpRoundScaledMaskedFloat64x2:
+               return rewriteValueAMD64_OpRoundScaledMaskedFloat64x2(v)
+       case OpRoundScaledMaskedFloat64x4:
+               return rewriteValueAMD64_OpRoundScaledMaskedFloat64x4(v)
+       case OpRoundScaledMaskedFloat64x8:
+               return rewriteValueAMD64_OpRoundScaledMaskedFloat64x8(v)
+       case OpRoundScaledResidueFloat32x16:
+               return rewriteValueAMD64_OpRoundScaledResidueFloat32x16(v)
+       case OpRoundScaledResidueFloat32x4:
+               return rewriteValueAMD64_OpRoundScaledResidueFloat32x4(v)
+       case OpRoundScaledResidueFloat32x8:
+               return rewriteValueAMD64_OpRoundScaledResidueFloat32x8(v)
+       case OpRoundScaledResidueFloat64x2:
+               return rewriteValueAMD64_OpRoundScaledResidueFloat64x2(v)
+       case OpRoundScaledResidueFloat64x4:
+               return rewriteValueAMD64_OpRoundScaledResidueFloat64x4(v)
+       case OpRoundScaledResidueFloat64x8:
+               return rewriteValueAMD64_OpRoundScaledResidueFloat64x8(v)
+       case OpRoundScaledResidueMaskedFloat32x16:
+               return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x16(v)
+       case OpRoundScaledResidueMaskedFloat32x4:
+               return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x4(v)
+       case OpRoundScaledResidueMaskedFloat32x8:
+               return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x8(v)
+       case OpRoundScaledResidueMaskedFloat64x2:
+               return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x2(v)
+       case OpRoundScaledResidueMaskedFloat64x4:
+               return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x4(v)
+       case OpRoundScaledResidueMaskedFloat64x8:
+               return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x8(v)
        case OpRoundToEven:
                return rewriteValueAMD64_OpRoundToEven(v)
-       case OpRoundWithPrecisionFloat32x16:
-               return rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v)
-       case OpRoundWithPrecisionFloat32x4:
-               return rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v)
-       case OpRoundWithPrecisionFloat32x8:
-               return rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v)
-       case OpRoundWithPrecisionFloat64x2:
-               return rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v)
-       case OpRoundWithPrecisionFloat64x4:
-               return rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v)
-       case OpRoundWithPrecisionFloat64x8:
-               return rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v)
-       case OpRoundWithPrecisionMaskedFloat32x16:
-               return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x16(v)
-       case OpRoundWithPrecisionMaskedFloat32x4:
-               return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x4(v)
-       case OpRoundWithPrecisionMaskedFloat32x8:
-               return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x8(v)
-       case OpRoundWithPrecisionMaskedFloat64x2:
-               return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x2(v)
-       case OpRoundWithPrecisionMaskedFloat64x4:
-               return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x4(v)
-       case OpRoundWithPrecisionMaskedFloat64x8:
-               return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x8(v)
        case OpRsh16Ux16:
                return rewriteValueAMD64_OpRsh16Ux16(v)
        case OpRsh16Ux32:
@@ -4257,138 +4233,6 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v)
        case OpSaturatedAddDotProdMaskedInt32x8:
                return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v)
-       case OpSaturatedAddInt16x16:
-               v.Op = OpAMD64VPADDSW256
-               return true
-       case OpSaturatedAddInt16x32:
-               v.Op = OpAMD64VPADDSW512
-               return true
-       case OpSaturatedAddInt16x8:
-               v.Op = OpAMD64VPADDSW128
-               return true
-       case OpSaturatedAddInt8x16:
-               v.Op = OpAMD64VPADDSB128
-               return true
-       case OpSaturatedAddInt8x32:
-               v.Op = OpAMD64VPADDSB256
-               return true
-       case OpSaturatedAddInt8x64:
-               v.Op = OpAMD64VPADDSB512
-               return true
-       case OpSaturatedAddMaskedInt16x16:
-               return rewriteValueAMD64_OpSaturatedAddMaskedInt16x16(v)
-       case OpSaturatedAddMaskedInt16x32:
-               return rewriteValueAMD64_OpSaturatedAddMaskedInt16x32(v)
-       case OpSaturatedAddMaskedInt16x8:
-               return rewriteValueAMD64_OpSaturatedAddMaskedInt16x8(v)
-       case OpSaturatedAddMaskedInt8x16:
-               return rewriteValueAMD64_OpSaturatedAddMaskedInt8x16(v)
-       case OpSaturatedAddMaskedInt8x32:
-               return rewriteValueAMD64_OpSaturatedAddMaskedInt8x32(v)
-       case OpSaturatedAddMaskedInt8x64:
-               return rewriteValueAMD64_OpSaturatedAddMaskedInt8x64(v)
-       case OpSaturatedAddMaskedUint16x16:
-               return rewriteValueAMD64_OpSaturatedAddMaskedUint16x16(v)
-       case OpSaturatedAddMaskedUint16x32:
-               return rewriteValueAMD64_OpSaturatedAddMaskedUint16x32(v)
-       case OpSaturatedAddMaskedUint16x8:
-               return rewriteValueAMD64_OpSaturatedAddMaskedUint16x8(v)
-       case OpSaturatedAddMaskedUint8x16:
-               return rewriteValueAMD64_OpSaturatedAddMaskedUint8x16(v)
-       case OpSaturatedAddMaskedUint8x32:
-               return rewriteValueAMD64_OpSaturatedAddMaskedUint8x32(v)
-       case OpSaturatedAddMaskedUint8x64:
-               return rewriteValueAMD64_OpSaturatedAddMaskedUint8x64(v)
-       case OpSaturatedAddUint16x16:
-               v.Op = OpAMD64VPADDSW256
-               return true
-       case OpSaturatedAddUint16x32:
-               v.Op = OpAMD64VPADDSW512
-               return true
-       case OpSaturatedAddUint16x8:
-               v.Op = OpAMD64VPADDSW128
-               return true
-       case OpSaturatedAddUint8x16:
-               v.Op = OpAMD64VPADDSB128
-               return true
-       case OpSaturatedAddUint8x32:
-               v.Op = OpAMD64VPADDSB256
-               return true
-       case OpSaturatedAddUint8x64:
-               v.Op = OpAMD64VPADDSB512
-               return true
-       case OpSaturatedPairwiseAddInt16x16:
-               v.Op = OpAMD64VPHADDSW256
-               return true
-       case OpSaturatedPairwiseAddInt16x8:
-               v.Op = OpAMD64VPHADDSW128
-               return true
-       case OpSaturatedPairwiseSubInt16x16:
-               v.Op = OpAMD64VPHSUBSW256
-               return true
-       case OpSaturatedPairwiseSubInt16x8:
-               v.Op = OpAMD64VPHSUBSW128
-               return true
-       case OpSaturatedSubInt16x16:
-               v.Op = OpAMD64VPSUBSW256
-               return true
-       case OpSaturatedSubInt16x32:
-               v.Op = OpAMD64VPSUBSW512
-               return true
-       case OpSaturatedSubInt16x8:
-               v.Op = OpAMD64VPSUBSW128
-               return true
-       case OpSaturatedSubInt8x16:
-               v.Op = OpAMD64VPSUBSB128
-               return true
-       case OpSaturatedSubInt8x32:
-               v.Op = OpAMD64VPSUBSB256
-               return true
-       case OpSaturatedSubInt8x64:
-               v.Op = OpAMD64VPSUBSB512
-               return true
-       case OpSaturatedSubMaskedInt16x16:
-               return rewriteValueAMD64_OpSaturatedSubMaskedInt16x16(v)
-       case OpSaturatedSubMaskedInt16x32:
-               return rewriteValueAMD64_OpSaturatedSubMaskedInt16x32(v)
-       case OpSaturatedSubMaskedInt16x8:
-               return rewriteValueAMD64_OpSaturatedSubMaskedInt16x8(v)
-       case OpSaturatedSubMaskedInt8x16:
-               return rewriteValueAMD64_OpSaturatedSubMaskedInt8x16(v)
-       case OpSaturatedSubMaskedInt8x32:
-               return rewriteValueAMD64_OpSaturatedSubMaskedInt8x32(v)
-       case OpSaturatedSubMaskedInt8x64:
-               return rewriteValueAMD64_OpSaturatedSubMaskedInt8x64(v)
-       case OpSaturatedSubMaskedUint16x16:
-               return rewriteValueAMD64_OpSaturatedSubMaskedUint16x16(v)
-       case OpSaturatedSubMaskedUint16x32:
-               return rewriteValueAMD64_OpSaturatedSubMaskedUint16x32(v)
-       case OpSaturatedSubMaskedUint16x8:
-               return rewriteValueAMD64_OpSaturatedSubMaskedUint16x8(v)
-       case OpSaturatedSubMaskedUint8x16:
-               return rewriteValueAMD64_OpSaturatedSubMaskedUint8x16(v)
-       case OpSaturatedSubMaskedUint8x32:
-               return rewriteValueAMD64_OpSaturatedSubMaskedUint8x32(v)
-       case OpSaturatedSubMaskedUint8x64:
-               return rewriteValueAMD64_OpSaturatedSubMaskedUint8x64(v)
-       case OpSaturatedSubUint16x16:
-               v.Op = OpAMD64VPSUBSW256
-               return true
-       case OpSaturatedSubUint16x32:
-               v.Op = OpAMD64VPSUBSW512
-               return true
-       case OpSaturatedSubUint16x8:
-               v.Op = OpAMD64VPSUBSW128
-               return true
-       case OpSaturatedSubUint8x16:
-               v.Op = OpAMD64VPSUBSB128
-               return true
-       case OpSaturatedSubUint8x32:
-               v.Op = OpAMD64VPSUBSB256
-               return true
-       case OpSaturatedSubUint8x64:
-               v.Op = OpAMD64VPSUBSB512
-               return true
        case OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16:
                return rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16(v)
        case OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32:
@@ -4419,6 +4263,36 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v)
        case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8:
                return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v)
+       case OpScaleFloat32x16:
+               v.Op = OpAMD64VSCALEFPS512
+               return true
+       case OpScaleFloat32x4:
+               v.Op = OpAMD64VSCALEFPS128
+               return true
+       case OpScaleFloat32x8:
+               v.Op = OpAMD64VSCALEFPS256
+               return true
+       case OpScaleFloat64x2:
+               v.Op = OpAMD64VSCALEFPD128
+               return true
+       case OpScaleFloat64x4:
+               v.Op = OpAMD64VSCALEFPD256
+               return true
+       case OpScaleFloat64x8:
+               v.Op = OpAMD64VSCALEFPD512
+               return true
+       case OpScaleMaskedFloat32x16:
+               return rewriteValueAMD64_OpScaleMaskedFloat32x16(v)
+       case OpScaleMaskedFloat32x4:
+               return rewriteValueAMD64_OpScaleMaskedFloat32x4(v)
+       case OpScaleMaskedFloat32x8:
+               return rewriteValueAMD64_OpScaleMaskedFloat32x8(v)
+       case OpScaleMaskedFloat64x2:
+               return rewriteValueAMD64_OpScaleMaskedFloat64x2(v)
+       case OpScaleMaskedFloat64x4:
+               return rewriteValueAMD64_OpScaleMaskedFloat64x4(v)
+       case OpScaleMaskedFloat64x8:
+               return rewriteValueAMD64_OpScaleMaskedFloat64x8(v)
        case OpSelect0:
                return rewriteValueAMD64_OpSelect0(v)
        case OpSelect1:
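
The Scale* cases above (renamed from MulByPowOf2*) map one-to-one onto VSCALEFP{S,D}, which per lane computes x * 2^floor(y); the sketch below models that semantics in scalar Go (an assumption based on the instruction's documented behavior, not something this CL states):

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		// One lane of VSCALEFPD: multiply x by 2 raised to floor(y).
		scale := func(x, y float64) float64 { return x * math.Exp2(math.Floor(y)) }
		fmt.Println(scale(1.5, 3.7)) // 1.5 * 2^3 = 12
	}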
@@ -5446,9 +5320,111 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpSubMaskedUint8x32(v)
        case OpSubMaskedUint8x64:
                return rewriteValueAMD64_OpSubMaskedUint8x64(v)
+       case OpSubPairsFloat32x4:
+               v.Op = OpAMD64VHSUBPS128
+               return true
+       case OpSubPairsFloat32x8:
+               v.Op = OpAMD64VHSUBPS256
+               return true
+       case OpSubPairsFloat64x2:
+               v.Op = OpAMD64VHSUBPD128
+               return true
+       case OpSubPairsFloat64x4:
+               v.Op = OpAMD64VHSUBPD256
+               return true
+       case OpSubPairsInt16x16:
+               v.Op = OpAMD64VPHSUBW256
+               return true
+       case OpSubPairsInt16x8:
+               v.Op = OpAMD64VPHSUBW128
+               return true
+       case OpSubPairsInt32x4:
+               v.Op = OpAMD64VPHSUBD128
+               return true
+       case OpSubPairsInt32x8:
+               v.Op = OpAMD64VPHSUBD256
+               return true
+       case OpSubPairsSaturatedInt16x16:
+               v.Op = OpAMD64VPHSUBSW256
+               return true
+       case OpSubPairsSaturatedInt16x8:
+               v.Op = OpAMD64VPHSUBSW128
+               return true
+       case OpSubPairsUint16x16:
+               v.Op = OpAMD64VPHSUBW256
+               return true
+       case OpSubPairsUint16x8:
+               v.Op = OpAMD64VPHSUBW128
+               return true
+       case OpSubPairsUint32x4:
+               v.Op = OpAMD64VPHSUBD128
+               return true
+       case OpSubPairsUint32x8:
+               v.Op = OpAMD64VPHSUBD256
+               return true
        case OpSubPtr:
                v.Op = OpAMD64SUBQ
                return true
+       case OpSubSaturatedInt16x16:
+               v.Op = OpAMD64VPSUBSW256
+               return true
+       case OpSubSaturatedInt16x32:
+               v.Op = OpAMD64VPSUBSW512
+               return true
+       case OpSubSaturatedInt16x8:
+               v.Op = OpAMD64VPSUBSW128
+               return true
+       case OpSubSaturatedInt8x16:
+               v.Op = OpAMD64VPSUBSB128
+               return true
+       case OpSubSaturatedInt8x32:
+               v.Op = OpAMD64VPSUBSB256
+               return true
+       case OpSubSaturatedInt8x64:
+               v.Op = OpAMD64VPSUBSB512
+               return true
+       case OpSubSaturatedMaskedInt16x16:
+               return rewriteValueAMD64_OpSubSaturatedMaskedInt16x16(v)
+       case OpSubSaturatedMaskedInt16x32:
+               return rewriteValueAMD64_OpSubSaturatedMaskedInt16x32(v)
+       case OpSubSaturatedMaskedInt16x8:
+               return rewriteValueAMD64_OpSubSaturatedMaskedInt16x8(v)
+       case OpSubSaturatedMaskedInt8x16:
+               return rewriteValueAMD64_OpSubSaturatedMaskedInt8x16(v)
+       case OpSubSaturatedMaskedInt8x32:
+               return rewriteValueAMD64_OpSubSaturatedMaskedInt8x32(v)
+       case OpSubSaturatedMaskedInt8x64:
+               return rewriteValueAMD64_OpSubSaturatedMaskedInt8x64(v)
+       case OpSubSaturatedMaskedUint16x16:
+               return rewriteValueAMD64_OpSubSaturatedMaskedUint16x16(v)
+       case OpSubSaturatedMaskedUint16x32:
+               return rewriteValueAMD64_OpSubSaturatedMaskedUint16x32(v)
+       case OpSubSaturatedMaskedUint16x8:
+               return rewriteValueAMD64_OpSubSaturatedMaskedUint16x8(v)
+       case OpSubSaturatedMaskedUint8x16:
+               return rewriteValueAMD64_OpSubSaturatedMaskedUint8x16(v)
+       case OpSubSaturatedMaskedUint8x32:
+               return rewriteValueAMD64_OpSubSaturatedMaskedUint8x32(v)
+       case OpSubSaturatedMaskedUint8x64:
+               return rewriteValueAMD64_OpSubSaturatedMaskedUint8x64(v)
+       case OpSubSaturatedUint16x16:
+               v.Op = OpAMD64VPSUBSW256
+               return true
+       case OpSubSaturatedUint16x32:
+               v.Op = OpAMD64VPSUBSW512
+               return true
+       case OpSubSaturatedUint16x8:
+               v.Op = OpAMD64VPSUBSW128
+               return true
+       case OpSubSaturatedUint8x16:
+               v.Op = OpAMD64VPSUBSB128
+               return true
+       case OpSubSaturatedUint8x32:
+               v.Op = OpAMD64VPSUBSB256
+               return true
+       case OpSubSaturatedUint8x64:
+               v.Op = OpAMD64VPSUBSB512
+               return true
        case OpSubUint16x16:
                v.Op = OpAMD64VPSUBW256
                return true
@@ -5516,30 +5492,54 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpTruncFloat64x2(v)
        case OpTruncFloat64x4:
                return rewriteValueAMD64_OpTruncFloat64x4(v)
-       case OpTruncWithPrecisionFloat32x16:
-               return rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v)
-       case OpTruncWithPrecisionFloat32x4:
-               return rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v)
-       case OpTruncWithPrecisionFloat32x8:
-               return rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v)
-       case OpTruncWithPrecisionFloat64x2:
-               return rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v)
-       case OpTruncWithPrecisionFloat64x4:
-               return rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v)
-       case OpTruncWithPrecisionFloat64x8:
-               return rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v)
-       case OpTruncWithPrecisionMaskedFloat32x16:
-               return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x16(v)
-       case OpTruncWithPrecisionMaskedFloat32x4:
-               return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x4(v)
-       case OpTruncWithPrecisionMaskedFloat32x8:
-               return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x8(v)
-       case OpTruncWithPrecisionMaskedFloat64x2:
-               return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x2(v)
-       case OpTruncWithPrecisionMaskedFloat64x4:
-               return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x4(v)
-       case OpTruncWithPrecisionMaskedFloat64x8:
-               return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x8(v)
+       case OpTruncScaledFloat32x16:
+               return rewriteValueAMD64_OpTruncScaledFloat32x16(v)
+       case OpTruncScaledFloat32x4:
+               return rewriteValueAMD64_OpTruncScaledFloat32x4(v)
+       case OpTruncScaledFloat32x8:
+               return rewriteValueAMD64_OpTruncScaledFloat32x8(v)
+       case OpTruncScaledFloat64x2:
+               return rewriteValueAMD64_OpTruncScaledFloat64x2(v)
+       case OpTruncScaledFloat64x4:
+               return rewriteValueAMD64_OpTruncScaledFloat64x4(v)
+       case OpTruncScaledFloat64x8:
+               return rewriteValueAMD64_OpTruncScaledFloat64x8(v)
+       case OpTruncScaledMaskedFloat32x16:
+               return rewriteValueAMD64_OpTruncScaledMaskedFloat32x16(v)
+       case OpTruncScaledMaskedFloat32x4:
+               return rewriteValueAMD64_OpTruncScaledMaskedFloat32x4(v)
+       case OpTruncScaledMaskedFloat32x8:
+               return rewriteValueAMD64_OpTruncScaledMaskedFloat32x8(v)
+       case OpTruncScaledMaskedFloat64x2:
+               return rewriteValueAMD64_OpTruncScaledMaskedFloat64x2(v)
+       case OpTruncScaledMaskedFloat64x4:
+               return rewriteValueAMD64_OpTruncScaledMaskedFloat64x4(v)
+       case OpTruncScaledMaskedFloat64x8:
+               return rewriteValueAMD64_OpTruncScaledMaskedFloat64x8(v)
+       case OpTruncScaledResidueFloat32x16:
+               return rewriteValueAMD64_OpTruncScaledResidueFloat32x16(v)
+       case OpTruncScaledResidueFloat32x4:
+               return rewriteValueAMD64_OpTruncScaledResidueFloat32x4(v)
+       case OpTruncScaledResidueFloat32x8:
+               return rewriteValueAMD64_OpTruncScaledResidueFloat32x8(v)
+       case OpTruncScaledResidueFloat64x2:
+               return rewriteValueAMD64_OpTruncScaledResidueFloat64x2(v)
+       case OpTruncScaledResidueFloat64x4:
+               return rewriteValueAMD64_OpTruncScaledResidueFloat64x4(v)
+       case OpTruncScaledResidueFloat64x8:
+               return rewriteValueAMD64_OpTruncScaledResidueFloat64x8(v)
+       case OpTruncScaledResidueMaskedFloat32x16:
+               return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x16(v)
+       case OpTruncScaledResidueMaskedFloat32x4:
+               return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x4(v)
+       case OpTruncScaledResidueMaskedFloat32x8:
+               return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x8(v)
+       case OpTruncScaledResidueMaskedFloat64x2:
+               return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x2(v)
+       case OpTruncScaledResidueMaskedFloat64x4:
+               return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x4(v)
+       case OpTruncScaledResidueMaskedFloat64x8:
+               return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x8(v)
        case OpUnsignedSignedQuadDotProdAccumulateInt32x16:
                v.Op = OpAMD64VPDPBUSD512
                return true
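
In the rewriteValueAMD64 switch above, ops with a one-to-one AMD64 encoding are lowered in place (v.Op = OpAMD64...; return true), while the Masked ops delegate to per-op functions because they must also synthesize a mask conversion, as the AddSaturatedMasked* rewrites below show: the vector mask is first turned into a K-register value (VPMOVVec16x16ToM and friends, typed types.TypeMask), then passed as the third argument of the masked instruction. Each such function is generated from a rule of roughly this form in simdAMD64.rules (syntax reconstructed from the match/result comments, so treat it as a sketch):

	(AddSaturatedMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))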
@@ -29162,6 +29162,222 @@ func rewriteValueAMD64_OpAddMaskedUint8x64(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpAddSaturatedMaskedInt16x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedInt16x16 x y mask)
+       // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedInt16x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedInt16x32 x y mask)
+       // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedInt16x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedInt16x8 x y mask)
+       // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedInt8x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedInt8x16 x y mask)
+       // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedInt8x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedInt8x32 x y mask)
+       // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedInt8x64(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedInt8x64 x y mask)
+       // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedUint16x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedUint16x16 x y mask)
+       // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedUint16x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedUint16x32 x y mask)
+       // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedUint16x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedUint16x8 x y mask)
+       // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedUint8x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedUint8x16 x y mask)
+       // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedUint8x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedUint8x32 x y mask)
+       // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpAddSaturatedMaskedUint8x64(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (AddSaturatedMaskedUint8x64 x y mask)
+       // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPADDSBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
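
A note on the generated AddSaturatedMasked* rules above: every case has the same shape, differing only in lane width and vector size, and each lowers the generic op to a VPADDS{B,W}Masked instruction (both the Int and Uint shapes currently map to the signed VPADDS forms) after converting the vector mask with a VPMOVVec*ToM pseudo-op. For the "saturated" part, the following scalar sketch — an illustration of one int16 lane, not code from this commit — shows the semantics being selected:

package main

import "fmt"

// addSat16 models one int16 lane of a saturating add: the wide sum is
// clamped to the int16 range instead of wrapping around.
func addSat16(a, b int16) int16 {
	s := int32(a) + int32(b)
	if s > 32767 {
		return 32767
	}
	if s < -32768 {
		return -32768
	}
	return int16(s)
}

func main() {
	fmt.Println(addSat16(30000, 10000))   // 32767 (clamped, not the wrapped -25536)
	fmt.Println(addSat16(-30000, -10000)) // -32768 (clamped)
}
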
 func rewriteValueAMD64_OpAddr(v *Value) bool {
        v_0 := v.Args[0]
        // match: (Addr {sym} base)
@@ -30521,9 +30737,9 @@ func rewriteValueAMD64_OpCeilFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledFloat32x16(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (CeilWithPrecisionFloat32x16 [a] x)
+       // match: (CeilScaledFloat32x16 [a] x)
        // result: (VRNDSCALEPS512 [a+2] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30534,9 +30750,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledFloat32x4(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (CeilWithPrecisionFloat32x4 [a] x)
+       // match: (CeilScaledFloat32x4 [a] x)
        // result: (VRNDSCALEPS128 [a+2] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30547,9 +30763,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledFloat32x8(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (CeilWithPrecisionFloat32x8 [a] x)
+       // match: (CeilScaledFloat32x8 [a] x)
        // result: (VRNDSCALEPS256 [a+2] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30560,9 +30776,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledFloat64x2(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (CeilWithPrecisionFloat64x2 [a] x)
+       // match: (CeilScaledFloat64x2 [a] x)
        // result: (VRNDSCALEPD128 [a+2] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30573,9 +30789,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledFloat64x4(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (CeilWithPrecisionFloat64x4 [a] x)
+       // match: (CeilScaledFloat64x4 [a] x)
        // result: (VRNDSCALEPD256 [a+2] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30586,9 +30802,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledFloat64x8(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (CeilWithPrecisionFloat64x8 [a] x)
+       // match: (CeilScaledFloat64x8 [a] x)
        // result: (VRNDSCALEPD512 [a+2] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30599,11 +30815,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x16(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledMaskedFloat32x16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (CeilWithPrecisionMaskedFloat32x16 [a] x mask)
+       // match: (CeilScaledMaskedFloat32x16 [a] x mask)
        // result: (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30617,11 +30833,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x16(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x4(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledMaskedFloat32x4(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (CeilWithPrecisionMaskedFloat32x4 [a] x mask)
+       // match: (CeilScaledMaskedFloat32x4 [a] x mask)
        // result: (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30635,11 +30851,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x8(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledMaskedFloat32x8(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (CeilWithPrecisionMaskedFloat32x8 [a] x mask)
+       // match: (CeilScaledMaskedFloat32x8 [a] x mask)
        // result: (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30653,11 +30869,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x2(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledMaskedFloat64x2(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (CeilWithPrecisionMaskedFloat64x2 [a] x mask)
+       // match: (CeilScaledMaskedFloat64x2 [a] x mask)
        // result: (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30671,11 +30887,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x2(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x4(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledMaskedFloat64x4(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (CeilWithPrecisionMaskedFloat64x4 [a] x mask)
+       // match: (CeilScaledMaskedFloat64x4 [a] x mask)
        // result: (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30689,11 +30905,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x8(v *Value) bool {
+func rewriteValueAMD64_OpCeilScaledMaskedFloat64x8(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (CeilWithPrecisionMaskedFloat64x8 [a] x mask)
+       // match: (CeilScaledMaskedFloat64x8 [a] x mask)
        // result: (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -30707,6 +30923,192 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x8(v *Value) bool {
                return true
        }
 }
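
Each Masked rule above follows the same two-step pattern: build a VPMOVVec{W}x{N}ToM value to turn the vector-shaped mask into a predicate, then pass it as the trailing operand of the masked instruction. As a rough conceptual model only — assuming, as with the VPMOVD2M-style instructions, that each lane's most significant bit carries the predicate; the real lowering targets an AVX-512 k-register:

// vecToMask sketches what a VPMOVVec32x4ToM-style conversion does:
// collapse a lane-per-element vector mask into one bit per lane.
func vecToMask(lanes [4]int32) uint8 {
	var m uint8
	for i, v := range lanes {
		if v < 0 { // most significant bit set => lane selected
			m |= 1 << i
		}
	}
	return m
}
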
+func rewriteValueAMD64_OpCeilScaledResidueFloat32x16(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (CeilScaledResidueFloat32x16 [a] x)
+       // result: (VREDUCEPS512 [a+2] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS512)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueFloat32x4(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (CeilScaledResidueFloat32x4 [a] x)
+       // result: (VREDUCEPS128 [a+2] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS128)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueFloat32x8(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (CeilScaledResidueFloat32x8 [a] x)
+       // result: (VREDUCEPS256 [a+2] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS256)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueFloat64x2(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (CeilScaledResidueFloat64x2 [a] x)
+       // result: (VREDUCEPD128 [a+2] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD128)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueFloat64x4(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (CeilScaledResidueFloat64x4 [a] x)
+       // result: (VREDUCEPD256 [a+2] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD256)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueFloat64x8(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (CeilScaledResidueFloat64x8 [a] x)
+       // result: (VREDUCEPD512 [a+2] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD512)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (CeilScaledResidueMaskedFloat32x16 [a] x mask)
+       // result: (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked512)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (CeilScaledResidueMaskedFloat32x4 [a] x mask)
+       // result: (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked128)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (CeilScaledResidueMaskedFloat32x8 [a] x mask)
+       // result: (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked256)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x2(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (CeilScaledResidueMaskedFloat64x2 [a] x mask)
+       // result: (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked128)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (CeilScaledResidueMaskedFloat64x4 [a] x mask)
+       // result: (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked256)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (CeilScaledResidueMaskedFloat64x8 [a] x mask)
+       // result: (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked512)
+               v.AuxInt = int8ToAuxInt(a + 2)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
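
The CeilScaledResidue* rules above all lower to the VREDUCE family, whose result is the input minus the input rounded at a scaled precision. Under that reading, a scalar sketch of one float64 lane (prec standing for the precision field of the immediate; illustration only, not code from this commit):

import "math"

// ceilScaledResidue models CeilScaledResidue on one float64 lane: the
// part of x left over after rounding x up at 2^-prec granularity.
func ceilScaledResidue(x float64, prec uint) float64 {
	scale := math.Ldexp(1, int(prec)) // 2^prec
	return x - math.Ceil(x*scale)/scale
}

For example, ceilScaledResidue(1.25, 1) is 1.25 - ceil(2.5)/2 = -0.25.
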
 func rewriteValueAMD64_OpCompressFloat32x16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
@@ -32596,750 +32998,6 @@ func rewriteValueAMD64_OpCvt8toMask64x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x16(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithCeilWithPrecisionFloat32x16 [a] x)
-       // result: (VREDUCEPS512 [a+2] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS512)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x4(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithCeilWithPrecisionFloat32x4 [a] x)
-       // result: (VREDUCEPS128 [a+2] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS128)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x8(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithCeilWithPrecisionFloat32x8 [a] x)
-       // result: (VREDUCEPS256 [a+2] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS256)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x2(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithCeilWithPrecisionFloat64x2 [a] x)
-       // result: (VREDUCEPD128 [a+2] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD128)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x4(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithCeilWithPrecisionFloat64x4 [a] x)
-       // result: (VREDUCEPD256 [a+2] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD256)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x8(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithCeilWithPrecisionFloat64x8 [a] x)
-       // result: (VREDUCEPD512 [a+2] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD512)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x16(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithCeilWithPrecisionMaskedFloat32x16 [a] x mask)
-       // result: (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked512)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x4(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithCeilWithPrecisionMaskedFloat32x4 [a] x mask)
-       // result: (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked128)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x8(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithCeilWithPrecisionMaskedFloat32x8 [a] x mask)
-       // result: (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked256)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x2(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithCeilWithPrecisionMaskedFloat64x2 [a] x mask)
-       // result: (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked128)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x4(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithCeilWithPrecisionMaskedFloat64x4 [a] x mask)
-       // result: (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked256)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x8(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithCeilWithPrecisionMaskedFloat64x8 [a] x mask)
-       // result: (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked512)
-               v.AuxInt = int8ToAuxInt(a + 2)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x16(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithFloorWithPrecisionFloat32x16 [a] x)
-       // result: (VREDUCEPS512 [a+1] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS512)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x4(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithFloorWithPrecisionFloat32x4 [a] x)
-       // result: (VREDUCEPS128 [a+1] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS128)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x8(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithFloorWithPrecisionFloat32x8 [a] x)
-       // result: (VREDUCEPS256 [a+1] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS256)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x2(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithFloorWithPrecisionFloat64x2 [a] x)
-       // result: (VREDUCEPD128 [a+1] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD128)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x4(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithFloorWithPrecisionFloat64x4 [a] x)
-       // result: (VREDUCEPD256 [a+1] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD256)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x8(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithFloorWithPrecisionFloat64x8 [a] x)
-       // result: (VREDUCEPD512 [a+1] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD512)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x16(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithFloorWithPrecisionMaskedFloat32x16 [a] x mask)
-       // result: (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked512)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x4(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithFloorWithPrecisionMaskedFloat32x4 [a] x mask)
-       // result: (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked128)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x8(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithFloorWithPrecisionMaskedFloat32x8 [a] x mask)
-       // result: (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked256)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x2(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithFloorWithPrecisionMaskedFloat64x2 [a] x mask)
-       // result: (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked128)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x4(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithFloorWithPrecisionMaskedFloat64x4 [a] x mask)
-       // result: (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked256)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x8(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithFloorWithPrecisionMaskedFloat64x8 [a] x mask)
-       // result: (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked512)
-               v.AuxInt = int8ToAuxInt(a + 1)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x16(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithRoundWithPrecisionFloat32x16 [a] x)
-       // result: (VREDUCEPS512 [a+0] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS512)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x4(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithRoundWithPrecisionFloat32x4 [a] x)
-       // result: (VREDUCEPS128 [a+0] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS128)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x8(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithRoundWithPrecisionFloat32x8 [a] x)
-       // result: (VREDUCEPS256 [a+0] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS256)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x2(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithRoundWithPrecisionFloat64x2 [a] x)
-       // result: (VREDUCEPD128 [a+0] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD128)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x4(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithRoundWithPrecisionFloat64x4 [a] x)
-       // result: (VREDUCEPD256 [a+0] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD256)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x8(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithRoundWithPrecisionFloat64x8 [a] x)
-       // result: (VREDUCEPD512 [a+0] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD512)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x16(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithRoundWithPrecisionMaskedFloat32x16 [a] x mask)
-       // result: (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked512)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x4(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithRoundWithPrecisionMaskedFloat32x4 [a] x mask)
-       // result: (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked128)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x8(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithRoundWithPrecisionMaskedFloat32x8 [a] x mask)
-       // result: (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked256)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x2(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithRoundWithPrecisionMaskedFloat64x2 [a] x mask)
-       // result: (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked128)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x4(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithRoundWithPrecisionMaskedFloat64x4 [a] x mask)
-       // result: (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked256)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x8(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithRoundWithPrecisionMaskedFloat64x8 [a] x mask)
-       // result: (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked512)
-               v.AuxInt = int8ToAuxInt(a + 0)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x16(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithTruncWithPrecisionFloat32x16 [a] x)
-       // result: (VREDUCEPS512 [a+3] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS512)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x4(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithTruncWithPrecisionFloat32x4 [a] x)
-       // result: (VREDUCEPS128 [a+3] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS128)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x8(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithTruncWithPrecisionFloat32x8 [a] x)
-       // result: (VREDUCEPS256 [a+3] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPS256)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x2(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithTruncWithPrecisionFloat64x2 [a] x)
-       // result: (VREDUCEPD128 [a+3] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD128)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x4(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithTruncWithPrecisionFloat64x4 [a] x)
-       // result: (VREDUCEPD256 [a+3] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD256)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x8(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (DiffWithTruncWithPrecisionFloat64x8 [a] x)
-       // result: (VREDUCEPD512 [a+3] x)
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               v.reset(OpAMD64VREDUCEPD512)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v.AddArg(x)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x16(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithTruncWithPrecisionMaskedFloat32x16 [a] x mask)
-       // result: (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked512)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x4(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithTruncWithPrecisionMaskedFloat32x4 [a] x mask)
-       // result: (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked128)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x8(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithTruncWithPrecisionMaskedFloat32x8 [a] x mask)
-       // result: (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPSMasked256)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x2(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithTruncWithPrecisionMaskedFloat64x2 [a] x mask)
-       // result: (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked128)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x4(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithTruncWithPrecisionMaskedFloat64x4 [a] x mask)
-       // result: (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked256)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x8(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (DiffWithTruncWithPrecisionMaskedFloat64x8 [a] x mask)
-       // result: (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-       for {
-               a := auxIntToInt8(v.AuxInt)
-               x := v_0
-               mask := v_1
-               v.reset(OpAMD64VREDUCEPDMasked512)
-               v.AuxInt = int8ToAuxInt(a + 3)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg2(x, v0)
-               return true
-       }
-}
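
The DiffWith*WithPrecision* functions removed here reappear earlier in this file under the new *ScaledResidue* names with identical bodies — the same VREDUCE opcodes and the same immediate offsets — so the rename changes no lowering. The offsets that distinguish the four rounding flavors are visible directly in the results above, and the VRNDSCALE-based *Scaled rules use the same four:

// Offsets added to the precision-carrying auxint a, as emitted by the
// rules in this file (assuming the usual AVX-512 imm8 layout, where the
// low bits select the rounding mode):
const (
	rcRound = 0 // Round* -> [a+0] (round to nearest)
	rcFloor = 1 // Floor* -> [a+1] (round down)
	rcCeil  = 2 // Ceil*  -> [a+2] (round up)
	rcTrunc = 3 // Trunc* -> [a+3] (round toward zero)
)
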
 func rewriteValueAMD64_OpDiv16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
@@ -34731,9 +34389,9 @@ func rewriteValueAMD64_OpFloorFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledFloat32x16(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (FloorWithPrecisionFloat32x16 [a] x)
+       // match: (FloorScaledFloat32x16 [a] x)
        // result: (VRNDSCALEPS512 [a+1] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34744,9 +34402,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledFloat32x4(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (FloorWithPrecisionFloat32x4 [a] x)
+       // match: (FloorScaledFloat32x4 [a] x)
        // result: (VRNDSCALEPS128 [a+1] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34757,9 +34415,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledFloat32x8(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (FloorWithPrecisionFloat32x8 [a] x)
+       // match: (FloorScaledFloat32x8 [a] x)
        // result: (VRNDSCALEPS256 [a+1] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34770,9 +34428,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledFloat64x2(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (FloorWithPrecisionFloat64x2 [a] x)
+       // match: (FloorScaledFloat64x2 [a] x)
        // result: (VRNDSCALEPD128 [a+1] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34783,9 +34441,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledFloat64x4(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (FloorWithPrecisionFloat64x4 [a] x)
+       // match: (FloorScaledFloat64x4 [a] x)
        // result: (VRNDSCALEPD256 [a+1] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34796,9 +34454,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledFloat64x8(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (FloorWithPrecisionFloat64x8 [a] x)
+       // match: (FloorScaledFloat64x8 [a] x)
        // result: (VRNDSCALEPD512 [a+1] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34809,11 +34467,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x16(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledMaskedFloat32x16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (FloorWithPrecisionMaskedFloat32x16 [a] x mask)
+       // match: (FloorScaledMaskedFloat32x16 [a] x mask)
        // result: (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34827,11 +34485,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x16(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x4(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledMaskedFloat32x4(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (FloorWithPrecisionMaskedFloat32x4 [a] x mask)
+       // match: (FloorScaledMaskedFloat32x4 [a] x mask)
        // result: (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34845,11 +34503,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x8(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledMaskedFloat32x8(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (FloorWithPrecisionMaskedFloat32x8 [a] x mask)
+       // match: (FloorScaledMaskedFloat32x8 [a] x mask)
        // result: (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34863,11 +34521,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x2(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledMaskedFloat64x2(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (FloorWithPrecisionMaskedFloat64x2 [a] x mask)
+       // match: (FloorScaledMaskedFloat64x2 [a] x mask)
        // result: (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34881,11 +34539,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x2(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x4(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledMaskedFloat64x4(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (FloorWithPrecisionMaskedFloat64x4 [a] x mask)
+       // match: (FloorScaledMaskedFloat64x4 [a] x mask)
        // result: (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34899,11 +34557,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x8(v *Value) bool {
+func rewriteValueAMD64_OpFloorScaledMaskedFloat64x8(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (FloorWithPrecisionMaskedFloat64x8 [a] x mask)
+       // match: (FloorScaledMaskedFloat64x8 [a] x mask)
        // result: (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -34917,6 +34575,192 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x8(v *Value) bool {
                return true
        }
 }
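
Where the *ScaledResidue rules below lower to VREDUCE, the plain FloorScaled* rules above lower to VRNDSCALE, which keeps the rounded value rather than the leftover. In the same scalar-sketch terms as ceilScaledResidue earlier (one float64 lane, prec standing for the immediate's precision field):

import "math"

// floorScaled models FloorScaled on one float64 lane: x rounded down
// at 2^-prec granularity. Up to floating-point rounding, its VREDUCE
// counterpart returns exactly x - floorScaled(x, prec).
func floorScaled(x float64, prec uint) float64 {
	scale := math.Ldexp(1, int(prec)) // 2^prec
	return math.Floor(x*scale) / scale
}
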
+func rewriteValueAMD64_OpFloorScaledResidueFloat32x16(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (FloorScaledResidueFloat32x16 [a] x)
+       // result: (VREDUCEPS512 [a+1] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS512)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueFloat32x4(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (FloorScaledResidueFloat32x4 [a] x)
+       // result: (VREDUCEPS128 [a+1] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS128)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueFloat32x8(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (FloorScaledResidueFloat32x8 [a] x)
+       // result: (VREDUCEPS256 [a+1] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS256)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueFloat64x2(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (FloorScaledResidueFloat64x2 [a] x)
+       // result: (VREDUCEPD128 [a+1] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD128)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueFloat64x4(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (FloorScaledResidueFloat64x4 [a] x)
+       // result: (VREDUCEPD256 [a+1] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD256)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueFloat64x8(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (FloorScaledResidueFloat64x8 [a] x)
+       // result: (VREDUCEPD512 [a+1] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD512)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (FloorScaledResidueMaskedFloat32x16 [a] x mask)
+       // result: (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked512)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (FloorScaledResidueMaskedFloat32x4 [a] x mask)
+       // result: (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked128)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (FloorScaledResidueMaskedFloat32x8 [a] x mask)
+       // result: (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked256)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x2(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (FloorScaledResidueMaskedFloat64x2 [a] x mask)
+       // result: (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked128)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (FloorScaledResidueMaskedFloat64x4 [a] x mask)
+       // result: (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked256)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (FloorScaledResidueMaskedFloat64x8 [a] x mask)
+       // result: (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked512)
+               v.AuxInt = int8ToAuxInt(a + 1)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
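Every masked variant above follows the same lowering shape: the generic op's vector-typed mask is first converted to an AVX-512 predicate with the VPMOVVec…ToM pseudo-op, then passed as the final operand of the masked machine op. These functions are generated from _gen/simdAMD64.rules; judging by the match/result comments, the rule behind the last function above has roughly this shape (sketch):

	(FloorScaledResidueMaskedFloat64x8 [a] x mask) =>
		(VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))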
 func rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat32x16(v *Value) bool {
        v_3 := v.Args[3]
        v_2 := v.Args[2]
@@ -43583,114 +43427,6 @@ func rewriteValueAMD64_OpMove(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x16(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (MulByPowOf2MaskedFloat32x16 x y mask)
-       // result: (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VSCALEFPSMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x4(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (MulByPowOf2MaskedFloat32x4 x y mask)
-       // result: (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VSCALEFPSMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x8(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (MulByPowOf2MaskedFloat32x8 x y mask)
-       // result: (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VSCALEFPSMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x2(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (MulByPowOf2MaskedFloat64x2 x y mask)
-       // result: (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VSCALEFPDMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x4(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (MulByPowOf2MaskedFloat64x4 x y mask)
-       // result: (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VSCALEFPDMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x8(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (MulByPowOf2MaskedFloat64x8 x y mask)
-       // result: (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VSCALEFPDMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
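The MulByPowOf2Masked* functions deleted above come back further down in this same file as ScaleMasked*, where VSCALEFP[SD]Masked is re-emitted under the new name; the lowering itself is unchanged. VSCALEF multiplies each lane of x by two raised to the floor of the corresponding lane of y; a scalar sketch:

	package main

	import (
		"fmt"
		"math"
	)

	// scalef mimics one float64 lane of VSCALEFPD: x times 2 to the
	// floor of y.
	func scalef(x, y float64) float64 {
		return x * math.Exp2(math.Floor(y))
	}

	func main() {
		fmt.Println(scalef(3, 2.9)) // 12, i.e. 3 * 2^2
	}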
 func rewriteValueAMD64_OpMulEvenWidenMaskedInt64x2(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
@@ -43907,270 +43643,270 @@ func rewriteValueAMD64_OpMulHighMaskedUint16x8(v *Value) bool {
                return true
        }
 }
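The next run of hunks folds MulLowMasked* into MulMasked*: for integer element types, plain Mul now lowers to VPMULL{W,D,Q}Masked, which keeps the low half of each widened lane product, while float Mul keeps lowering to VMULP[SD]Masked. In Go, low-half semantics are just the language's wrapping fixed-width multiply:

	package main

	import "fmt"

	// mulLow16 matches one int16 lane of VPMULLW: Go's fixed-width
	// multiply already discards the high half of the product.
	func mulLow16(a, b int16) int16 { return a * b }

	func main() {
		fmt.Println(mulLow16(300, 300)) // 24464, the low 16 bits of 90000
	}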
-func rewriteValueAMD64_OpMulLowMaskedInt16x16(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedFloat32x16(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulLowMaskedInt16x16 x y mask)
-       // result: (VPMULLWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       // match: (MulMaskedFloat32x16 x y mask)
+       // result: (VMULPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMULLWMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v.reset(OpAMD64VMULPSMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulLowMaskedInt16x32(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedFloat32x4(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulLowMaskedInt16x32 x y mask)
-       // result: (VPMULLWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       // match: (MulMaskedFloat32x4 x y mask)
+       // result: (VMULPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMULLWMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v.reset(OpAMD64VMULPSMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulLowMaskedInt16x8(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedFloat32x8(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulLowMaskedInt16x8 x y mask)
-       // result: (VPMULLWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       // match: (MulMaskedFloat32x8 x y mask)
+       // result: (VMULPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMULLWMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v.reset(OpAMD64VMULPSMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulLowMaskedInt32x16(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedFloat64x2(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulLowMaskedInt32x16 x y mask)
-       // result: (VPMULLDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+       // match: (MulMaskedFloat64x2 x y mask)
+       // result: (VMULPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMULLDMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v.reset(OpAMD64VMULPDMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulLowMaskedInt32x4(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedFloat64x4(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulLowMaskedInt32x4 x y mask)
-       // result: (VPMULLDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
+       // match: (MulMaskedFloat64x4 x y mask)
+       // result: (VMULPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMULLDMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v.reset(OpAMD64VMULPDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulLowMaskedInt32x8(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedFloat64x8(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulLowMaskedInt32x8 x y mask)
-       // result: (VPMULLDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+       // match: (MulMaskedFloat64x8 x y mask)
+       // result: (VMULPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMULLDMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v.reset(OpAMD64VMULPDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulLowMaskedInt64x2(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedInt16x16(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulLowMaskedInt64x2 x y mask)
-       // result: (VPMULLQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
+       // match: (MulMaskedInt16x16 x y mask)
+       // result: (VPMULLWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMULLQMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v.reset(OpAMD64VPMULLWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulLowMaskedInt64x4(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedInt16x32(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulLowMaskedInt64x4 x y mask)
-       // result: (VPMULLQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+       // match: (MulMaskedInt16x32 x y mask)
+       // result: (VPMULLWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMULLQMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v.reset(OpAMD64VPMULLWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulLowMaskedInt64x8(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedInt16x8(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulLowMaskedInt64x8 x y mask)
-       // result: (VPMULLQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+       // match: (MulMaskedInt16x8 x y mask)
+       // result: (VPMULLWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMULLQMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v.reset(OpAMD64VPMULLWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulMaskedFloat32x16(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedInt32x16(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulMaskedFloat32x16 x y mask)
-       // result: (VMULPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+       // match: (MulMaskedInt32x16 x y mask)
+       // result: (VPMULLDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VMULPSMasked512)
+               v.reset(OpAMD64VPMULLDMasked512)
                v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulMaskedFloat32x4(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedInt32x4(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulMaskedFloat32x4 x y mask)
-       // result: (VMULPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
+       // match: (MulMaskedInt32x4 x y mask)
+       // result: (VPMULLDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VMULPSMasked128)
+               v.reset(OpAMD64VPMULLDMasked128)
                v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulMaskedFloat32x8(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedInt32x8(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulMaskedFloat32x8 x y mask)
-       // result: (VMULPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+       // match: (MulMaskedInt32x8 x y mask)
+       // result: (VPMULLDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VMULPSMasked256)
+               v.reset(OpAMD64VPMULLDMasked256)
                v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulMaskedFloat64x2(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedInt64x2(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulMaskedFloat64x2 x y mask)
-       // result: (VMULPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
+       // match: (MulMaskedInt64x2 x y mask)
+       // result: (VPMULLQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VMULPDMasked128)
+               v.reset(OpAMD64VPMULLQMasked128)
                v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulMaskedFloat64x4(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedInt64x4(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulMaskedFloat64x4 x y mask)
-       // result: (VMULPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+       // match: (MulMaskedInt64x4 x y mask)
+       // result: (VPMULLQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VMULPDMasked256)
+               v.reset(OpAMD64VPMULLQMasked256)
                v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpMulMaskedFloat64x8(v *Value) bool {
+func rewriteValueAMD64_OpMulMaskedInt64x8(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (MulMaskedFloat64x8 x y mask)
-       // result: (VMULPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+       // match: (MulMaskedInt64x8 x y mask)
+       // result: (VPMULLQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VMULPDMasked512)
+               v.reset(OpAMD64VPMULLQMasked512)
                v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
@@ -48243,21 +47979,9 @@ func rewriteValueAMD64_OpRoundFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundToEven(v *Value) bool {
-       v_0 := v.Args[0]
-       // match: (RoundToEven x)
-       // result: (ROUNDSD [0] x)
-       for {
-               x := v_0
-               v.reset(OpAMD64ROUNDSD)
-               v.AuxInt = int8ToAuxInt(0)
-               v.AddArg(x)
-               return true
-       }
-}
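In the RoundScaled* functions below the immediate is [a+0], against [a+1] for FloorScaled* above. The working assumption in these rules appears to be that a carries the precision pre-shifted into the upper bits of the VRNDSCALE/VREDUCE immediate, with the low two bits selecting the rounding mode:

	// Assumed imm8 rounding-control values as these rules use them;
	// only +0 and +1 appear in this section, and the Ceil/Trunc
	// variants live elsewhere in the file.
	const (
		rcNearestEven = 0 // RoundScaled*: [a+0]
		rcDown        = 1 // FloorScaled*: [a+1]
		rcUp          = 2 // CeilScaled*:  [a+2]
		rcTruncate    = 3 // TruncScaled*: [a+3]
	)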
-func rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledFloat32x16(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (RoundWithPrecisionFloat32x16 [a] x)
+       // match: (RoundScaledFloat32x16 [a] x)
        // result: (VRNDSCALEPS512 [a+0] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48268,9 +47992,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledFloat32x4(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (RoundWithPrecisionFloat32x4 [a] x)
+       // match: (RoundScaledFloat32x4 [a] x)
        // result: (VRNDSCALEPS128 [a+0] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48281,9 +48005,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledFloat32x8(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (RoundWithPrecisionFloat32x8 [a] x)
+       // match: (RoundScaledFloat32x8 [a] x)
        // result: (VRNDSCALEPS256 [a+0] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48294,9 +48018,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledFloat64x2(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (RoundWithPrecisionFloat64x2 [a] x)
+       // match: (RoundScaledFloat64x2 [a] x)
        // result: (VRNDSCALEPD128 [a+0] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48307,9 +48031,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledFloat64x4(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (RoundWithPrecisionFloat64x4 [a] x)
+       // match: (RoundScaledFloat64x4 [a] x)
        // result: (VRNDSCALEPD256 [a+0] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48320,9 +48044,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledFloat64x8(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (RoundWithPrecisionFloat64x8 [a] x)
+       // match: (RoundScaledFloat64x8 [a] x)
        // result: (VRNDSCALEPD512 [a+0] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48333,11 +48057,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x16(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledMaskedFloat32x16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (RoundWithPrecisionMaskedFloat32x16 [a] x mask)
+       // match: (RoundScaledMaskedFloat32x16 [a] x mask)
        // result: (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48351,11 +48075,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x16(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x4(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledMaskedFloat32x4(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (RoundWithPrecisionMaskedFloat32x4 [a] x mask)
+       // match: (RoundScaledMaskedFloat32x4 [a] x mask)
        // result: (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48369,11 +48093,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x8(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledMaskedFloat32x8(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (RoundWithPrecisionMaskedFloat32x8 [a] x mask)
+       // match: (RoundScaledMaskedFloat32x8 [a] x mask)
        // result: (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48387,11 +48111,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x2(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledMaskedFloat64x2(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (RoundWithPrecisionMaskedFloat64x2 [a] x mask)
+       // match: (RoundScaledMaskedFloat64x2 [a] x mask)
        // result: (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48405,11 +48129,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x2(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x4(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledMaskedFloat64x4(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (RoundWithPrecisionMaskedFloat64x4 [a] x mask)
+       // match: (RoundScaledMaskedFloat64x4 [a] x mask)
        // result: (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48423,11 +48147,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x8(v *Value) bool {
+func rewriteValueAMD64_OpRoundScaledMaskedFloat64x8(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (RoundWithPrecisionMaskedFloat64x8 [a] x mask)
+       // match: (RoundScaledMaskedFloat64x8 [a] x mask)
        // result: (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -48441,6 +48165,204 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x8(v *Value) bool {
                return true
        }
 }
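RoundScaledResidue* below is the nearest-even counterpart of FloorScaledResidue* earlier in this file: the lowering targets the same VREDUCE instructions, only the mode bits differ. A scalar sketch of one lane, under the same assumptions as before:

	package main

	import (
		"fmt"
		"math"
	)

	// roundScaledResidue mimics one float64 lane of VREDUCEPD with the
	// nearest-even control: x minus x rounded to prec fractional bits.
	func roundScaledResidue(x float64, prec uint) float64 {
		s := math.Ldexp(1, int(prec))
		return x - math.RoundToEven(x*s)/s
	}

	func main() {
		fmt.Println(roundScaledResidue(2.71875, 3)) // -0.03125: 21.75/8 rounds up to 22/8
	}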
+func rewriteValueAMD64_OpRoundScaledResidueFloat32x16(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (RoundScaledResidueFloat32x16 [a] x)
+       // result: (VREDUCEPS512 [a+0] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS512)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueFloat32x4(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (RoundScaledResidueFloat32x4 [a] x)
+       // result: (VREDUCEPS128 [a+0] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS128)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueFloat32x8(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (RoundScaledResidueFloat32x8 [a] x)
+       // result: (VREDUCEPS256 [a+0] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS256)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueFloat64x2(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (RoundScaledResidueFloat64x2 [a] x)
+       // result: (VREDUCEPD128 [a+0] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD128)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueFloat64x4(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (RoundScaledResidueFloat64x4 [a] x)
+       // result: (VREDUCEPD256 [a+0] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD256)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueFloat64x8(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (RoundScaledResidueFloat64x8 [a] x)
+       // result: (VREDUCEPD512 [a+0] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD512)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (RoundScaledResidueMaskedFloat32x16 [a] x mask)
+       // result: (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked512)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (RoundScaledResidueMaskedFloat32x4 [a] x mask)
+       // result: (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked128)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (RoundScaledResidueMaskedFloat32x8 [a] x mask)
+       // result: (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked256)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x2(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (RoundScaledResidueMaskedFloat64x2 [a] x mask)
+       // result: (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked128)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (RoundScaledResidueMaskedFloat64x4 [a] x mask)
+       // result: (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked256)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (RoundScaledResidueMaskedFloat64x8 [a] x mask)
+       // result: (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked512)
+               v.AuxInt = int8ToAuxInt(a + 0)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
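OpRoundToEven itself is untouched by the rename: the deletion earlier in this hunk and the identical re-insertion below simply move it past the new RoundScaled* and RoundScaledResidue* functions so the file stays sorted by op name.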
+func rewriteValueAMD64_OpRoundToEven(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (RoundToEven x)
+       // result: (ROUNDSD [0] x)
+       for {
+               x := v_0
+               v.reset(OpAMD64ROUNDSD)
+               v.AuxInt = int8ToAuxInt(0)
+               v.AddArg(x)
+               return true
+       }
+}
 func rewriteValueAMD64_OpRsh16Ux16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
@@ -49829,552 +49751,228 @@ func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedAddMaskedInt16x16(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedInt16x16 x y mask)
-       // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSWMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedInt16x32(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedInt16x32 x y mask)
-       // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSWMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedInt16x8(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedInt16x8 x y mask)
-       // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSWMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedInt8x16(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedInt8x16 x y mask)
-       // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSBMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedInt8x32(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedInt8x32 x y mask)
-       // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSBMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedInt8x64(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedInt8x64 x y mask)
-       // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSBMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedUint16x16(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedUint16x16 x y mask)
-       // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSWMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedUint16x32(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedUint16x32 x y mask)
-       // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSWMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedUint16x8(v *Value) bool {
+func rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedAddMaskedUint16x8 x y mask)
-       // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       // match: (SaturatedUnsignedSignedPairDotProdMaskedUint8x16 x y mask)
+       // result: (VPMADDUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPADDSWMasked128)
+               v.reset(OpAMD64VPMADDUBSWMasked128)
                v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedAddMaskedUint8x16(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedUint8x16 x y mask)
-       // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSBMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedUint8x32(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedUint8x32 x y mask)
-       // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSBMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedAddMaskedUint8x64(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedAddMaskedUint8x64 x y mask)
-       // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPADDSBMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
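The SaturatedAddMasked* functions removed above, and the SaturatedSubMasked* ones removed just below, are the SaturatedOp -> OpSaturated half of the CL: they reappear earlier in the file as AddSaturatedMasked*/SubSaturatedMasked*, still lowering to VPADDS/VPSUBS. A scalar sketch of the signed saturating add those instructions implement per lane:

	package main

	import (
		"fmt"
		"math"
	)

	// addSat16 mimics one int16 lane of VPADDSW: the widened sum is
	// clamped to the int16 range instead of wrapping.
	func addSat16(a, b int16) int16 {
		s := int32(a) + int32(b)
		if s > math.MaxInt16 {
			return math.MaxInt16
		}
		if s < math.MinInt16 {
			return math.MinInt16
		}
		return int16(s)
	}

	func main() {
		fmt.Println(addSat16(30000, 30000)) // 32767, not the wrapped -5536
	}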
-func rewriteValueAMD64_OpSaturatedSubMaskedInt16x16(v *Value) bool {
+func rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedSubMaskedInt16x16 x y mask)
-       // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       // match: (SaturatedUnsignedSignedPairDotProdMaskedUint8x32 x y mask)
+       // result: (VPMADDUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPSUBSWMasked256)
+               v.reset(OpAMD64VPMADDUBSWMasked256)
                v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedSubMaskedInt16x32(v *Value) bool {
+func rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedSubMaskedInt16x32 x y mask)
-       // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       // match: (SaturatedUnsignedSignedPairDotProdMaskedUint8x64 x y mask)
+       // result: (VPMADDUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPSUBSWMasked512)
+               v.reset(OpAMD64VPMADDUBSWMasked512)
                v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedSubMaskedInt16x8(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedSubMaskedInt16x8 x y mask)
-       // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPSUBSWMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedSubMaskedInt8x16(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedSubMaskedInt8x16 x y mask)
-       // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPSUBSBMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedSubMaskedInt8x32(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedSubMaskedInt8x32 x y mask)
-       // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPSUBSBMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedSubMaskedInt8x64(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedSubMaskedInt8x64 x y mask)
-       // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPSUBSBMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedSubMaskedUint16x16(v *Value) bool {
+func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16(v *Value) bool {
+       v_3 := v.Args[3]
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedSubMaskedUint16x16 x y mask)
-       // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask)
+       // result: (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPSUBSWMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDSMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
                v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
+               v.AddArg4(x, y, z, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedSubMaskedUint16x32(v *Value) bool {
+func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v *Value) bool {
+       v_3 := v.Args[3]
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedSubMaskedUint16x32 x y mask)
-       // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask)
+       // result: (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPSUBSWMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDSMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
                v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
+               v.AddArg4(x, y, z, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedSubMaskedUint16x8(v *Value) bool {
+func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v *Value) bool {
+       v_3 := v.Args[3]
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedSubMaskedUint16x8 x y mask)
-       // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask)
+       // result: (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
-               mask := v_2
-               v.reset(OpAMD64VPSUBSWMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDSMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
                v0.AddArg(mask)
-               v.AddArg3(x, y, v0)
+               v.AddArg4(x, y, z, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedSubMaskedUint8x16(v *Value) bool {
+func rewriteValueAMD64_OpScaleMaskedFloat32x16(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedSubMaskedUint8x16 x y mask)
-       // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+       // match: (ScaleMaskedFloat32x16 x y mask)
+       // result: (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPSUBSBMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v.reset(OpAMD64VSCALEFPSMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedSubMaskedUint8x32(v *Value) bool {
+func rewriteValueAMD64_OpScaleMaskedFloat32x4(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedSubMaskedUint8x32 x y mask)
-       // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+       // match: (ScaleMaskedFloat32x4 x y mask)
+       // result: (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPSUBSBMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v.reset(OpAMD64VSCALEFPSMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedSubMaskedUint8x64(v *Value) bool {
+func rewriteValueAMD64_OpScaleMaskedFloat32x8(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedSubMaskedUint8x64 x y mask)
-       // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+       // match: (ScaleMaskedFloat32x8 x y mask)
+       // result: (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPSUBSBMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v.reset(OpAMD64VSCALEFPSMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16(v *Value) bool {
+func rewriteValueAMD64_OpScaleMaskedFloat64x2(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedUnsignedSignedPairDotProdMaskedUint8x16 x y mask)
-       // result: (VPMADDUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       // match: (ScaleMaskedFloat64x2 x y mask)
+       // result: (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMADDUBSWMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v.reset(OpAMD64VSCALEFPDMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32(v *Value) bool {
+func rewriteValueAMD64_OpScaleMaskedFloat64x4(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedUnsignedSignedPairDotProdMaskedUint8x32 x y mask)
-       // result: (VPMADDUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       // match: (ScaleMaskedFloat64x4 x y mask)
+       // result: (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMADDUBSWMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v.reset(OpAMD64VSCALEFPDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64(v *Value) bool {
+func rewriteValueAMD64_OpScaleMaskedFloat64x8(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (SaturatedUnsignedSignedPairDotProdMaskedUint8x64 x y mask)
-       // result: (VPMADDUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       // match: (ScaleMaskedFloat64x8 x y mask)
+       // result: (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
        for {
                x := v_0
                y := v_1
                mask := v_2
-               v.reset(OpAMD64VPMADDUBSWMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v.reset(OpAMD64VSCALEFPDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
                v0.AddArg(mask)
                v.AddArg3(x, y, v0)
                return true
        }
 }
-func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16(v *Value) bool {
-       v_3 := v.Args[3]
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask)
-       // result: (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               z := v_2
-               mask := v_3
-               v.reset(OpAMD64VPDPBUSDSMasked512)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg4(x, y, z, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v *Value) bool {
-       v_3 := v.Args[3]
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask)
-       // result: (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               z := v_2
-               mask := v_3
-               v.reset(OpAMD64VPDPBUSDSMasked128)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg4(x, y, z, v0)
-               return true
-       }
-}
-func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v *Value) bool {
-       v_3 := v.Args[3]
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       b := v.Block
-       // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask)
-       // result: (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
-       for {
-               x := v_0
-               y := v_1
-               z := v_2
-               mask := v_3
-               v.reset(OpAMD64VPDPBUSDSMasked256)
-               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
-               v0.AddArg(mask)
-               v.AddArg4(x, y, z, v0)
-               return true
-       }
-}
 func rewriteValueAMD64_OpSelect0(v *Value) bool {
        v_0 := v.Args[0]
        b := v.Block
@@ -54763,6 +54361,222 @@ func rewriteValueAMD64_OpSubMaskedUint8x64(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpSubSaturatedMaskedInt16x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedInt16x16 x y mask)
+       // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedInt16x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedInt16x32 x y mask)
+       // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedInt16x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedInt16x8 x y mask)
+       // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedInt8x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedInt8x16 x y mask)
+       // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedInt8x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedInt8x32 x y mask)
+       // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedInt8x64(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedInt8x64 x y mask)
+       // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedUint16x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedUint16x16 x y mask)
+       // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedUint16x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedUint16x32 x y mask)
+       // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedUint16x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedUint16x8 x y mask)
+       // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedUint8x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedUint8x16 x y mask)
+       // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedUint8x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedUint8x32 x y mask)
+       // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSubSaturatedMaskedUint8x64(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (SubSaturatedMaskedUint8x64 x y mask)
+       // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPSUBSBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
 func rewriteValueAMD64_OpTrunc(v *Value) bool {
        v_0 := v.Args[0]
        // match: (Trunc x)
@@ -54823,9 +54637,9 @@ func rewriteValueAMD64_OpTruncFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledFloat32x16(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (TruncWithPrecisionFloat32x16 [a] x)
+       // match: (TruncScaledFloat32x16 [a] x)
        // result: (VRNDSCALEPS512 [a+3] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54836,9 +54650,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledFloat32x4(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (TruncWithPrecisionFloat32x4 [a] x)
+       // match: (TruncScaledFloat32x4 [a] x)
        // result: (VRNDSCALEPS128 [a+3] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54849,9 +54663,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledFloat32x8(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (TruncWithPrecisionFloat32x8 [a] x)
+       // match: (TruncScaledFloat32x8 [a] x)
        // result: (VRNDSCALEPS256 [a+3] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54862,9 +54676,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledFloat64x2(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (TruncWithPrecisionFloat64x2 [a] x)
+       // match: (TruncScaledFloat64x2 [a] x)
        // result: (VRNDSCALEPD128 [a+3] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54875,9 +54689,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledFloat64x4(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (TruncWithPrecisionFloat64x4 [a] x)
+       // match: (TruncScaledFloat64x4 [a] x)
        // result: (VRNDSCALEPD256 [a+3] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54888,9 +54702,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledFloat64x8(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (TruncWithPrecisionFloat64x8 [a] x)
+       // match: (TruncScaledFloat64x8 [a] x)
        // result: (VRNDSCALEPD512 [a+3] x)
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54901,11 +54715,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x16(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledMaskedFloat32x16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (TruncWithPrecisionMaskedFloat32x16 [a] x mask)
+       // match: (TruncScaledMaskedFloat32x16 [a] x mask)
        // result: (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54919,11 +54733,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x16(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x4(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledMaskedFloat32x4(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (TruncWithPrecisionMaskedFloat32x4 [a] x mask)
+       // match: (TruncScaledMaskedFloat32x4 [a] x mask)
        // result: (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54937,11 +54751,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x8(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledMaskedFloat32x8(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (TruncWithPrecisionMaskedFloat32x8 [a] x mask)
+       // match: (TruncScaledMaskedFloat32x8 [a] x mask)
        // result: (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54955,11 +54769,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x8(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x2(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledMaskedFloat64x2(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (TruncWithPrecisionMaskedFloat64x2 [a] x mask)
+       // match: (TruncScaledMaskedFloat64x2 [a] x mask)
        // result: (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54973,11 +54787,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x2(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x4(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledMaskedFloat64x4(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (TruncWithPrecisionMaskedFloat64x4 [a] x mask)
+       // match: (TruncScaledMaskedFloat64x4 [a] x mask)
        // result: (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -54991,11 +54805,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x4(v *Value) bool {
                return true
        }
 }
-func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x8(v *Value) bool {
+func rewriteValueAMD64_OpTruncScaledMaskedFloat64x8(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
        b := v.Block
-       // match: (TruncWithPrecisionMaskedFloat64x8 [a] x mask)
+       // match: (TruncScaledMaskedFloat64x8 [a] x mask)
        // result: (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
        for {
                a := auxIntToInt8(v.AuxInt)
@@ -55009,6 +54823,192 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x8(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpTruncScaledResidueFloat32x16(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (TruncScaledResidueFloat32x16 [a] x)
+       // result: (VREDUCEPS512 [a+3] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS512)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueFloat32x4(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (TruncScaledResidueFloat32x4 [a] x)
+       // result: (VREDUCEPS128 [a+3] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS128)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueFloat32x8(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (TruncScaledResidueFloat32x8 [a] x)
+       // result: (VREDUCEPS256 [a+3] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPS256)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueFloat64x2(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (TruncScaledResidueFloat64x2 [a] x)
+       // result: (VREDUCEPD128 [a+3] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD128)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueFloat64x4(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (TruncScaledResidueFloat64x4 [a] x)
+       // result: (VREDUCEPD256 [a+3] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD256)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueFloat64x8(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (TruncScaledResidueFloat64x8 [a] x)
+       // result: (VREDUCEPD512 [a+3] x)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               v.reset(OpAMD64VREDUCEPD512)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (TruncScaledResidueMaskedFloat32x16 [a] x mask)
+       // result: (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked512)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (TruncScaledResidueMaskedFloat32x4 [a] x mask)
+       // result: (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked128)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (TruncScaledResidueMaskedFloat32x8 [a] x mask)
+       // result: (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPSMasked256)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x2(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (TruncScaledResidueMaskedFloat64x2 [a] x mask)
+       // result: (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked128)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (TruncScaledResidueMaskedFloat64x4 [a] x mask)
+       // result: (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked256)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (TruncScaledResidueMaskedFloat64x8 [a] x mask)
+       // result: (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VREDUCEPDMasked512)
+               v.AuxInt = int8ToAuxInt(a + 3)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
 func rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16(v *Value) bool {
        v_3 := v.Args[3]
        v_2 := v.Args[2]
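
For orientation, a minimal sketch of what the renames look like from the caller's side of the experimental simd package. This is an illustration only, not part of the CL: the "simd" import path, the Mask16x8 type name, and the build gating are assumptions, while the method names themselves are exactly the ones registered in the intrinsics table below.

    package demo

    import "simd" // experimental vector package under src/simd (assumed import path)

    // renamedOps exercises a few of the renamed methods; the old names are
    // noted in the comments. The vectors a, b and the mask m are assumed to
    // come from elsewhere; loads and stores are omitted to keep this small.
    func renamedOps(a, b simd.Int16x8, m simd.Mask16x8) simd.Int16x8 {
            s := a.AddSaturated(b) // was SaturatedAdd: saturating lane-wise add
            p := a.AddPairs(b)     // was PairwiseAdd: horizontal pairwise add
            q := s.Mul(p)          // was MulLow: low 16 bits of each product
            // Masked variants keep the trailing Masked after the new base name.
            return q.AddSaturatedMasked(a, m) // was SaturatedAddMasked
    }

The new scheme puts the base operation first and its modifiers after it (AddSaturated, AddPairs, CeilScaled, TruncScaledResidue, and a trailing Masked), presumably so related methods sort together in the package documentation.
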
index 7a7367ee1e7503cd7c97623984c238f157fb89ba..511974ffa1bf34995ed5a2a16636842da8ff35ab 100644 (file)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -101,6 +101,44 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint64x2.AddMasked", opLen3(ssa.OpAddMaskedUint64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint64x4.AddMasked", opLen3(ssa.OpAddMaskedUint64x4, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint64x8.AddMasked", opLen3(ssa.OpAddMaskedUint64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x16.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x32.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x64.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x8.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x16.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x32.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x64.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x8.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x32, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Float32x4.AddSub", opLen2(ssa.OpAddSubFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.AddSub", opLen2(ssa.OpAddSubFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x2.AddSub", opLen2(ssa.OpAddSubFloat64x2, types.TypeVec128), sys.AMD64)
@@ -217,18 +255,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float64x4.Ceil", opLen1(ssa.OpCeilFloat64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float32x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
        addF(simdPackage, "Float32x4.Compress", opLen2(ssa.OpCompressFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.Compress", opLen2(ssa.OpCompressFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float32x16.Compress", opLen2(ssa.OpCompressFloat32x16, types.TypeVec512), sys.AMD64)
@@ -271,54 +321,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Float32x4.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float32x16.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x16, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Float32x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
        addF(simdPackage, "Float32x4.Div", opLen2(ssa.OpDivFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.Div", opLen2(ssa.OpDivFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float32x16.Div", opLen2(ssa.OpDivFloat32x16, types.TypeVec512), sys.AMD64)
@@ -398,18 +400,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Float32x8.Floor", opLen1(ssa.OpFloorFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x2.Floor", opLen1(ssa.OpFloorFloat64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float64x4.Floor", opLen1(ssa.OpFloorFloat64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float32x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
        addF(simdPackage, "Float32x4.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float32x16.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x16, types.TypeVec512), sys.AMD64)
@@ -860,18 +874,15 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Float64x2.Mul", opLen2(ssa.OpMulFloat64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float64x4.Mul", opLen2(ssa.OpMulFloat64x4, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x8.Mul", opLen2(ssa.OpMulFloat64x8, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Float32x4.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float32x8.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float32x16.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x16, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Float64x2.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x2, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float64x4.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float64x8.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x8, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Float32x4.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float32x8.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float32x16.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x16, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Float64x2.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x2, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float64x4.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float64x8.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.Mul", opLen2(ssa.OpMulInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.Mul", opLen2(ssa.OpMulInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.Mul", opLen2(ssa.OpMulInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x4.Mul", opLen2(ssa.OpMulInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x8.Mul", opLen2(ssa.OpMulInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x16.Mul", opLen2(ssa.OpMulInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x2.Mul", opLen2(ssa.OpMulInt64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int64x4.Mul", opLen2(ssa.OpMulInt64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x8.Mul", opLen2(ssa.OpMulInt64x8, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Int32x4.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int32x8.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int64x2.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt64x2, types.TypeVec128), sys.AMD64)
@@ -900,30 +911,21 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint16x8.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x8, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint16x16.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x16, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint16x32.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int16x8.MulLow", opLen2(ssa.OpMulLowInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.MulLow", opLen2(ssa.OpMulLowInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x32.MulLow", opLen2(ssa.OpMulLowInt16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int32x4.MulLow", opLen2(ssa.OpMulLowInt32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int32x8.MulLow", opLen2(ssa.OpMulLowInt32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int32x16.MulLow", opLen2(ssa.OpMulLowInt32x16, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int64x2.MulLow", opLen2(ssa.OpMulLowInt64x2, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int64x4.MulLow", opLen2(ssa.OpMulLowInt64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int64x8.MulLow", opLen2(ssa.OpMulLowInt64x8, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int16x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x32.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int32x4.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int32x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int32x16.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x16, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int64x2.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x2, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int64x4.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int64x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x8, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Float32x4.MulMasked", opLen3(ssa.OpMulMaskedFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.MulMasked", opLen3(ssa.OpMulMaskedFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float32x16.MulMasked", opLen3(ssa.OpMulMaskedFloat32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Float64x2.MulMasked", opLen3(ssa.OpMulMaskedFloat64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float64x4.MulMasked", opLen3(ssa.OpMulMaskedFloat64x4, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x8.MulMasked", opLen3(ssa.OpMulMaskedFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.MulMasked", opLen3(ssa.OpMulMaskedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.MulMasked", opLen3(ssa.OpMulMaskedInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.MulMasked", opLen3(ssa.OpMulMaskedInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x4.MulMasked", opLen3(ssa.OpMulMaskedInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x8.MulMasked", opLen3(ssa.OpMulMaskedInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x16.MulMasked", opLen3(ssa.OpMulMaskedInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x2.MulMasked", opLen3(ssa.OpMulMaskedInt64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int64x4.MulMasked", opLen3(ssa.OpMulMaskedInt64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x8.MulMasked", opLen3(ssa.OpMulMaskedInt64x8, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Float32x4.NotEqual", opLen2(ssa.OpNotEqualFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.NotEqual", opLen2(ssa.OpNotEqualFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float32x16.NotEqual", opLen2(ssa.OpNotEqualFloat32x16, types.TypeVec512), sys.AMD64)
@@ -1026,30 +1028,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Int16x8.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x8, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int16x16.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x16, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int16x32.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Float32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float64x2.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat64x2, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float64x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint16x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint16x16.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float64x2.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat64x2, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float64x4.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x8.PairwiseSub", opLen2(ssa.OpPairwiseSubInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubInt32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubInt32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint16x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
@@ -1306,76 +1284,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Float32x8.Round", opLen1(ssa.OpRoundFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x2.Round", opLen1(ssa.OpRoundFloat64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float64x4.Round", opLen1(ssa.OpRoundFloat64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float32x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Int8x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int8x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int8x64.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint8x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint8x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint8x64.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
        addF(simdPackage, "Int32x4.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int32x8.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int32x16.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Int32x4.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int32x8.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int32x16.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int16x8.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x8.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int8x16.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int8x32.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int8x64.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int16x8.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x32.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint8x16.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint8x32.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint8x64.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint16x8.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint16x16.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint16x32.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int8x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int8x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int8x64.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int16x8.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint8x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint8x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint8x64.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint16x8.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint16x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint16x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x32, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Uint8x16.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x16, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint8x32.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x32, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint8x64.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x64, types.TypeVec512), sys.AMD64)
@@ -1388,6 +1326,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Int8x16.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int8x32.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int8x64.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.Scale", opLen2(ssa.OpScaleFloat32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x8.Scale", opLen2(ssa.OpScaleFloat32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x16.Scale", opLen2(ssa.OpScaleFloat32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x16.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x2.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x8, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Float32x8.Set128", opLen2Imm8(ssa.OpSet128Float32x8, types.TypeVec256, 0), sys.AMD64)
        addF(simdPackage, "Float64x4.Set128", opLen2Imm8(ssa.OpSet128Float64x4, types.TypeVec256, 0), sys.AMD64)
        addF(simdPackage, "Int8x32.Set128", opLen2Imm8(ssa.OpSet128Int8x32, types.TypeVec256, 0), sys.AMD64)
@@ -1772,22 +1722,72 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint64x2.SubMasked", opLen3(ssa.OpSubMaskedUint64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint64x4.SubMasked", opLen3(ssa.OpSubMaskedUint64x4, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint64x8.SubMasked", opLen3(ssa.OpSubMaskedUint64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x64.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x8.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x64.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x8.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x32, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Float32x4.Trunc", opLen1(ssa.OpTruncFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.Trunc", opLen1(ssa.OpTruncFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x2.Trunc", opLen1(ssa.OpTruncFloat64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float64x4.Trunc", opLen1(ssa.OpTruncFloat64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float32x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float32x4.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float32x8.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float32x16.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-       addF(simdPackage, "Float64x2.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-       addF(simdPackage, "Float64x4.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-       addF(simdPackage, "Float64x8.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float32x4.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float32x8.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float32x16.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+       addF(simdPackage, "Float64x2.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+       addF(simdPackage, "Float64x4.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+       addF(simdPackage, "Float64x8.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
        addF(simdPackage, "Int8x16.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int8x32.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int8x64.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
index b7daf736f4e26488fa4d96db886e5b1936d973d3..c82bc070e12322d63a33f0f4ff2d64c419de5758 100644 (file)
@@ -309,42 +309,42 @@ func TestMul(t *testing.T) {
        testFloat64x2Binary(t, simd.Float64x2.Mul, mulSlice[float64])
        testFloat64x4Binary(t, simd.Float64x4.Mul, mulSlice[float64])
 
-       testInt16x16Binary(t, simd.Int16x16.MulLow, mulSlice[int16])
-       testInt16x8Binary(t, simd.Int16x8.MulLow, mulSlice[int16])
-       testInt32x4Binary(t, simd.Int32x4.MulLow, mulSlice[int32])
-       testInt32x8Binary(t, simd.Int32x8.MulLow, mulSlice[int32])
+       testInt16x16Binary(t, simd.Int16x16.Mul, mulSlice[int16])
+       testInt16x8Binary(t, simd.Int16x8.Mul, mulSlice[int16])
+       testInt32x4Binary(t, simd.Int32x4.Mul, mulSlice[int32])
+       testInt32x8Binary(t, simd.Int32x8.Mul, mulSlice[int32])
 
-       // testInt8x16Binary(t, simd.Int8x16.MulLow, mulSlice[int8]) // nope
-       // testInt8x32Binary(t, simd.Int8x32.MulLow, mulSlice[int8])
+       // testInt8x16Binary(t, simd.Int8x16.Mul, mulSlice[int8]) // nope
+       // testInt8x32Binary(t, simd.Int8x32.Mul, mulSlice[int8])
 
-       // TODO we should be able to do these, there's no difference between signed/unsigned mulLow
-       // testUint16x16Binary(t, simd.Uint16x16.MulLow, mulSlice[uint16])
-       // testUint16x8Binary(t, simd.Uint16x8.MulLow, mulSlice[uint16])
-       // testUint32x4Binary(t, simd.Uint32x4.MulLow, mulSlice[uint32])
-       // testUint32x8Binary(t, simd.Uint32x8.MulLow, mulSlice[uint32])
-       // testUint64x2Binary(t, simd.Uint64x2.MulLow, mulSlice[uint64])
-       // testUint64x4Binary(t, simd.Uint64x4.MulLow, mulSlice[uint64])
+       // TODO: we should be able to do these; there's no difference between signed/unsigned Mul.
+       // testUint16x16Binary(t, simd.Uint16x16.Mul, mulSlice[uint16])
+       // testUint16x8Binary(t, simd.Uint16x8.Mul, mulSlice[uint16])
+       // testUint32x4Binary(t, simd.Uint32x4.Mul, mulSlice[uint32])
+       // testUint32x8Binary(t, simd.Uint32x8.Mul, mulSlice[uint32])
+       // testUint64x2Binary(t, simd.Uint64x2.Mul, mulSlice[uint64])
+       // testUint64x4Binary(t, simd.Uint64x4.Mul, mulSlice[uint64])
 
-       // testUint8x16Binary(t, simd.Uint8x16.MulLow, mulSlice[uint8]) // nope
-       // testUint8x32Binary(t, simd.Uint8x32.MulLow, mulSlice[uint8])
+       // testUint8x16Binary(t, simd.Uint8x16.Mul, mulSlice[uint8]) // nope
+       // testUint8x32Binary(t, simd.Uint8x32.Mul, mulSlice[uint8])
 
        if simd.HasAVX512() {
-               testInt64x2Binary(t, simd.Int64x2.MulLow, mulSlice[int64]) // avx512 only
-               testInt64x4Binary(t, simd.Int64x4.MulLow, mulSlice[int64])
+               testInt64x2Binary(t, simd.Int64x2.Mul, mulSlice[int64]) // avx512 only
+               testInt64x4Binary(t, simd.Int64x4.Mul, mulSlice[int64])
 
                testFloat32x16Binary(t, simd.Float32x16.Mul, mulSlice[float32])
                testFloat64x8Binary(t, simd.Float64x8.Mul, mulSlice[float64])
 
-               // testInt8x64Binary(t, simd.Int8x64.MulLow, mulSlice[int8]) // nope
-               testInt16x32Binary(t, simd.Int16x32.MulLow, mulSlice[int16])
-               testInt32x16Binary(t, simd.Int32x16.MulLow, mulSlice[int32])
-               testInt64x8Binary(t, simd.Int64x8.MulLow, mulSlice[int64])
-               // testUint8x64Binary(t, simd.Uint8x64.MulLow, mulSlice[uint8]) // nope
+               // testInt8x64Binary(t, simd.Int8x64.Mul, mulSlice[int8]) // nope
+               testInt16x32Binary(t, simd.Int16x32.Mul, mulSlice[int16])
+               testInt32x16Binary(t, simd.Int32x16.Mul, mulSlice[int32])
+               testInt64x8Binary(t, simd.Int64x8.Mul, mulSlice[int64])
+               // testUint8x64Binary(t, simd.Uint8x64.Mul, mulSlice[uint8]) // nope
 
                // TODO signed should do the job
-               // testUint16x32Binary(t, simd.Uint16x32.MulLow, mulSlice[uint16])
-               // testUint32x16Binary(t, simd.Uint32x16.MulLow, mulSlice[uint32])
-               // testUint64x8Binary(t, simd.Uint64x8.MulLow, mulSlice[uint64])
+               // testUint16x32Binary(t, simd.Uint16x32.Mul, mulSlice[uint16])
+               // testUint32x16Binary(t, simd.Uint32x16.Mul, mulSlice[uint32])
+               // testUint64x8Binary(t, simd.Uint64x8.Mul, mulSlice[uint64])
        }
 }
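For readers tracking the rename at the call site, a minimal user-side sketch of the new spelling (assuming slice helpers named LoadInt32x4Slice and StoreSlice, which this hunk does not show):

    package main

    import (
            "fmt"
            "simd"
    )

    func main() {
            a := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
            b := simd.LoadInt32x4Slice([]int32{10, 20, 30, 40})
            v := a.Mul(b) // formerly MulLow: elementwise multiply keeping the low half
            out := make([]int32, 4)
            v.StoreSlice(out)
            fmt.Println(out) // [10 40 90 160]
    }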
 
index 5776350fe9f136bda4715facaa84c3ec8caeefa1..dc42e73a53a2dc6edb83f4f358b278cc619ebd8c 100644 (file)
@@ -556,6 +556,242 @@ func (x Uint64x4) AddMasked(y Uint64x4, mask Mask64x4) Uint64x4
 // Asm: VPADDQ, CPU Feature: AVX512F
 func (x Uint64x8) AddMasked(y Uint64x8, mask Mask64x8) Uint64x8
 
+/* AddPairs */
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VHADDPS, CPU Feature: AVX
+func (x Float32x4) AddPairs(y Float32x4) Float32x4
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VHADDPS, CPU Feature: AVX
+func (x Float32x8) AddPairs(y Float32x8) Float32x8
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VHADDPD, CPU Feature: AVX
+func (x Float64x2) AddPairs(y Float64x2) Float64x2
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VHADDPD, CPU Feature: AVX
+func (x Float64x4) AddPairs(y Float64x4) Float64x4
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Int16x8) AddPairs(y Int16x8) Int16x8
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX2
+func (x Int16x16) AddPairs(y Int16x16) Int16x16
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX
+func (x Int32x4) AddPairs(y Int32x4) Int32x4
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX2
+func (x Int32x8) AddPairs(y Int32x8) Int32x8
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX2
+func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX
+func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX2
+func (x Uint32x8) AddPairs(y Uint32x8) Uint32x8
+
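A scalar sketch of the lane order documented above, for the 4-lane case (the helper is hypothetical and just restates the comment):

    // addPairs4 mirrors the documented AddPairs result order:
    // [y0+y1, y2+y3, x0+x1, x2+x3] for 4-element inputs.
    func addPairs4(x, y [4]float32) [4]float32 {
            return [4]float32{y[0] + y[1], y[2] + y[3], x[0] + x[1], x[2] + x[3]}
    }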
+/* AddPairsSaturated */
+
+// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDSW, CPU Feature: AVX
+func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8
+
+// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDSW, CPU Feature: AVX2
+func (x Int16x16) AddPairsSaturated(y Int16x16) Int16x16
+
+/* AddSaturated */
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSB, CPU Feature: AVX
+func (x Int8x16) AddSaturated(y Int8x16) Int8x16
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSB, CPU Feature: AVX2
+func (x Int8x32) AddSaturated(y Int8x32) Int8x32
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSB, CPU Feature: AVX512BW
+func (x Int8x64) AddSaturated(y Int8x64) Int8x64
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSW, CPU Feature: AVX
+func (x Int16x8) AddSaturated(y Int16x8) Int16x8
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSW, CPU Feature: AVX2
+func (x Int16x16) AddSaturated(y Int16x16) Int16x16
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSW, CPU Feature: AVX512BW
+func (x Int16x32) AddSaturated(y Int16x32) Int16x32
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSB, CPU Feature: AVX
+func (x Uint8x16) AddSaturated(y Uint8x16) Uint8x16
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSB, CPU Feature: AVX2
+func (x Uint8x32) AddSaturated(y Uint8x32) Uint8x32
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSB, CPU Feature: AVX512BW
+func (x Uint8x64) AddSaturated(y Uint8x64) Uint8x64
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSW, CPU Feature: AVX
+func (x Uint16x8) AddSaturated(y Uint16x8) Uint16x8
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSW, CPU Feature: AVX2
+func (x Uint16x16) AddSaturated(y Uint16x16) Uint16x16
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSW, CPU Feature: AVX512BW
+func (x Uint16x32) AddSaturated(y Uint16x32) Uint16x32
+
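Saturation clamps at the type's limits instead of wrapping around; a scalar int8 sketch of the per-element behavior (illustrative helper, not package API):

    // addSatInt8 widens, adds, and clamps the sum to the int8 range.
    func addSatInt8(a, b int8) int8 {
            s := int16(a) + int16(b) // widen so the true sum is representable
            if s > 127 {
                    return 127
            }
            if s < -128 {
                    return -128
            }
            return int8(s)
    }

For example, addSatInt8(120, 20) returns 127, where plain int8 addition would wrap to -116; the unsigned variants clamp at their own type limits instead.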
+/* AddSaturatedMasked */
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSB, CPU Feature: AVX512BW
+func (x Int8x16) AddSaturatedMasked(y Int8x16, mask Mask8x16) Int8x16
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSB, CPU Feature: AVX512BW
+func (x Int8x32) AddSaturatedMasked(y Int8x32, mask Mask8x32) Int8x32
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSB, CPU Feature: AVX512BW
+func (x Int8x64) AddSaturatedMasked(y Int8x64, mask Mask8x64) Int8x64
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSW, CPU Feature: AVX512BW
+func (x Int16x8) AddSaturatedMasked(y Int16x8, mask Mask16x8) Int16x8
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSW, CPU Feature: AVX512BW
+func (x Int16x16) AddSaturatedMasked(y Int16x16, mask Mask16x16) Int16x16
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSW, CPU Feature: AVX512BW
+func (x Int16x32) AddSaturatedMasked(y Int16x32, mask Mask16x32) Int16x32
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSB, CPU Feature: AVX512BW
+func (x Uint8x16) AddSaturatedMasked(y Uint8x16, mask Mask8x16) Uint8x16
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSB, CPU Feature: AVX512BW
+func (x Uint8x32) AddSaturatedMasked(y Uint8x32, mask Mask8x32) Uint8x32
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSB, CPU Feature: AVX512BW
+func (x Uint8x64) AddSaturatedMasked(y Uint8x64, mask Mask8x64) Uint8x64
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSW, CPU Feature: AVX512BW
+func (x Uint16x8) AddSaturatedMasked(y Uint16x8, mask Mask16x8) Uint16x8
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSW, CPU Feature: AVX512BW
+func (x Uint16x16) AddSaturatedMasked(y Uint16x16, mask Mask16x16) Uint16x16
+
+// AddSaturatedMasked adds corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPADDSW, CPU Feature: AVX512BW
+func (x Uint16x32) AddSaturatedMasked(y Uint16x32, mask Mask16x32) Uint16x32
+
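The Masked forms take one extra mask operand; a scalar sketch of the selection, assuming unselected lanes produce zero (the AVX-512 zeroing-mask convention, which is an assumption here since the comments above do not pin it down):

    // addSatMasked16 applies addSatInt8 (sketched earlier) only where the
    // mask lane is set; other lanes stay zero under the assumed convention.
    func addSatMasked16(x, y [16]int8, mask [16]bool) [16]int8 {
            var r [16]int8
            for i := range r {
                    if mask[i] {
                            r[i] = addSatInt8(x[i], y[i])
                    }
            }
            return r
    }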
 /* AddSub */
 
 // AddSub subtracts even elements and adds odd elements of two vectors.
@@ -1244,105 +1480,205 @@ func (x Float64x2) Ceil() Float64x2
 // Asm: VROUNDPD, CPU Feature: AVX
 func (x Float64x4) Ceil() Float64x4
 
-/* CeilWithPrecision */
+/* CeilScaled */
 
-// CeilWithPrecision rounds elements up with specified precision.
+// CeilScaled rounds elements up with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x4) CeilWithPrecision(prec uint8) Float32x4
+func (x Float32x4) CeilScaled(prec uint8) Float32x4
 
-// CeilWithPrecision rounds elements up with specified precision.
+// CeilScaled rounds elements up with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x8) CeilWithPrecision(prec uint8) Float32x8
+func (x Float32x8) CeilScaled(prec uint8) Float32x8
 
-// CeilWithPrecision rounds elements up with specified precision.
+// CeilScaled rounds elements up with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x16) CeilWithPrecision(prec uint8) Float32x16
+func (x Float32x16) CeilScaled(prec uint8) Float32x16
 
-// CeilWithPrecision rounds elements up with specified precision.
+// CeilScaled rounds elements up with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x2) CeilWithPrecision(prec uint8) Float64x2
+func (x Float64x2) CeilScaled(prec uint8) Float64x2
 
-// CeilWithPrecision rounds elements up with specified precision.
+// CeilScaled rounds elements up with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x4) CeilWithPrecision(prec uint8) Float64x4
+func (x Float64x4) CeilScaled(prec uint8) Float64x4
 
-// CeilWithPrecision rounds elements up with specified precision.
+// CeilScaled rounds elements up with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x8) CeilWithPrecision(prec uint8) Float64x8
+func (x Float64x8) CeilScaled(prec uint8) Float64x8
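The new "Scaled" spelling matches what the VRNDSCALE immediate does: round while keeping prec fraction bits, that is, compute ceil(x * 2^prec) / 2^prec per element. A scalar sketch of that arithmetic (hypothetical helper):

    import "math"

    // ceilScaled rounds x up to a multiple of 2^-prec.
    func ceilScaled(x float64, prec uint8) float64 {
            scale := math.Ldexp(1, int(prec)) // 2^prec
            return math.Ceil(x*scale) / scale
    }

For example, ceilScaled(1.26, 1) is 1.5: ceil(1.26*2)/2 = ceil(2.52)/2 = 3/2.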
 
-/* CeilWithPrecisionMasked */
+/* CeilScaledMasked */
 
-// CeilWithPrecisionMasked rounds elements up with specified precision.
+// CeilScaledMasked rounds elements up with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x4) CeilWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4
+func (x Float32x4) CeilScaledMasked(prec uint8, mask Mask32x4) Float32x4
 
-// CeilWithPrecisionMasked rounds elements up with specified precision.
+// CeilScaledMasked rounds elements up with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x8) CeilWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8
+func (x Float32x8) CeilScaledMasked(prec uint8, mask Mask32x8) Float32x8
 
-// CeilWithPrecisionMasked rounds elements up with specified precision.
+// CeilScaledMasked rounds elements up with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x16) CeilWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16
+func (x Float32x16) CeilScaledMasked(prec uint8, mask Mask32x16) Float32x16
 
-// CeilWithPrecisionMasked rounds elements up with specified precision.
+// CeilScaledMasked rounds elements up with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x2) CeilWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2
+func (x Float64x2) CeilScaledMasked(prec uint8, mask Mask64x2) Float64x2
 
-// CeilWithPrecisionMasked rounds elements up with specified precision.
+// CeilScaledMasked rounds elements up with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x4) CeilWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4
+func (x Float64x4) CeilScaledMasked(prec uint8, mask Mask64x4) Float64x4
 
-// CeilWithPrecisionMasked rounds elements up with specified precision.
+// CeilScaledMasked rounds elements up with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x8) CeilWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8
+func (x Float64x8) CeilScaledMasked(prec uint8, mask Mask64x8) Float64x8
+
+/* CeilScaledResidue */
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x4) CeilScaledResidue(prec uint8) Float32x4
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x8) CeilScaledResidue(prec uint8) Float32x8
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x16) CeilScaledResidue(prec uint8) Float32x16
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x2) CeilScaledResidue(prec uint8) Float64x2
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x4) CeilScaledResidue(prec uint8) Float64x4
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x8) CeilScaledResidue(prec uint8) Float64x8
+
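The Residue forms (VREDUCE) return what the scaled rounding discarded, so per lane x == CeilScaled(x, prec) + CeilScaledResidue(x, prec). Continuing the scalar sketch from the CeilScaled section:

    // ceilScaledResidue is the part of x that ceilScaled rounded away;
    // for a ceiling it is always <= 0.
    func ceilScaledResidue(x float64, prec uint8) float64 {
            return x - ceilScaled(x, prec)
    }

E.g. ceilScaledResidue(1.26, 1) is -0.24, up to floating-point rounding.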
+/* CeilScaledResidueMasked */
+
+// CeilScaledResidueMasked computes the difference after ceiling with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x4) CeilScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4
+
+// CeilScaledResidueMasked computes the difference after ceiling with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x8) CeilScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8
+
+// CeilScaledResidueMasked computes the difference after ceiling with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x16) CeilScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16
+
+// CeilScaledResidueMasked computes the difference after ceiling with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x2) CeilScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2
+
+// CeilScaledResidueMasked computes the difference after ceiling with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x4) CeilScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4
+
+// CeilScaledResidueMasked computes the difference after ceiling with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x8) CeilScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8
 
 /* Compress */
 
@@ -1606,429 +1942,29 @@ func (x Float32x8) ConvertToUint32Masked(mask Mask32x8) Uint32x8
 // Asm: VCVTPS2UDQ, CPU Feature: AVX512F
 func (x Float32x16) ConvertToUint32Masked(mask Mask32x16) Uint32x16
 
-/* DiffWithCeilWithPrecision */
+/* Div */
 
-// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
+// Div divides elements of two vectors.
 //
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x4) DiffWithCeilWithPrecision(prec uint8) Float32x4
+// Asm: VDIVPS, CPU Feature: AVX
+func (x Float32x4) Div(y Float32x4) Float32x4
 
-// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
+// Div divides elements of two vectors.
 //
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x8) DiffWithCeilWithPrecision(prec uint8) Float32x8
+// Asm: VDIVPS, CPU Feature: AVX
+func (x Float32x8) Div(y Float32x8) Float32x8
 
-// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
+// Div divides elements of two vectors.
 //
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x16) DiffWithCeilWithPrecision(prec uint8) Float32x16
+// Asm: VDIVPS, CPU Feature: AVX512F
+func (x Float32x16) Div(y Float32x16) Float32x16
 
-// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
+// Div divides elements of two vectors.
 //
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x2) DiffWithCeilWithPrecision(prec uint8) Float64x2
+// Asm: VDIVPD, CPU Feature: AVX
+func (x Float64x2) Div(y Float64x2) Float64x2
 
-// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x4) DiffWithCeilWithPrecision(prec uint8) Float64x4
-
-// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x8) DiffWithCeilWithPrecision(prec uint8) Float64x8
-
-/* DiffWithCeilWithPrecisionMasked */
-
-// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x4) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4
-
-// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x8) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8
-
-// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x16) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16
-
-// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x2) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2
-
-// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x4) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4
-
-// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x8) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8
-
-/* DiffWithFloorWithPrecision */
-
-// DiffWithFloorWithPrecision computes the difference after flooring with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x4) DiffWithFloorWithPrecision(prec uint8) Float32x4
-
-// DiffWithFloorWithPrecision computes the difference after flooring with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x8) DiffWithFloorWithPrecision(prec uint8) Float32x8
-
-// DiffWithFloorWithPrecision computes the difference after flooring with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x16) DiffWithFloorWithPrecision(prec uint8) Float32x16
-
-// DiffWithFloorWithPrecision computes the difference after flooring with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x2) DiffWithFloorWithPrecision(prec uint8) Float64x2
-
-// DiffWithFloorWithPrecision computes the difference after flooring with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x4) DiffWithFloorWithPrecision(prec uint8) Float64x4
-
-// DiffWithFloorWithPrecision computes the difference after flooring with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x8) DiffWithFloorWithPrecision(prec uint8) Float64x8
-
-/* DiffWithFloorWithPrecisionMasked */
-
-// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x4) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4
-
-// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x8) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8
-
-// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x16) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16
-
-// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x2) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2
-
-// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x4) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4
-
-// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x8) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8
-
-/* DiffWithRoundWithPrecision */
-
-// DiffWithRoundWithPrecision computes the difference after rounding with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x4) DiffWithRoundWithPrecision(prec uint8) Float32x4
-
-// DiffWithRoundWithPrecision computes the difference after rounding with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x8) DiffWithRoundWithPrecision(prec uint8) Float32x8
-
-// DiffWithRoundWithPrecision computes the difference after rounding with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x16) DiffWithRoundWithPrecision(prec uint8) Float32x16
-
-// DiffWithRoundWithPrecision computes the difference after rounding with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x2) DiffWithRoundWithPrecision(prec uint8) Float64x2
-
-// DiffWithRoundWithPrecision computes the difference after rounding with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x4) DiffWithRoundWithPrecision(prec uint8) Float64x4
-
-// DiffWithRoundWithPrecision computes the difference after rounding with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x8) DiffWithRoundWithPrecision(prec uint8) Float64x8
-
-/* DiffWithRoundWithPrecisionMasked */
-
-// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x4) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4
-
-// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x8) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8
-
-// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x16) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16
-
-// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x2) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2
-
-// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x4) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4
-
-// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x8) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8
-
-/* DiffWithTruncWithPrecision */
-
-// DiffWithTruncWithPrecision computes the difference after truncating with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x4) DiffWithTruncWithPrecision(prec uint8) Float32x4
-
-// DiffWithTruncWithPrecision computes the difference after truncating with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x8) DiffWithTruncWithPrecision(prec uint8) Float32x8
-
-// DiffWithTruncWithPrecision computes the difference after truncating with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x16) DiffWithTruncWithPrecision(prec uint8) Float32x16
-
-// DiffWithTruncWithPrecision computes the difference after truncating with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x2) DiffWithTruncWithPrecision(prec uint8) Float64x2
-
-// DiffWithTruncWithPrecision computes the difference after truncating with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x4) DiffWithTruncWithPrecision(prec uint8) Float64x4
-
-// DiffWithTruncWithPrecision computes the difference after truncating with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x8) DiffWithTruncWithPrecision(prec uint8) Float64x8
-
-/* DiffWithTruncWithPrecisionMasked */
-
-// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x4) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4
-
-// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x8) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8
-
-// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512DQ
-func (x Float32x16) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16
-
-// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x2) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2
-
-// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x4) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4
-
-// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512DQ
-func (x Float64x8) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8
-
-/* Div */
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPS, CPU Feature: AVX
-func (x Float32x4) Div(y Float32x4) Float32x4
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPS, CPU Feature: AVX
-func (x Float32x8) Div(y Float32x8) Float32x8
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPS, CPU Feature: AVX512F
-func (x Float32x16) Div(y Float32x16) Float32x16
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPD, CPU Feature: AVX
-func (x Float64x2) Div(y Float64x2) Float64x2
-
-// Div divides elements of two vectors.
+// Div divides elements of two vectors.
 //
 // Asm: VDIVPD, CPU Feature: AVX
 func (x Float64x4) Div(y Float64x4) Float64x4
@@ -2485,105 +2421,205 @@ func (x Float64x2) Floor() Float64x2
 // Asm: VROUNDPD, CPU Feature: AVX
 func (x Float64x4) Floor() Float64x4
 
-/* FloorWithPrecision */
+/* FloorScaled */
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x4) FloorScaled(prec uint8) Float32x4
 
-// FloorWithPrecision rounds elements down with specified precision.
+// FloorScaled rounds elements down with specified precision.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x4) FloorWithPrecision(prec uint8) Float32x4
+func (x Float32x8) FloorScaled(prec uint8) Float32x8
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x16) FloorScaled(prec uint8) Float32x16
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x2) FloorScaled(prec uint8) Float64x2
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x4) FloorScaled(prec uint8) Float64x4
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x8) FloorScaled(prec uint8) Float64x8
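// Continuing the illustrative sketch above: one lane of FloorScaled, under
// the same assumption that prec counts preserved binary fraction bits
// (VRNDSCALE semantics). Hypothetical helper, not part of the package.
func floorScaledLane(x float64, prec uint8) float64 {
    scale := math.Ldexp(1, int(prec)) // 2^prec
    return math.Floor(x*scale) / scale // round down, keeping prec fraction bits
}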
+
+/* FloorScaledMasked */
+
+// FloorScaledMasked rounds elements down with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x4) FloorScaledMasked(prec uint8, mask Mask32x4) Float32x4
+
+// FloorScaledMasked rounds elements down with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x8) FloorScaledMasked(prec uint8, mask Mask32x8) Float32x8
+
+// FloorScaledMasked rounds elements down with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x16) FloorScaledMasked(prec uint8, mask Mask32x16) Float32x16
+
+// FloorScaledMasked rounds elements down with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x2) FloorScaledMasked(prec uint8, mask Mask64x2) Float64x2
+
+// FloorScaledMasked rounds elements down with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x4) FloorScaledMasked(prec uint8, mask Mask64x4) Float64x4
+
+// FloorScaledMasked rounds elements down with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x8) FloorScaledMasked(prec uint8, mask Mask64x8) Float64x8
+
+/* FloorScaledResidue */
+
+// FloorScaledResidue computes the difference after flooring with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x4) FloorScaledResidue(prec uint8) Float32x4
 
-// FloorWithPrecision rounds elements down with specified precision.
+// FloorScaledResidue computes the difference after flooring with specified precision.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x8) FloorWithPrecision(prec uint8) Float32x8
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x8) FloorScaledResidue(prec uint8) Float32x8
 
-// FloorWithPrecision rounds elements down with specified precision.
+// FloorScaledResidue computes the difference after flooring with specified precision.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x16) FloorWithPrecision(prec uint8) Float32x16
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x16) FloorScaledResidue(prec uint8) Float32x16
 
-// FloorWithPrecision rounds elements down with specified precision.
+// FloorScaledResidue computes the difference after flooring with specified precision.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x2) FloorWithPrecision(prec uint8) Float64x2
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x2) FloorScaledResidue(prec uint8) Float64x2
 
-// FloorWithPrecision rounds elements down with specified precision.
+// FloorScaledResidue computes the difference after flooring with specified precision.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x4) FloorWithPrecision(prec uint8) Float64x4
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x4) FloorScaledResidue(prec uint8) Float64x4
 
-// FloorWithPrecision rounds elements down with specified precision.
+// FloorScaledResidue computes the difference after flooring with specified precision.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x8) FloorWithPrecision(prec uint8) Float64x8
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x8) FloorScaledResidue(prec uint8) Float64x8
 
-/* FloorWithPrecisionMasked */
+/* FloorScaledResidueMasked */
 
-// FloorWithPrecisionMasked rounds elements down with specified precision.
+// FloorScaledResidueMasked computes the difference after flooring with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x4) FloorWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x4) FloorScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4
 
-// FloorWithPrecisionMasked rounds elements down with specified precision.
+// FloorScaledResidueMasked computes the difference after flooring with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x8) FloorWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x8) FloorScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8
 
-// FloorWithPrecisionMasked rounds elements down with specified precision.
+// FloorScaledResidueMasked computes the difference after flooring with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x16) FloorWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x16) FloorScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16
 
-// FloorWithPrecisionMasked rounds elements down with specified precision.
+// FloorScaledResidueMasked computes the difference after flooring with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x2) FloorWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x2) FloorScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2
 
-// FloorWithPrecisionMasked rounds elements down with specified precision.
+// FloorScaledResidueMasked computes the difference after flooring with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x4) FloorWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x4) FloorScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4
 
-// FloorWithPrecisionMasked rounds elements down with specified precision.
+// FloorScaledResidueMasked computes the difference after flooring with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant, non-constant value will trigger a runtime panic.
 //
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x8) FloorWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x8) FloorScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8
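// Sketch of how FloorScaled and FloorScaledResidue relate, per the doc
// comments above: the residue is what the precision-limited floor discards,
// so the two parts sum back to x up to floating-point rounding. This pairing
// is inferred from the VRNDSCALE/VREDUCE instructions, not stated by the API.
func floorScaledParts(x float64, prec uint8) (floored, residue float64) {
    scale := math.Ldexp(1, int(prec))
    floored = math.Floor(x*scale) / scale
    residue = x - floored // floored + residue ≈ x
    return
}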
 
 /* FusedMultiplyAdd */
 
@@ -5427,81 +5463,50 @@ func (x Float64x4) Mul(y Float64x4) Float64x4
 // Asm: VMULPD, CPU Feature: AVX512F
 func (x Float64x8) Mul(y Float64x8) Float64x8
 
-/* MulByPowOf2 */
-
-// MulByPowOf2 multiplies elements by a power of 2.
-//
-// Asm: VSCALEFPS, CPU Feature: AVX512F
-func (x Float32x4) MulByPowOf2(y Float32x4) Float32x4
-
-// MulByPowOf2 multiplies elements by a power of 2.
-//
-// Asm: VSCALEFPS, CPU Feature: AVX512F
-func (x Float32x8) MulByPowOf2(y Float32x8) Float32x8
-
-// MulByPowOf2 multiplies elements by a power of 2.
-//
-// Asm: VSCALEFPS, CPU Feature: AVX512F
-func (x Float32x16) MulByPowOf2(y Float32x16) Float32x16
-
-// MulByPowOf2 multiplies elements by a power of 2.
+// Mul multiplies corresponding elements of two vectors.
 //
-// Asm: VSCALEFPD, CPU Feature: AVX512F
-func (x Float64x2) MulByPowOf2(y Float64x2) Float64x2
+// Asm: VPMULLW, CPU Feature: AVX
+func (x Int16x8) Mul(y Int16x8) Int16x8
 
-// MulByPowOf2 multiplies elements by a power of 2.
+// Mul multiplies corresponding elements of two vectors.
 //
-// Asm: VSCALEFPD, CPU Feature: AVX512F
-func (x Float64x4) MulByPowOf2(y Float64x4) Float64x4
+// Asm: VPMULLW, CPU Feature: AVX2
+func (x Int16x16) Mul(y Int16x16) Int16x16
 
-// MulByPowOf2 multiplies elements by a power of 2.
+// Mul multiplies corresponding elements of two vectors.
 //
-// Asm: VSCALEFPD, CPU Feature: AVX512F
-func (x Float64x8) MulByPowOf2(y Float64x8) Float64x8
-
-/* MulByPowOf2Masked */
+// Asm: VPMULLW, CPU Feature: AVX512BW
+func (x Int16x32) Mul(y Int16x32) Int16x32
 
-// MulByPowOf2Masked multiplies elements by a power of 2.
-//
-// This operation is applied selectively under a write mask.
+// Mul multiplies corresponding elements of two vectors.
 //
-// Asm: VSCALEFPS, CPU Feature: AVX512F
-func (x Float32x4) MulByPowOf2Masked(y Float32x4, mask Mask32x4) Float32x4
+// Asm: VPMULLD, CPU Feature: AVX
+func (x Int32x4) Mul(y Int32x4) Int32x4
 
-// MulByPowOf2Masked multiplies elements by a power of 2.
-//
-// This operation is applied selectively under a write mask.
+// Mul multiplies corresponding elements of two vectors.
 //
-// Asm: VSCALEFPS, CPU Feature: AVX512F
-func (x Float32x8) MulByPowOf2Masked(y Float32x8, mask Mask32x8) Float32x8
+// Asm: VPMULLD, CPU Feature: AVX2
+func (x Int32x8) Mul(y Int32x8) Int32x8
 
-// MulByPowOf2Masked multiplies elements by a power of 2.
-//
-// This operation is applied selectively under a write mask.
+// Mul multiplies corresponding elements of two vectors.
 //
-// Asm: VSCALEFPS, CPU Feature: AVX512F
-func (x Float32x16) MulByPowOf2Masked(y Float32x16, mask Mask32x16) Float32x16
+// Asm: VPMULLD, CPU Feature: AVX512F
+func (x Int32x16) Mul(y Int32x16) Int32x16
 
-// MulByPowOf2Masked multiplies elements by a power of 2.
-//
-// This operation is applied selectively under a write mask.
+// Mul multiplies corresponding elements of two vectors.
 //
-// Asm: VSCALEFPD, CPU Feature: AVX512F
-func (x Float64x2) MulByPowOf2Masked(y Float64x2, mask Mask64x2) Float64x2
+// Asm: VPMULLQ, CPU Feature: AVX512DQ
+func (x Int64x2) Mul(y Int64x2) Int64x2
 
-// MulByPowOf2Masked multiplies elements by a power of 2.
-//
-// This operation is applied selectively under a write mask.
+// Mul multiplies corresponding elements of two vectors.
 //
-// Asm: VSCALEFPD, CPU Feature: AVX512F
-func (x Float64x4) MulByPowOf2Masked(y Float64x4, mask Mask64x4) Float64x4
+// Asm: VPMULLQ, CPU Feature: AVX512DQ
+func (x Int64x4) Mul(y Int64x4) Int64x4
 
-// MulByPowOf2Masked multiplies elements by a power of 2.
-//
-// This operation is applied selectively under a write mask.
+// Mul multiplies corresponding elements of two vectors.
 //
-// Asm: VSCALEFPD, CPU Feature: AVX512F
-func (x Float64x8) MulByPowOf2Masked(y Float64x8, mask Mask64x8) Float64x8
+// Asm: VPMULLQ, CPU Feature: AVX512DQ
+func (x Int64x8) Mul(y Int64x8) Int64x8
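// The integer Mul (formerly MulLow) keeps only the low half of each product.
// In scalar Go that is simply the wrapping multiply of the element type; a
// hypothetical lane-wise model for illustration:
func mulLane(x, y int16) int16 {
    return x * y // int16 multiplication already truncates to the low 16 bits
}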
 
 /* MulEvenWiden */
 
@@ -5691,161 +5696,112 @@ func (x Uint16x16) MulHighMasked(y Uint16x16, mask Mask16x16) Uint16x16
 // Asm: VPMULHUW, CPU Feature: AVX512BW
 func (x Uint16x32) MulHighMasked(y Uint16x32, mask Mask16x32) Uint16x32
 
-/* MulLow */
-
-// MulLow multiplies elements and stores the low part of the result.
-//
-// Asm: VPMULLW, CPU Feature: AVX
-func (x Int16x8) MulLow(y Int16x8) Int16x8
-
-// MulLow multiplies elements and stores the low part of the result.
-//
-// Asm: VPMULLW, CPU Feature: AVX2
-func (x Int16x16) MulLow(y Int16x16) Int16x16
-
-// MulLow multiplies elements and stores the low part of the result.
-//
-// Asm: VPMULLW, CPU Feature: AVX512BW
-func (x Int16x32) MulLow(y Int16x32) Int16x32
-
-// MulLow multiplies elements and stores the low part of the result.
-//
-// Asm: VPMULLD, CPU Feature: AVX
-func (x Int32x4) MulLow(y Int32x4) Int32x4
-
-// MulLow multiplies elements and stores the low part of the result.
-//
-// Asm: VPMULLD, CPU Feature: AVX2
-func (x Int32x8) MulLow(y Int32x8) Int32x8
-
-// MulLow multiplies elements and stores the low part of the result.
-//
-// Asm: VPMULLD, CPU Feature: AVX512F
-func (x Int32x16) MulLow(y Int32x16) Int32x16
-
-// MulLow multiplies elements and stores the low part of the result.
-//
-// Asm: VPMULLQ, CPU Feature: AVX512DQ
-func (x Int64x2) MulLow(y Int64x2) Int64x2
-
-// MulLow multiplies elements and stores the low part of the result.
-//
-// Asm: VPMULLQ, CPU Feature: AVX512DQ
-func (x Int64x4) MulLow(y Int64x4) Int64x4
-
-// MulLow multiplies elements and stores the low part of the result.
-//
-// Asm: VPMULLQ, CPU Feature: AVX512DQ
-func (x Int64x8) MulLow(y Int64x8) Int64x8
-
-/* MulLowMasked */
+/* MulMasked */
 
-// MulLowMasked multiplies elements and stores the low part of the result.
+// MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPMULLW, CPU Feature: AVX512BW
-func (x Int16x8) MulLowMasked(y Int16x8, mask Mask16x8) Int16x8
+// Asm: VMULPS, CPU Feature: AVX512F
+func (x Float32x4) MulMasked(y Float32x4, mask Mask32x4) Float32x4
 
-// MulLowMasked multiplies elements and stores the low part of the result.
+// MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPMULLW, CPU Feature: AVX512BW
-func (x Int16x16) MulLowMasked(y Int16x16, mask Mask16x16) Int16x16
+// Asm: VMULPS, CPU Feature: AVX512F
+func (x Float32x8) MulMasked(y Float32x8, mask Mask32x8) Float32x8
 
-// MulLowMasked multiplies elements and stores the low part of the result.
+// MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPMULLW, CPU Feature: AVX512BW
-func (x Int16x32) MulLowMasked(y Int16x32, mask Mask16x32) Int16x32
+// Asm: VMULPS, CPU Feature: AVX512F
+func (x Float32x16) MulMasked(y Float32x16, mask Mask32x16) Float32x16
 
-// MulLowMasked multiplies elements and stores the low part of the result.
+// MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPMULLD, CPU Feature: AVX512F
-func (x Int32x4) MulLowMasked(y Int32x4, mask Mask32x4) Int32x4
+// Asm: VMULPD, CPU Feature: AVX512F
+func (x Float64x2) MulMasked(y Float64x2, mask Mask64x2) Float64x2
 
-// MulLowMasked multiplies elements and stores the low part of the result.
+// MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPMULLD, CPU Feature: AVX512F
-func (x Int32x8) MulLowMasked(y Int32x8, mask Mask32x8) Int32x8
+// Asm: VMULPD, CPU Feature: AVX512F
+func (x Float64x4) MulMasked(y Float64x4, mask Mask64x4) Float64x4
 
-// MulLowMasked multiplies elements and stores the low part of the result.
+// MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPMULLD, CPU Feature: AVX512F
-func (x Int32x16) MulLowMasked(y Int32x16, mask Mask32x16) Int32x16
+// Asm: VMULPD, CPU Feature: AVX512F
+func (x Float64x8) MulMasked(y Float64x8, mask Mask64x8) Float64x8
 
-// MulLowMasked multiplies elements and stores the low part of the result.
+// MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPMULLQ, CPU Feature: AVX512DQ
-func (x Int64x2) MulLowMasked(y Int64x2, mask Mask64x2) Int64x2
+// Asm: VPMULLW, CPU Feature: AVX512BW
+func (x Int16x8) MulMasked(y Int16x8, mask Mask16x8) Int16x8
 
-// MulLowMasked multiplies elements and stores the low part of the result.
+// MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPMULLQ, CPU Feature: AVX512DQ
-func (x Int64x4) MulLowMasked(y Int64x4, mask Mask64x4) Int64x4
+// Asm: VPMULLW, CPU Feature: AVX512BW
+func (x Int16x16) MulMasked(y Int16x16, mask Mask16x16) Int16x16
 
-// MulLowMasked multiplies elements and stores the low part of the result.
+// MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPMULLQ, CPU Feature: AVX512DQ
-func (x Int64x8) MulLowMasked(y Int64x8, mask Mask64x8) Int64x8
-
-/* MulMasked */
+// Asm: VPMULLW, CPU Feature: AVX512BW
+func (x Int16x32) MulMasked(y Int16x32, mask Mask16x32) Int16x32
 
 // MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VMULPS, CPU Feature: AVX512F
-func (x Float32x4) MulMasked(y Float32x4, mask Mask32x4) Float32x4
+// Asm: VPMULLD, CPU Feature: AVX512F
+func (x Int32x4) MulMasked(y Int32x4, mask Mask32x4) Int32x4
 
 // MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VMULPS, CPU Feature: AVX512F
-func (x Float32x8) MulMasked(y Float32x8, mask Mask32x8) Float32x8
+// Asm: VPMULLD, CPU Feature: AVX512F
+func (x Int32x8) MulMasked(y Int32x8, mask Mask32x8) Int32x8
 
 // MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VMULPS, CPU Feature: AVX512F
-func (x Float32x16) MulMasked(y Float32x16, mask Mask32x16) Float32x16
+// Asm: VPMULLD, CPU Feature: AVX512F
+func (x Int32x16) MulMasked(y Int32x16, mask Mask32x16) Int32x16
 
 // MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VMULPD, CPU Feature: AVX512F
-func (x Float64x2) MulMasked(y Float64x2, mask Mask64x2) Float64x2
+// Asm: VPMULLQ, CPU Feature: AVX512DQ
+func (x Int64x2) MulMasked(y Int64x2, mask Mask64x2) Int64x2
 
 // MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VMULPD, CPU Feature: AVX512F
-func (x Float64x4) MulMasked(y Float64x4, mask Mask64x4) Float64x4
+// Asm: VPMULLQ, CPU Feature: AVX512DQ
+func (x Int64x4) MulMasked(y Int64x4, mask Mask64x4) Int64x4
 
 // MulMasked multiplies corresponding elements of two vectors.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VMULPD, CPU Feature: AVX512F
-func (x Float64x8) MulMasked(y Float64x8, mask Mask64x8) Float64x8
+// Asm: VPMULLQ, CPU Feature: AVX512DQ
+func (x Int64x8) MulMasked(y Int64x8, mask Mask64x8) Int64x8
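// A scalar model of the write-mask convention used by the *Masked methods.
// It assumes unselected lanes of the result are zeroed; the value-returning
// signature leaves no destination to merge into, but treat that zeroing as
// an assumption rather than a documented guarantee.
func mulMaskedLane(x, y int32, selected bool) int32 {
    if !selected {
        return 0 // lane not chosen by the write mask
    }
    return x * y
}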
 
 /* NotEqual */
 
@@ -6402,216 +6358,68 @@ func (x Uint32x16) OrMasked(y Uint32x16, mask Mask32x16) Uint32x16
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPORQ, CPU Feature: AVX512F
-func (x Uint64x2) OrMasked(y Uint64x2, mask Mask64x2) Uint64x2
-
-// OrMasked performs a bitwise OR operation between two vectors.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPORQ, CPU Feature: AVX512F
-func (x Uint64x4) OrMasked(y Uint64x4, mask Mask64x4) Uint64x4
-
-// OrMasked performs a bitwise OR operation between two vectors.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPORQ, CPU Feature: AVX512F
-func (x Uint64x8) OrMasked(y Uint64x8, mask Mask64x8) Uint64x8
-
-/* PairDotProd */
-
-// PairDotProd multiplies the elements and add the pairs together,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// Asm: VPMADDWD, CPU Feature: AVX
-func (x Int16x8) PairDotProd(y Int16x8) Int32x4
-
-// PairDotProd multiplies the elements and add the pairs together,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// Asm: VPMADDWD, CPU Feature: AVX2
-func (x Int16x16) PairDotProd(y Int16x16) Int32x8
-
-// PairDotProd multiplies the elements and add the pairs together,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// Asm: VPMADDWD, CPU Feature: AVX512BW
-func (x Int16x32) PairDotProd(y Int16x32) Int32x16
-
-/* PairDotProdMasked */
-
-// PairDotProdMasked multiplies the elements and add the pairs together,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPMADDWD, CPU Feature: AVX512BW
-func (x Int16x8) PairDotProdMasked(y Int16x8, mask Mask16x8) Int32x4
-
-// PairDotProdMasked multiplies the elements and add the pairs together,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPMADDWD, CPU Feature: AVX512BW
-func (x Int16x16) PairDotProdMasked(y Int16x16, mask Mask16x16) Int32x8
-
-// PairDotProdMasked multiplies the elements and add the pairs together,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPMADDWD, CPU Feature: AVX512BW
-func (x Int16x32) PairDotProdMasked(y Int16x32, mask Mask16x32) Int32x16
-
-/* PairwiseAdd */
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x4) PairwiseAdd(y Float32x4) Float32x4
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x8) PairwiseAdd(y Float32x8) Float32x8
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPD, CPU Feature: AVX
-func (x Float64x2) PairwiseAdd(y Float64x2) Float64x2
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPD, CPU Feature: AVX
-func (x Float64x4) PairwiseAdd(y Float64x4) Float64x4
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Int16x8) PairwiseAdd(y Int16x8) Int16x8
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX2
-func (x Int16x16) PairwiseAdd(y Int16x16) Int16x16
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX
-func (x Int32x4) PairwiseAdd(y Int32x4) Int32x4
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX2
-func (x Int32x8) PairwiseAdd(y Int32x8) Int32x8
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Uint16x8) PairwiseAdd(y Uint16x8) Uint16x8
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX2
-func (x Uint16x16) PairwiseAdd(y Uint16x16) Uint16x16
+// Asm: VPORQ, CPU Feature: AVX512F
+func (x Uint64x2) OrMasked(y Uint64x2, mask Mask64x2) Uint64x2
 
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// OrMasked performs a bitwise OR operation between two vectors.
 //
-// Asm: VPHADDD, CPU Feature: AVX
-func (x Uint32x4) PairwiseAdd(y Uint32x4) Uint32x4
-
-// PairwiseAdd horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// This operation is applied selectively under a write mask.
 //
-// Asm: VPHADDD, CPU Feature: AVX2
-func (x Uint32x8) PairwiseAdd(y Uint32x8) Uint32x8
-
-/* PairwiseSub */
+// Asm: VPORQ, CPU Feature: AVX512F
+func (x Uint64x4) OrMasked(y Uint64x4, mask Mask64x4) Uint64x4
 
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// OrMasked performs a bitwise OR operation between two vectors.
 //
-// Asm: VHSUBPS, CPU Feature: AVX
-func (x Float32x4) PairwiseSub(y Float32x4) Float32x4
-
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// This operation is applied selectively under a write mask.
 //
-// Asm: VHSUBPS, CPU Feature: AVX
-func (x Float32x8) PairwiseSub(y Float32x8) Float32x8
+// Asm: VPORQ, CPU Feature: AVX512F
+func (x Uint64x8) OrMasked(y Uint64x8, mask Mask64x8) Uint64x8
 
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VHSUBPD, CPU Feature: AVX
-func (x Float64x2) PairwiseSub(y Float64x2) Float64x2
+/* PairDotProd */
 
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// PairDotProd multiplies the elements and adds the pairs together,
+// yielding a vector of half as many elements with twice the input element size.
 //
-// Asm: VHSUBPD, CPU Feature: AVX
-func (x Float64x4) PairwiseSub(y Float64x4) Float64x4
+// Asm: VPMADDWD, CPU Feature: AVX
+func (x Int16x8) PairDotProd(y Int16x8) Int32x4
 
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// PairDotProd multiplies the elements and adds the pairs together,
+// yielding a vector of half as many elements with twice the input element size.
 //
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Int16x8) PairwiseSub(y Int16x8) Int16x8
+// Asm: VPMADDWD, CPU Feature: AVX2
+func (x Int16x16) PairDotProd(y Int16x16) Int32x8
 
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// PairDotProd multiplies the elements and adds the pairs together,
+// yielding a vector of half as many elements with twice the input element size.
 //
-// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Int16x16) PairwiseSub(y Int16x16) Int16x16
+// Asm: VPMADDWD, CPU Feature: AVX512BW
+func (x Int16x32) PairDotProd(y Int16x32) Int32x16
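// A scalar model of PairDotProd on one adjacent pair, per the doc comment:
// the products are widened to int32 and summed, so the result vector has
// half as many elements at twice the width. Hypothetical helper.
func pairDotProdPair(x0, x1, y0, y1 int16) int32 {
    return int32(x0)*int32(y0) + int32(x1)*int32(y1)
}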
 
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX
-func (x Int32x4) PairwiseSub(y Int32x4) Int32x4
+/* PairDotProdMasked */
 
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// PairDotProdMasked multiplies the elements and adds the pairs together,
+// yielding a vector of half as many elements with twice the input element size.
 //
-// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Int32x8) PairwiseSub(y Int32x8) Int32x8
-
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// This operation is applied selectively under a write mask.
 //
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Uint16x8) PairwiseSub(y Uint16x8) Uint16x8
+// Asm: VPMADDWD, CPU Feature: AVX512BW
+func (x Int16x8) PairDotProdMasked(y Int16x8, mask Mask16x8) Int32x4
 
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// PairDotProdMasked multiplies the elements and adds the pairs together,
+// yielding a vector of half as many elements with twice the input element size.
 //
-// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Uint16x16) PairwiseSub(y Uint16x16) Uint16x16
-
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// This operation is applied selectively under a write mask.
 //
-// Asm: VPHSUBD, CPU Feature: AVX
-func (x Uint32x4) PairwiseSub(y Uint32x4) Uint32x4
+// Asm: VPMADDWD, CPU Feature: AVX512BW
+func (x Int16x16) PairDotProdMasked(y Int16x16, mask Mask16x16) Int32x8
 
-// PairwiseSub horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// PairDotProdMasked multiplies the elements and adds the pairs together,
+// yielding a vector of half as many elements with twice the input element size.
 //
-// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPMADDWD, CPU Feature: AVX512BW
+func (x Int16x32) PairDotProdMasked(y Int16x32, mask Mask16x32) Int32x16
 
 /* Permute */
 
@@ -8490,526 +8298,302 @@ func (x Int64x8) RotateRightMasked(y Int64x8, mask Mask64x8) Int64x8
 // Asm: VPRORVD, CPU Feature: AVX512F
 func (x Uint32x4) RotateRightMasked(y Uint32x4, mask Mask32x4) Uint32x4
 
-// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPRORVD, CPU Feature: AVX512F
-func (x Uint32x8) RotateRightMasked(y Uint32x8, mask Mask32x8) Uint32x8
-
-// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPRORVD, CPU Feature: AVX512F
-func (x Uint32x16) RotateRightMasked(y Uint32x16, mask Mask32x16) Uint32x16
-
-// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPRORVQ, CPU Feature: AVX512F
-func (x Uint64x2) RotateRightMasked(y Uint64x2, mask Mask64x2) Uint64x2
-
-// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPRORVQ, CPU Feature: AVX512F
-func (x Uint64x4) RotateRightMasked(y Uint64x4, mask Mask64x4) Uint64x4
-
-// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPRORVQ, CPU Feature: AVX512F
-func (x Uint64x8) RotateRightMasked(y Uint64x8, mask Mask64x8) Uint64x8
-
-/* Round */
-
-// Round rounds elements to the nearest integer.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x4) Round() Float32x4
-
-// Round rounds elements to the nearest integer.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x8) Round() Float32x8
-
-// Round rounds elements to the nearest integer.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x2) Round() Float64x2
-
-// Round rounds elements to the nearest integer.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x4) Round() Float64x4
-
-/* RoundWithPrecision */
-
-// RoundWithPrecision rounds elements with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x4) RoundWithPrecision(prec uint8) Float32x4
-
-// RoundWithPrecision rounds elements with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x8) RoundWithPrecision(prec uint8) Float32x8
-
-// RoundWithPrecision rounds elements with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x16) RoundWithPrecision(prec uint8) Float32x16
-
-// RoundWithPrecision rounds elements with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x2) RoundWithPrecision(prec uint8) Float64x2
-
-// RoundWithPrecision rounds elements with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x4) RoundWithPrecision(prec uint8) Float64x4
-
-// RoundWithPrecision rounds elements with specified precision.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x8) RoundWithPrecision(prec uint8) Float64x8
-
-/* RoundWithPrecisionMasked */
-
-// RoundWithPrecisionMasked rounds elements with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x4) RoundWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4
-
-// RoundWithPrecisionMasked rounds elements with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x8) RoundWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8
-
-// RoundWithPrecisionMasked rounds elements with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x16) RoundWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16
-
-// RoundWithPrecisionMasked rounds elements with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x2) RoundWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2
-
-// RoundWithPrecisionMasked rounds elements with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x4) RoundWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4
-
-// RoundWithPrecisionMasked rounds elements with specified precision.
-//
-// This operation is applied selectively under a write mask.
-//
-// prec is expected to be a constant, non-constant value will trigger a runtime panic.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x8) RoundWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8
-
-/* SaturatedAdd */
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSB, CPU Feature: AVX
-func (x Int8x16) SaturatedAdd(y Int8x16) Int8x16
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSB, CPU Feature: AVX2
-func (x Int8x32) SaturatedAdd(y Int8x32) Int8x32
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSB, CPU Feature: AVX512BW
-func (x Int8x64) SaturatedAdd(y Int8x64) Int8x64
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSW, CPU Feature: AVX
-func (x Int16x8) SaturatedAdd(y Int16x8) Int16x8
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSW, CPU Feature: AVX2
-func (x Int16x16) SaturatedAdd(y Int16x16) Int16x16
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSW, CPU Feature: AVX512BW
-func (x Int16x32) SaturatedAdd(y Int16x32) Int16x32
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSB, CPU Feature: AVX
-func (x Uint8x16) SaturatedAdd(y Uint8x16) Uint8x16
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSB, CPU Feature: AVX2
-func (x Uint8x32) SaturatedAdd(y Uint8x32) Uint8x32
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSB, CPU Feature: AVX512BW
-func (x Uint8x64) SaturatedAdd(y Uint8x64) Uint8x64
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSW, CPU Feature: AVX
-func (x Uint16x8) SaturatedAdd(y Uint16x8) Uint16x8
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSW, CPU Feature: AVX2
-func (x Uint16x16) SaturatedAdd(y Uint16x16) Uint16x16
-
-// SaturatedAdd adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSW, CPU Feature: AVX512BW
-func (x Uint16x32) SaturatedAdd(y Uint16x32) Uint16x32
-
-/* SaturatedAddDotProd */
-
-// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x.
-//
-// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
-func (x Int32x4) SaturatedAddDotProd(y Int16x8, z Int16x8) Int32x4
-
-// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x.
-//
-// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
-func (x Int32x8) SaturatedAddDotProd(y Int16x16, z Int16x16) Int32x8
-
-// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x.
+// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
 //
-// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
-func (x Int32x16) SaturatedAddDotProd(y Int16x32, z Int16x32) Int32x16
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPRORVD, CPU Feature: AVX512F
+func (x Uint32x8) RotateRightMasked(y Uint32x8, mask Mask32x8) Uint32x8
 
-/* SaturatedAddDotProdMasked */
+// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPRORVD, CPU Feature: AVX512F
+func (x Uint32x16) RotateRightMasked(y Uint32x16, mask Mask32x16) Uint32x16
 
-// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
+// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
-func (x Int32x4) SaturatedAddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4
+// Asm: VPRORVQ, CPU Feature: AVX512F
+func (x Uint64x2) RotateRightMasked(y Uint64x2, mask Mask64x2) Uint64x2
 
-// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
+// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
-func (x Int32x8) SaturatedAddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8
+// Asm: VPRORVQ, CPU Feature: AVX512F
+func (x Uint64x4) RotateRightMasked(y Uint64x4, mask Mask64x4) Uint64x4
 
-// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
+// RotateRightMasked rotates each element in x to the right by the number of bits specified by y's corresponding elements.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
-func (x Int32x16) SaturatedAddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16
+// Asm: VPRORVQ, CPU Feature: AVX512F
+func (x Uint64x8) RotateRightMasked(y Uint64x8, mask Mask64x8) Uint64x8
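
As a usage illustration for the masked rotates above, here is a minimal sketch. It is not part of this CL: LoadUint64x8Slice, StoreSlice, and a Greater comparison yielding a Mask64x8 are assumed to exist on this branch, as they do for other shapes.

// Rotate only the lanes the mask selects; all slices are assumed to hold
// at least 8 elements. Assumes `import "simd"` (dev.simd branch).
func rotateSelectedLanes(dst, v, counts, limit []uint64) {
        if !simd.HasAVX512() {
                return // a real caller would fall back to scalar bits.RotateLeft64
        }
        x := simd.LoadUint64x8Slice(v)                // assumed helper
        m := x.Greater(simd.LoadUint64x8Slice(limit)) // assumed compare -> Mask64x8
        x.RotateRightMasked(simd.LoadUint64x8Slice(counts), m).StoreSlice(dst)
}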
 
-/* SaturatedAddMasked */
+/* Round */
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// Round rounds elements to the nearest integer.
 //
-// This operation is applied selectively under a write mask.
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x4) Round() Float32x4
+
+// Round rounds elements to the nearest integer.
 //
-// Asm: VPADDSB, CPU Feature: AVX512BW
-func (x Int8x16) SaturatedAddMasked(y Int8x16, mask Mask8x16) Int8x16
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x8) Round() Float32x8
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// Round rounds elements to the nearest integer.
 //
-// This operation is applied selectively under a write mask.
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x2) Round() Float64x2
+
+// Round rounds elements to the nearest integer.
 //
-// Asm: VPADDSB, CPU Feature: AVX512BW
-func (x Int8x32) SaturatedAddMasked(y Int8x32, mask Mask8x32) Int8x32
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x4) Round() Float64x4
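
A small sketch of the unmasked Round in a loop (not from this CL; the Load/Store slice helpers are assumptions). Note the tail: VROUNDPS with a round-to-nearest immediate breaks ties to even, whereas math.Round rounds ties away from zero, so the scalar tail is not bit-identical on *.5 inputs.

// Assumes `import ("math"; "simd")`; dst must be at least as long as src.
func roundFloat32s(dst, src []float32) {
        i := 0
        for ; i+8 <= len(src); i += 8 {
                simd.LoadFloat32x8Slice(src[i:]).Round().StoreSlice(dst[i:]) // assumed helpers
        }
        for ; i < len(src); i++ {
                dst[i] = float32(math.Round(float64(src[i]))) // ties differ, see above
        }
}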
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+/* RoundScaled */
+
+// RoundScaled rounds elements with specified precision.
 //
-// This operation is applied selectively under a write mask.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPADDSB, CPU Feature: AVX512BW
-func (x Int8x64) SaturatedAddMasked(y Int8x64, mask Mask8x64) Int8x64
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x4) RoundScaled(prec uint8) Float32x4
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// RoundScaled rounds elements with specified precision.
 //
-// This operation is applied selectively under a write mask.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPADDSW, CPU Feature: AVX512BW
-func (x Int16x8) SaturatedAddMasked(y Int16x8, mask Mask16x8) Int16x8
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x8) RoundScaled(prec uint8) Float32x8
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// RoundScaled rounds elements with specified precision.
 //
-// This operation is applied selectively under a write mask.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPADDSW, CPU Feature: AVX512BW
-func (x Int16x16) SaturatedAddMasked(y Int16x16, mask Mask16x16) Int16x16
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x16) RoundScaled(prec uint8) Float32x16
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// RoundScaled rounds elements with specified precision.
 //
-// This operation is applied selectively under a write mask.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPADDSW, CPU Feature: AVX512BW
-func (x Int16x32) SaturatedAddMasked(y Int16x32, mask Mask16x32) Int16x32
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x2) RoundScaled(prec uint8) Float64x2
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// RoundScaled rounds elements with specified precision.
 //
-// This operation is applied selectively under a write mask.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPADDSB, CPU Feature: AVX512BW
-func (x Uint8x16) SaturatedAddMasked(y Uint8x16, mask Mask8x16) Uint8x16
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x4) RoundScaled(prec uint8) Float64x4
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// RoundScaled rounds elements with specified precision.
 //
-// This operation is applied selectively under a write mask.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPADDSB, CPU Feature: AVX512BW
-func (x Uint8x32) SaturatedAddMasked(y Uint8x32, mask Mask8x32) Uint8x32
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x8) RoundScaled(prec uint8) Float64x8
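
The prec argument is the number of binary fraction bits that survive: VRNDSCALE rounds each lane to the nearest multiple of 2^-prec, so prec=0 is plain rounding to integers. A minimal sketch, again assuming the branch's Load/Store slice helpers:

// RoundScaled(1) rounds lanes to the nearest multiple of 0.5:
// 1.26 -> 1.5, 1.24 -> 1.0. Assumes `import "simd"` and len(v) >= 8.
func roundToHalves(v []float64) {
        simd.LoadFloat64x8Slice(v).RoundScaled(1).StoreSlice(v) // assumed helpers
}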
+
+/* RoundScaledMasked */
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// RoundScaledMasked rounds elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPADDSB, CPU Feature: AVX512BW
-func (x Uint8x64) SaturatedAddMasked(y Uint8x64, mask Mask8x64) Uint8x64
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x4) RoundScaledMasked(prec uint8, mask Mask32x4) Float32x4
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// RoundScaledMasked rounds elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPADDSW, CPU Feature: AVX512BW
-func (x Uint16x8) SaturatedAddMasked(y Uint16x8, mask Mask16x8) Uint16x8
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x8) RoundScaledMasked(prec uint8, mask Mask32x8) Float32x8
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// RoundScaledMasked rounds elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPADDSW, CPU Feature: AVX512BW
-func (x Uint16x16) SaturatedAddMasked(y Uint16x16, mask Mask16x16) Uint16x16
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512F
+func (x Float32x16) RoundScaledMasked(prec uint8, mask Mask32x16) Float32x16
 
-// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
+// RoundScaledMasked rounds elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPADDSW, CPU Feature: AVX512BW
-func (x Uint16x32) SaturatedAddMasked(y Uint16x32, mask Mask16x32) Uint16x32
-
-/* SaturatedPairwiseAdd */
-
-// SaturatedPairwiseAdd horizontally adds adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPHADDSW, CPU Feature: AVX
-func (x Int16x8) SaturatedPairwiseAdd(y Int16x8) Int16x8
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x2) RoundScaledMasked(prec uint8, mask Mask64x2) Float64x2
 
-// SaturatedPairwiseAdd horizontally adds adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// RoundScaledMasked rounds elements with specified precision.
 //
-// Asm: VPHADDSW, CPU Feature: AVX2
-func (x Int16x16) SaturatedPairwiseAdd(y Int16x16) Int16x16
-
-/* SaturatedPairwiseSub */
-
-// SaturatedPairwiseSub horizontally subtracts adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// This operation is applied selectively under a write mask.
 //
-// Asm: VPHSUBSW, CPU Feature: AVX
-func (x Int16x8) SaturatedPairwiseSub(y Int16x8) Int16x8
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x4) RoundScaledMasked(prec uint8, mask Mask64x4) Float64x4
 
-// SaturatedPairwiseSub horizontally subtracts adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// RoundScaledMasked rounds elements with specified precision.
 //
-// Asm: VPHSUBSW, CPU Feature: AVX2
-func (x Int16x16) SaturatedPairwiseSub(y Int16x16) Int16x16
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512F
+func (x Float64x8) RoundScaledMasked(prec uint8, mask Mask64x8) Float64x8
 
-/* SaturatedSub */
+/* RoundScaledResidue */
 
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidue computes the difference after rounding with specified precision.
 //
-// Asm: VPSUBSB, CPU Feature: AVX
-func (x Int8x16) SaturatedSub(y Int8x16) Int8x16
-
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPSUBSB, CPU Feature: AVX2
-func (x Int8x32) SaturatedSub(y Int8x32) Int8x32
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x4) RoundScaledResidue(prec uint8) Float32x4
 
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidue computes the difference after rounding with specified precision.
 //
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Int8x64) SaturatedSub(y Int8x64) Int8x64
-
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPSUBSW, CPU Feature: AVX
-func (x Int16x8) SaturatedSub(y Int16x8) Int16x8
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x8) RoundScaledResidue(prec uint8) Float32x8
 
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidue computes the difference after rounding with specified precision.
 //
-// Asm: VPSUBSW, CPU Feature: AVX2
-func (x Int16x16) SaturatedSub(y Int16x16) Int16x16
-
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Int16x32) SaturatedSub(y Int16x32) Int16x32
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x16) RoundScaledResidue(prec uint8) Float32x16
 
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidue computes the difference after rounding with specified precision.
 //
-// Asm: VPSUBSB, CPU Feature: AVX
-func (x Uint8x16) SaturatedSub(y Uint8x16) Uint8x16
-
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPSUBSB, CPU Feature: AVX2
-func (x Uint8x32) SaturatedSub(y Uint8x32) Uint8x32
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x2) RoundScaledResidue(prec uint8) Float64x2
 
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidue computes the difference after rounding with specified precision.
 //
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x64) SaturatedSub(y Uint8x64) Uint8x64
-
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPSUBSW, CPU Feature: AVX
-func (x Uint16x8) SaturatedSub(y Uint16x8) Uint16x8
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x4) RoundScaledResidue(prec uint8) Float64x4
 
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidue computes the difference after rounding with specified precision.
 //
-// Asm: VPSUBSW, CPU Feature: AVX2
-func (x Uint16x16) SaturatedSub(y Uint16x16) Uint16x16
-
-// SaturatedSub subtracts corresponding elements of two vectors with saturation.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x32) SaturatedSub(y Uint16x32) Uint16x32
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x8) RoundScaledResidue(prec uint8) Float64x8
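
RoundScaledResidue returns what rounding discarded: per lane it is x minus the RoundScaled value, so the two methods should reconstruct the input. A sketch of that identity, assuming the Load/Store helpers, an elementwise Add, and that the VREDUCE subtraction is exact (the test change at the bottom of this CL exercises the Ceil analogue via Sub):

// Assumes `import "simd"` and len(v) >= 8.
func residueIdentity(v []float64) bool {
        x := simd.LoadFloat64x8Slice(v) // assumed helper
        got := make([]float64, 8)
        x.RoundScaled(1).Add(x.RoundScaledResidue(1)).StoreSlice(got)
        for i, g := range got {
                if g != v[i] {
                        return false
                }
        }
        return true
}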
 
-/* SaturatedSubMasked */
+/* RoundScaledResidueMasked */
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidueMasked computes the difference after rounding with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Int8x16) SaturatedSubMasked(y Int8x16, mask Mask8x16) Int8x16
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Int8x32) SaturatedSubMasked(y Int8x32, mask Mask8x32) Int8x32
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x4) RoundScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidueMasked computes the difference after rounding with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Int8x64) SaturatedSubMasked(y Int8x64, mask Mask8x64) Int8x64
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x8) RoundScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidueMasked computes the difference after rounding with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Int16x8) SaturatedSubMasked(y Int16x8, mask Mask16x8) Int16x8
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x16) RoundScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidueMasked computes the difference after rounding with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Int16x16) SaturatedSubMasked(y Int16x16, mask Mask16x16) Int16x16
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x2) RoundScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidueMasked computes the difference after rounding with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Int16x32) SaturatedSubMasked(y Int16x32, mask Mask16x32) Int16x32
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x4) RoundScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// RoundScaledResidueMasked computes the difference after rounding with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x16) SaturatedSubMasked(y Uint8x16, mask Mask8x16) Uint8x16
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
-// This operation is applied selectively under a write mask.
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x8) RoundScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8
+
+/* SaturatedAddDotProd */
+
+// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x.
 //
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x32) SaturatedSubMasked(y Uint8x32, mask Mask8x32) Uint8x32
+// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
+func (x Int32x4) SaturatedAddDotProd(y Int16x8, z Int16x8) Int32x4
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x.
 //
-// This operation is applied selectively under a write mask.
+// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
+func (x Int32x8) SaturatedAddDotProd(y Int16x16, z Int16x16) Int32x8
+
+// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x.
 //
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x64) SaturatedSubMasked(y Uint8x64, mask Mask8x64) Uint8x64
+// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
+func (x Int32x16) SaturatedAddDotProd(y Int16x32, z Int16x32) Int32x16
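
Per-lane arithmetic of the VPDPWSSDS mapping, written out as a scalar model: each int32 lane accumulates two adjacent int16 products, and the accumulation saturates at the int32 bounds rather than wrapping.

// Scalar model of one lane; assumes `import "math"` for the bounds.
func satDotLane(x int32, y0, y1, z0, z1 int16) int32 {
        t := int64(x) + int64(y0)*int64(z0) + int64(y1)*int64(z1)
        if t > math.MaxInt32 {
                return math.MaxInt32
        }
        if t < math.MinInt32 {
                return math.MinInt32
        }
        return int32(t)
}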
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+/* SaturatedAddDotProdMasked */
+
+// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x8) SaturatedSubMasked(y Uint16x8, mask Mask16x8) Uint16x8
+// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
+func (x Int32x4) SaturatedAddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x16) SaturatedSubMasked(y Uint16x16, mask Mask16x16) Uint16x16
+// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
+func (x Int32x8) SaturatedAddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8
 
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
+// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x32) SaturatedSubMasked(y Uint16x32, mask Mask16x32) Uint16x32
+// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
+func (x Int32x16) SaturatedAddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16
 
 /* SaturatedUnsignedSignedPairDotProd */
 
@@ -9066,36 +8650,112 @@ func (x Int8x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int3
 
 // SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z.
 //
-// Asm: VPDPBUSDS, CPU Feature: AVXVNNI
-func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8) Int32x8
+// Asm: VPDPBUSDS, CPU Feature: AVXVNNI
+func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8) Int32x8
+
+// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
+func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16
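
The quad variant groups four byte products per int32 lane, mixing signedness per the VPDPBUSDS mapping. A scalar model of one lane follows; treating the Uint8 operand as the unsigned side is an inference from the signature, not something this CL states.

// Scalar model of one lane; assumes `import "math"`.
func satQuadLane(acc int32, u [4]uint8, s [4]int8) int32 {
        t := int64(acc)
        for i := range u {
                t += int64(u[i]) * int64(s[i])
        }
        if t > math.MaxInt32 {
                return math.MaxInt32
        }
        if t < math.MinInt32 {
                return math.MinInt32
        }
        return int32(t)
}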
+
+/* SaturatedUnsignedSignedQuadDotProdAccumulateMasked */
+
+// SaturatedUnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
+func (x Int8x16) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int32x4, mask Mask32x4) Int32x4
+
+// SaturatedUnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
+func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int32x8, mask Mask32x8) Int32x8
+
+// SaturatedUnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
+func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16
+
+/* Scale */
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x4) Scale(y Float32x4) Float32x4
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x8) Scale(y Float32x8) Float32x8
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x16) Scale(y Float32x16) Float32x16
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x2) Scale(y Float64x2) Float64x2
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x4) Scale(y Float64x4) Float64x4
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x8) Scale(y Float64x8) Float64x8
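
Scale is effectively a vector math.Ldexp: per VSCALEF semantics each lane of x is multiplied by 2 raised to the floor of the corresponding lane of y. A hedged sketch with the usual assumed helpers:

// Assumes `import "simd"` and len(v), len(e) >= 8.
func scaleByPow2(v, e []float64) {
        if !simd.HasAVX512() {
                return // scalar fallback: math.Ldexp(v[i], int(e[i]))
        }
        x := simd.LoadFloat64x8Slice(v) // assumed helper
        x.Scale(simd.LoadFloat64x8Slice(e)).StoreSlice(v)
}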
+
+/* ScaleMasked */
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x4) ScaleMasked(y Float32x4, mask Mask32x4) Float32x4
 
-// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of x and y and then adds z.
+// ScaleMasked multiplies elements by a power of 2.
 //
-// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
-func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x8) ScaleMasked(y Float32x8, mask Mask32x8) Float32x8
 
-/* SaturatedUnsignedSignedQuadDotProdAccumulateMasked */
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x16) ScaleMasked(y Float32x16, mask Mask32x16) Float32x16
 
-// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of x and y and then adds z.
+// ScaleMasked multiplies elements by a power of 2.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
-func (x Int8x16) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int32x4, mask Mask32x4) Int32x4
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x2) ScaleMasked(y Float64x2, mask Mask64x2) Float64x2
 
-// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of x and y and then adds z.
+// ScaleMasked multiplies elements by a power of 2.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
-func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int32x8, mask Mask32x8) Int32x8
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x4) ScaleMasked(y Float64x4, mask Mask64x4) Float64x4
 
-// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of x and y and then adds z.
+// ScaleMasked multiplies elements by a power of 2.
 //
 // This operation is applied selectively under a write mask.
 //
-// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
-func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x8) ScaleMasked(y Float64x8, mask Mask64x8) Float64x8
 
 /* Set128 */
 
@@ -11753,6 +11413,242 @@ func (x Uint64x4) SubMasked(y Uint64x4, mask Mask64x4) Uint64x4
 // Asm: VPSUBQ, CPU Feature: AVX512F
 func (x Uint64x8) SubMasked(y Uint64x8, mask Mask64x8) Uint64x8
 
+/* SubPairs */
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x4) SubPairs(y Float32x4) Float32x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x8) SubPairs(y Float32x8) Float32x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x2) SubPairs(y Float64x2) Float64x2
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x4) SubPairs(y Float64x4) Float64x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Int16x8) SubPairs(y Int16x8) Int16x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Int16x16) SubPairs(y Int16x16) Int16x16
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX
+func (x Int32x4) SubPairs(y Int32x4) Int32x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX2
+func (x Int32x8) SubPairs(y Int32x8) Int32x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Uint16x16) SubPairs(y Uint16x16) Uint16x16
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX
+func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX2
+func (x Uint32x8) SubPairs(y Uint32x8) Uint32x8
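
A worked instance of the lane order documented above, on the Float32x4 shape (the Load/Store slice helpers are assumptions; the expected output follows the doc comment verbatim):

// x = [1, 2, 3, 4], y = [10, 20, 30, 40]
// x.SubPairs(y) = [y0-y1, y2-y3, x0-x1, x2-x3] = [-10, -10, -1, -1]
func subPairsDemo() []float32 {
        out := make([]float32, 4)
        x := simd.LoadFloat32x4Slice([]float32{1, 2, 3, 4})     // assumed helper
        y := simd.LoadFloat32x4Slice([]float32{10, 20, 30, 40}) // assumed helper
        x.SubPairs(y).StoreSlice(out)
        return out
}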
+
+/* SubPairsSaturated */
+
+// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBSW, CPU Feature: AVX
+func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8
+
+// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBSW, CPU Feature: AVX2
+func (x Int16x16) SubPairsSaturated(y Int16x16) Int16x16
+
+/* SubSaturated */
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSB, CPU Feature: AVX
+func (x Int8x16) SubSaturated(y Int8x16) Int8x16
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSB, CPU Feature: AVX2
+func (x Int8x32) SubSaturated(y Int8x32) Int8x32
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSB, CPU Feature: AVX512BW
+func (x Int8x64) SubSaturated(y Int8x64) Int8x64
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSW, CPU Feature: AVX
+func (x Int16x8) SubSaturated(y Int16x8) Int16x8
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSW, CPU Feature: AVX2
+func (x Int16x16) SubSaturated(y Int16x16) Int16x16
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSW, CPU Feature: AVX512BW
+func (x Int16x32) SubSaturated(y Int16x32) Int16x32
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSB, CPU Feature: AVX
+func (x Uint8x16) SubSaturated(y Uint8x16) Uint8x16
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSB, CPU Feature: AVX2
+func (x Uint8x32) SubSaturated(y Uint8x32) Uint8x32
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSB, CPU Feature: AVX512BW
+func (x Uint8x64) SubSaturated(y Uint8x64) Uint8x64
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSW, CPU Feature: AVX
+func (x Uint16x8) SubSaturated(y Uint16x8) Uint16x8
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSW, CPU Feature: AVX2
+func (x Uint16x16) SubSaturated(y Uint16x16) Uint16x16
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSW, CPU Feature: AVX512BW
+func (x Uint16x32) SubSaturated(y Uint16x32) Uint16x32
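
What saturation buys over Go's wrapping integer subtraction, as a one-lane scalar model of the VPSUBSB case:

// Plain int8 subtraction wraps (-100 - 100 = 56); SubSaturated clamps the
// lane to the int8 range instead. Assumes `import "math"`.
func subSatInt8(a, b int8) int8 {
        d := int16(a) - int16(b)
        if d > math.MaxInt8 {
                return math.MaxInt8
        }
        if d < math.MinInt8 {
                return math.MinInt8
        }
        return int8(d) // e.g. a = -100, b = 100 -> -128
}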
+
+/* SubSaturatedMasked */
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSB, CPU Feature: AVX512BW
+func (x Int8x16) SubSaturatedMasked(y Int8x16, mask Mask8x16) Int8x16
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSB, CPU Feature: AVX512BW
+func (x Int8x32) SubSaturatedMasked(y Int8x32, mask Mask8x32) Int8x32
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSB, CPU Feature: AVX512BW
+func (x Int8x64) SubSaturatedMasked(y Int8x64, mask Mask8x64) Int8x64
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSW, CPU Feature: AVX512BW
+func (x Int16x8) SubSaturatedMasked(y Int16x8, mask Mask16x8) Int16x8
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSW, CPU Feature: AVX512BW
+func (x Int16x16) SubSaturatedMasked(y Int16x16, mask Mask16x16) Int16x16
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSW, CPU Feature: AVX512BW
+func (x Int16x32) SubSaturatedMasked(y Int16x32, mask Mask16x32) Int16x32
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSB, CPU Feature: AVX512BW
+func (x Uint8x16) SubSaturatedMasked(y Uint8x16, mask Mask8x16) Uint8x16
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSB, CPU Feature: AVX512BW
+func (x Uint8x32) SubSaturatedMasked(y Uint8x32, mask Mask8x32) Uint8x32
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSB, CPU Feature: AVX512BW
+func (x Uint8x64) SubSaturatedMasked(y Uint8x64, mask Mask8x64) Uint8x64
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSW, CPU Feature: AVX512BW
+func (x Uint16x8) SubSaturatedMasked(y Uint16x8, mask Mask16x8) Uint16x8
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSW, CPU Feature: AVX512BW
+func (x Uint16x16) SubSaturatedMasked(y Uint16x16, mask Mask16x16) Uint16x16
+
+// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPSUBSW, CPU Feature: AVX512BW
+func (x Uint16x32) SubSaturatedMasked(y Uint16x32, mask Mask16x32) Uint16x32
+
 /* Trunc */
 
 // Trunc truncates elements towards zero.
@@ -11775,105 +11671,205 @@ func (x Float64x2) Trunc() Float64x2
 // Asm: VROUNDPD, CPU Feature: AVX
 func (x Float64x4) Trunc() Float64x4
 
-/* TruncWithPrecision */
+/* TruncScaled */
 
-// TruncWithPrecision truncates elements with specified precision.
+// TruncScaled truncates elements with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x4) TruncWithPrecision(prec uint8) Float32x4
+func (x Float32x4) TruncScaled(prec uint8) Float32x4
 
-// TruncWithPrecision truncates elements with specified precision.
+// TruncScaled truncates elements with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x8) TruncWithPrecision(prec uint8) Float32x8
+func (x Float32x8) TruncScaled(prec uint8) Float32x8
 
-// TruncWithPrecision truncates elements with specified precision.
+// TruncScaled truncates elements with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x16) TruncWithPrecision(prec uint8) Float32x16
+func (x Float32x16) TruncScaled(prec uint8) Float32x16
 
-// TruncWithPrecision truncates elements with specified precision.
+// TruncScaled truncates elements with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x2) TruncWithPrecision(prec uint8) Float64x2
+func (x Float64x2) TruncScaled(prec uint8) Float64x2
 
-// TruncWithPrecision truncates elements with specified precision.
+// TruncScaled truncates elements with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x4) TruncWithPrecision(prec uint8) Float64x4
+func (x Float64x4) TruncScaled(prec uint8) Float64x4
 
-// TruncWithPrecision truncates elements with specified precision.
+// TruncScaled truncates elements with specified precision.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x8) TruncWithPrecision(prec uint8) Float64x8
+func (x Float64x8) TruncScaled(prec uint8) Float64x8
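
TruncScaled uses the same 2^-prec grid as RoundScaled but moves toward zero, so TruncScaled(0) should agree with Trunc. A minimal sketch under the usual helper assumptions:

// TruncScaled(2) snaps lanes toward zero onto multiples of 0.25:
// 1.9 -> 1.75, -1.9 -> -1.75. Assumes `import "simd"` and len(v) >= 8.
func truncToQuarters(v []float64) {
        simd.LoadFloat64x8Slice(v).TruncScaled(2).StoreSlice(v) // assumed helpers
}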
 
-/* TruncWithPrecisionMasked */
+/* TruncScaledMasked */
 
-// TruncWithPrecisionMasked truncates elements with specified precision.
+// TruncScaledMasked truncates elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x4) TruncWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4
+func (x Float32x4) TruncScaledMasked(prec uint8, mask Mask32x4) Float32x4
 
-// TruncWithPrecisionMasked truncates elements with specified precision.
+// TruncScaledMasked truncates elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x8) TruncWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8
+func (x Float32x8) TruncScaledMasked(prec uint8, mask Mask32x8) Float32x8
 
-// TruncWithPrecisionMasked truncates elements with specified precision.
+// TruncScaledMasked truncates elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPS, CPU Feature: AVX512F
-func (x Float32x16) TruncWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16
+func (x Float32x16) TruncScaledMasked(prec uint8, mask Mask32x16) Float32x16
 
-// TruncWithPrecisionMasked truncates elements with specified precision.
+// TruncScaledMasked truncates elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x2) TruncWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2
+func (x Float64x2) TruncScaledMasked(prec uint8, mask Mask64x2) Float64x2
 
-// TruncWithPrecisionMasked truncates elements with specified precision.
+// TruncScaledMasked truncates elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x4) TruncWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4
+func (x Float64x4) TruncScaledMasked(prec uint8, mask Mask64x4) Float64x4
 
-// TruncWithPrecisionMasked truncates elements with specified precision.
+// TruncScaledMasked truncates elements with specified precision.
 //
 // This operation is applied selectively under a write mask.
 //
 // prec is expected to be a constant; a non-constant value will trigger a runtime panic.
 //
 // Asm: VRNDSCALEPD, CPU Feature: AVX512F
-func (x Float64x8) TruncWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8
+func (x Float64x8) TruncScaledMasked(prec uint8, mask Mask64x8) Float64x8
+
+/* TruncScaledResidue */
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x4) TruncScaledResidue(prec uint8) Float32x4
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x8) TruncScaledResidue(prec uint8) Float32x8
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x16) TruncScaledResidue(prec uint8) Float32x16
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x2) TruncScaledResidue(prec uint8) Float64x2
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x4) TruncScaledResidue(prec uint8) Float64x4
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x8) TruncScaledResidue(prec uint8) Float64x8
+
+/* TruncScaledResidueMasked */
+
+// TruncScaledResidueMasked computes the difference after truncating with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x4) TruncScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4
+
+// TruncScaledResidueMasked computes the difference after truncating with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x8) TruncScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8
+
+// TruncScaledResidueMasked computes the difference after truncating with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512DQ
+func (x Float32x16) TruncScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16
+
+// TruncScaledResidueMasked computes the difference after truncating with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x2) TruncScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2
+
+// TruncScaledResidueMasked computes the difference after truncating with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x4) TruncScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4
+
+// TruncScaledResidueMasked computes the difference after truncating with specified precision.
+//
+// This operation is applied selectively under a write mask.
+//
+// prec is expected to be a constant; a non-constant value will trigger a runtime panic.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512DQ
+func (x Float64x8) TruncScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8
 
 /* UnsignedSignedQuadDotProdAccumulate */
 
index 4263b81cd734ab9bac164d42354e40a7b25d644e..c9fdfff0ffc61eb9eb34c9ddacfe20ea596bdf70 100644 (file)
@@ -89,20 +89,20 @@ func TestToInt32(t *testing.T) {
        testFloat32x8UnaryToInt32(t, simd.Float32x8.ConvertToInt32, toInt32Slice[float32])
 }
 
-func TestDiffWithCeilWithPrecision(t *testing.T) {
+func TestCeilScaledResidue(t *testing.T) {
        if !simd.HasAVX512() {
                t.Skip("Needs AVX512")
        }
        testFloat64x8UnaryFlaky(t,
-               func(x simd.Float64x8) simd.Float64x8 { return x.DiffWithCeilWithPrecision(0) },
+               func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(0) },
                map1(ceilResidueForPrecision[float64](0)),
                0.001)
        testFloat64x8UnaryFlaky(t,
-               func(x simd.Float64x8) simd.Float64x8 { return x.DiffWithCeilWithPrecision(1) },
+               func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(1) },
                map1(ceilResidueForPrecision[float64](1)),
                0.001)
        testFloat64x8Unary(t,
-               func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilWithPrecision(0)) },
+               func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilScaled(0)) },
                map1[float64](func(x float64) float64 { return x - math.Ceil(x) }))
 }
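
The Trunc analogue of this test would follow the same shape; a sketch reusing only helpers already present in this file (testFloat64x8Unary, map1) and methods renamed by this CL, with the test name itself being hypothetical:

func TestTruncScaledResidueIdentity(t *testing.T) {
        if !simd.HasAVX512() {
                t.Skip("Needs AVX512")
        }
        // Mirrors the Ceil case above: x - TruncScaled(0) should match math.Trunc.
        testFloat64x8Unary(t,
                func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.TruncScaled(0)) },
                map1[float64](func(x float64) float64 { return x - math.Trunc(x) }))
}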