From: Junyang Shao <shaojunyang@google.com>
Date: Thu, 22 Jan 2026 17:44:32 +0000 (+0000)
Subject: [release-branch.go1.26] cmd/compile, simd: capture VAES instructions and fix AVX512VA... 
X-Git-Tag: go1.26rc3~9
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=0ba9ea551d8e4e6be2baa37de87340f7598c753b;p=gostls13.git

[release-branch.go1.26] cmd/compile, simd: capture VAES instructions and fix AVX512VAES feature

The code previously filters out VAES-only instructions, this CL added
them back.

This CL added the VAES feature check following the Intel xed data:

  XED_ISA_SET_VAES:              vaes.7.0.ecx.9 # avx.1.0.ecx.28

This CL also found out that the old AVX512VAES feature check is not
checking the correct bits, it also fixes it:

  XED_ISA_SET_AVX512_VAES_128:    vaes.7.0.ecx.9  aes.1.0.ecx.25  avx512f.7.0.ebx.16 avx512vl.7.0.ebx.31
  XED_ISA_SET_AVX512_VAES_256:    vaes.7.0.ecx.9  aes.1.0.ecx.25  avx512f.7.0.ebx.16 avx512vl.7.0.ebx.31
  XED_ISA_SET_AVX512_VAES_512:    vaes.7.0.ecx.9  aes.1.0.ecx.25  avx512f.7.0.ebx.16

It restricts to the most strict common set - includes avx512vl for even
512-bits although it doesn't requires it.

Change-Id: I4e2f72b312fd2411589fbc12f9ee5c63c09c2e9a
Reviewed-on: https://go-review.googlesource.com/c/go/+/738500
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
(cherry picked from commit 83b232b0af1bd498d3df099eb68e3b1e40df2527)
Reviewed-on: https://go-review.googlesource.com/c/go/+/739922
Reviewed-by: Junyang Shao <shaojunyang@google.com>
---

diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
index 339ec7fce3..bc9aa22104 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -28,16 +28,16 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VADDSUBPS128", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VADDSUBPS256", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VAESDEC128", argLength: 2, reg: v21, asm: "VAESDEC", commutative: false, typ: "Vec128", resultInArg0: false},
-		{name: "VAESDEC256", argLength: 2, reg: w21, asm: "VAESDEC", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VAESDEC256", argLength: 2, reg: v21, asm: "VAESDEC", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VAESDEC512", argLength: 2, reg: w21, asm: "VAESDEC", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VAESDECLAST128", argLength: 2, reg: v21, asm: "VAESDECLAST", commutative: false, typ: "Vec128", resultInArg0: false},
-		{name: "VAESDECLAST256", argLength: 2, reg: w21, asm: "VAESDECLAST", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VAESDECLAST256", argLength: 2, reg: v21, asm: "VAESDECLAST", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VAESDECLAST512", argLength: 2, reg: w21, asm: "VAESDECLAST", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VAESENC128", argLength: 2, reg: v21, asm: "VAESENC", commutative: false, typ: "Vec128", resultInArg0: false},
-		{name: "VAESENC256", argLength: 2, reg: w21, asm: "VAESENC", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VAESENC256", argLength: 2, reg: v21, asm: "VAESENC", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VAESENC512", argLength: 2, reg: w21, asm: "VAESENC", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VAESENCLAST128", argLength: 2, reg: v21, asm: "VAESENCLAST", commutative: false, typ: "Vec128", resultInArg0: false},
-		{name: "VAESENCLAST256", argLength: 2, reg: w21, asm: "VAESENCLAST", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VAESENCLAST256", argLength: 2, reg: v21, asm: "VAESENCLAST", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VAESENCLAST512", argLength: 2, reg: w21, asm: "VAESENCLAST", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VAESIMC128", argLength: 1, reg: v11, asm: "VAESIMC", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VBROADCASTSD256", argLength: 1, reg: v11, asm: "VBROADCASTSD", commutative: false, typ: "Vec256", resultInArg0: false},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index a84fc161e9..816b078605 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -21199,11 +21199,11 @@ var opcodeTable = [...]opInfo{
 		asm:    x86.AVAESDEC,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
-				{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
 			},
 			outputs: []outputInfo{
-				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
 			},
 		},
 	},
@@ -21241,11 +21241,11 @@ var opcodeTable = [...]opInfo{
 		asm:    x86.AVAESDECLAST,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
-				{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
 			},
 			outputs: []outputInfo{
-				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
 			},
 		},
 	},
@@ -21283,11 +21283,11 @@ var opcodeTable = [...]opInfo{
 		asm:    x86.AVAESENC,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
-				{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
 			},
 			outputs: []outputInfo{
-				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
 			},
 		},
 	},
@@ -21325,11 +21325,11 @@ var opcodeTable = [...]opInfo{
 		asm:    x86.AVAESENCLAST,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
-				{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
 			},
 			outputs: []outputInfo{
-				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
 			},
 		},
 	},
diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go
index 4dffeadb22..52459be3bb 100644
--- a/src/internal/cpu/cpu.go
+++ b/src/internal/cpu/cpu.go
@@ -56,6 +56,7 @@ var X86 struct {
 	HasSSSE3            bool
 	HasSSE41            bool
 	HasSSE42            bool
+	HasVAES             bool
 	_                   CacheLinePad
 }
 
diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go
index 0e97651e3d..3c0a0adf28 100644
--- a/src/internal/cpu/cpu_x86.go
+++ b/src/internal/cpu/cpu_x86.go
@@ -28,7 +28,7 @@ const (
 	cpuid_AVX512VBMI2     = 1 << 6
 	cpuid_SSSE3           = 1 << 9
 	cpuid_AVX512GFNI      = 1 << 8
-	cpuid_AVX512VAES      = 1 << 9
+	cpuid_VAES            = 1 << 9
 	cpuid_AVX512VNNI      = 1 << 11
 	cpuid_AVX512BITALG    = 1 << 12
 	cpuid_FMA             = 1 << 12
@@ -173,6 +173,7 @@ func doinit() {
 	X86.HasERMS = isSet(ebx7, cpuid_ERMS)
 	X86.HasADX = isSet(ebx7, cpuid_ADX)
 	X86.HasSHA = isSet(ebx7, cpuid_SHA)
+	X86.HasVAES = isSet(ecx7, cpuid_VAES) && X86.HasAVX
 
 	X86.HasAVX512F = isSet(ebx7, cpuid_AVX512F) && osSupportsAVX512
 	if X86.HasAVX512F {
@@ -185,7 +186,7 @@ func doinit() {
 		X86.HasAVX512VPOPCNTDQ = isSet(ecx7, cpuid_AVX512VPOPCNTDQ)
 		X86.HasAVX512VBMI = isSet(ecx7, cpuid_AVX512VBMI)
 		X86.HasAVX512VBMI2 = isSet(ecx7, cpuid_AVX512VBMI2)
-		X86.HasAVX512VAES = isSet(ecx7, cpuid_AVX512VAES)
+		X86.HasAVX512VAES = isSet(ecx7, cpuid_VAES) && X86.HasAES && isSet(ebx7, cpuid_AVX512VL)
 		X86.HasAVX512VNNI = isSet(ecx7, cpuid_AVX512VNNI)
 		X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ)
 		X86.HasAVX512VBMI = isSet(ecx7, cpuid_AVX512_VBMI)
diff --git a/src/simd/archsimd/_gen/simdgen/xed.go b/src/simd/archsimd/_gen/simdgen/xed.go
index 5d6fac64d0..2175fa4a8d 100644
--- a/src/simd/archsimd/_gen/simdgen/xed.go
+++ b/src/simd/archsimd/_gen/simdgen/xed.go
@@ -77,7 +77,8 @@ func loadXED(xedPath string) []*unify.Value {
 		switch {
 		case inst.RealOpcode == "N":
 			return // Skip unstable instructions
-		case !(strings.HasPrefix(inst.Extension, "AVX") || strings.HasPrefix(inst.Extension, "SHA") || inst.Extension == "FMA"):
+		case !(strings.HasPrefix(inst.Extension, "AVX") || strings.HasPrefix(inst.Extension, "SHA") ||
+			inst.Extension == "FMA" || inst.Extension == "VAES"):
 			// We're only interested in AVX and SHA instructions.
 			return
 		}
@@ -796,6 +797,7 @@ var cpuFeatureMap = map[string]string{
 	"AVXAES":   "AVXAES",
 	"SHA":      "SHA",
 	"FMA":      "FMA",
+	"VAES":     "VAES",
 
 	// AVX-512 foundational features. We combine all of these into one "AVX512" feature.
 	"AVX512F":  "AVX512",
@@ -829,6 +831,7 @@ func init() {
 
 		"AVXAES": {Virtual: true, Implies: []string{"AVX", "AES"}},
 		"FMA":    {Implies: []string{"AVX"}},
+		"VAES":   {Implies: []string{"AVX"}},
 
 		// AVX-512 subfeatures.
 		"AVX512BITALG":    {Implies: []string{"AVX512"}},
diff --git a/src/simd/archsimd/cpu.go b/src/simd/archsimd/cpu.go
index 8069ee7f26..a25556ae65 100644
--- a/src/simd/archsimd/cpu.go
+++ b/src/simd/archsimd/cpu.go
@@ -158,3 +158,13 @@ func (X86Features) FMA() bool {
 func (X86Features) SHA() bool {
 	return cpu.X86.HasSHA
 }
+
+// VAES returns whether the CPU supports the VAES feature.
+//
+// If it returns true, then the CPU also supports AVX.
+//
+// VAES is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) VAES() bool {
+	return cpu.X86.HasVAES
+}
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index ec50cc72c5..9c5bb22eee 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -19,7 +19,7 @@ func (x Uint8x16) AESDecryptLastRound(y Uint32x4) Uint8x16
 // y is the chunk of dw array in use.
 // result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
 //
-// Asm: VAESDECLAST, CPU Feature: AVX512VAES
+// Asm: VAESDECLAST, CPU Feature: VAES
 func (x Uint8x32) AESDecryptLastRound(y Uint32x8) Uint8x32
 
 // AESDecryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
@@ -45,7 +45,7 @@ func (x Uint8x16) AESDecryptOneRound(y Uint32x4) Uint8x16
 // y is the chunk of dw array in use.
 // result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
 //
-// Asm: VAESDEC, CPU Feature: AVX512VAES
+// Asm: VAESDEC, CPU Feature: VAES
 func (x Uint8x32) AESDecryptOneRound(y Uint32x8) Uint8x32
 
 // AESDecryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
@@ -71,7 +71,7 @@ func (x Uint8x16) AESEncryptLastRound(y Uint32x4) Uint8x16
 // y is the chunk of w array in use.
 // result = AddRoundKey((ShiftRows(SubBytes(x))), y)
 //
-// Asm: VAESENCLAST, CPU Feature: AVX512VAES
+// Asm: VAESENCLAST, CPU Feature: VAES
 func (x Uint8x32) AESEncryptLastRound(y Uint32x8) Uint8x32
 
 // AESEncryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
@@ -97,7 +97,7 @@ func (x Uint8x16) AESEncryptOneRound(y Uint32x4) Uint8x16
 // y is the chunk of w array in use.
 // result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
 //
-// Asm: VAESENC, CPU Feature: AVX512VAES
+// Asm: VAESENC, CPU Feature: VAES
 func (x Uint8x32) AESEncryptOneRound(y Uint32x8) Uint8x32
 
 // AESEncryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.