[dev.simd] cmd/compile, simd: add VPSADBW

author Junyang Shao <shaojunyang@google.com>

Wed, 20 Aug 2025 18:42:52 +0000 (18:42 +0000)

committer Junyang Shao <shaojunyang@google.com>

Thu, 21 Aug 2025 17:01:46 +0000 (10:01 -0700)
author Junyang Shao <shaojunyang@google.com>
Wed, 20 Aug 2025 18:42:52 +0000 (18:42 +0000)
committer Junyang Shao <shaojunyang@google.com>
Thu, 21 Aug 2025 17:01:46 +0000 (10:01 -0700)
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go

index 03617d4a5dcbdfddcfff92f81a6ac80e64fc1aad..5fc85457cf01251bb7676314d327a9653ee9e685 100644 (file)
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -368,6 +368,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPSUBUSW128,
                 ssa.OpAMD64VPSUBUSW256,
                 ssa.OpAMD64VPSUBUSW512,
+               ssa.OpAMD64VPSADBW128,
+               ssa.OpAMD64VPSADBW256,
+               ssa.OpAMD64VPSADBW512,
                 ssa.OpAMD64VPXOR128,
                 ssa.OpAMD64VPXOR256,
                 ssa.OpAMD64VPXORD512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules

index d5be221c0e47f401aa66297cf28a7f3bf3468f4b..d7bab7b050f1817b0cff087b0dcc0d665399c74e 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -1048,6 +1048,9 @@
  (SubSaturatedUint16x8 ...) => (VPSUBUSW128 ...)
  (SubSaturatedUint16x16 ...) => (VPSUBUSW256 ...)
  (SubSaturatedUint16x32 ...) => (VPSUBUSW512 ...)
+(SumAbsDiffUint8x16 ...) => (VPSADBW128 ...)
+(SumAbsDiffUint8x32 ...) => (VPSADBW256 ...)
+(SumAbsDiffUint8x64 ...) => (VPSADBW512 ...)
  (TruncFloat32x4 x) => (VROUNDPS128 [3] x)
  (TruncFloat32x8 x) => (VROUNDPS256 [3] x)
  (TruncFloat64x2 x) => (VROUNDPD128 [3] x)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go

index 171ae59e32a2d0a662063088f703a244a396c019..7782b43cf5bda13b66a6f08d08094ed07a8f1f09 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -652,6 +652,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPRORVQMasked128", argLength: 3, reg: w2kw, asm: "VPRORVQ", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPRORVQMasked256", argLength: 3, reg: w2kw, asm: "VPRORVQ", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPRORVQMasked512", argLength: 3, reg: w2kw, asm: "VPRORVQ", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPSADBW128", argLength: 2, reg: v21, asm: "VPSADBW", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPSADBW256", argLength: 2, reg: v21, asm: "VPSADBW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPSADBW512", argLength: 2, reg: w21, asm: "VPSADBW", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSHLDVD128", argLength: 3, reg: w31, asm: "VPSHLDVD", commutative: false, typ: "Vec128", resultInArg0: true},
                 {name: "VPSHLDVD256", argLength: 3, reg: w31, asm: "VPSHLDVD", commutative: false, typ: "Vec256", resultInArg0: true},
                 {name: "VPSHLDVD512", argLength: 3, reg: w31, asm: "VPSHLDVD", commutative: false, typ: "Vec512", resultInArg0: true},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go

index 4f9877aa0344b330fb337de9c493590c5fa73754..4844d8fc0cede273f0773351b0b0b92ea15897d8 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -894,6 +894,9 @@ func simdGenericOps() []opData {
                 {name: "SubUint64x2", argLength: 2, commutative: false},
                 {name: "SubUint64x4", argLength: 2, commutative: false},
                 {name: "SubUint64x8", argLength: 2, commutative: false},
+               {name: "SumAbsDiffUint8x16", argLength: 2, commutative: false},
+               {name: "SumAbsDiffUint8x32", argLength: 2, commutative: false},
+               {name: "SumAbsDiffUint8x64", argLength: 2, commutative: false},
                 {name: "TruncFloat32x4", argLength: 1, commutative: false},
                 {name: "TruncFloat32x8", argLength: 1, commutative: false},
                 {name: "TruncFloat64x2", argLength: 1, commutative: false},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go

index 8375b3f8a68e2220ce9e7e95f3be9aae96e63903..c5402c6f17f0af664988e5ecf2356872312024a8 100644 (file)
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1875,6 +1875,9 @@ const (
         OpAMD64VPRORVQMasked128
         OpAMD64VPRORVQMasked256
         OpAMD64VPRORVQMasked512
+       OpAMD64VPSADBW128
+       OpAMD64VPSADBW256
+       OpAMD64VPSADBW512
         OpAMD64VPSHLDVD128
         OpAMD64VPSHLDVD256
         OpAMD64VPSHLDVD512
@@ -5544,6 +5547,9 @@ const (
         OpSubUint64x2
         OpSubUint64x4
         OpSubUint64x8
+       OpSumAbsDiffUint8x16
+       OpSumAbsDiffUint8x32
+       OpSumAbsDiffUint8x64
         OpTruncFloat32x4
         OpTruncFloat32x8
         OpTruncFloat64x2
@@ -28457,6 +28463,48 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPSADBW128",
+               argLen: 2,
+               asm:    x86.AVPSADBW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPSADBW256",
+               argLen: 2,
+               asm:    x86.AVPSADBW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPSADBW512",
+               argLen: 2,
+               asm:    x86.AVPSADBW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
         {
                 name:         "VPSHLDVD128",
                 argLen:       3,
@@ -67898,6 +67946,21 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "SumAbsDiffUint8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SumAbsDiffUint8x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SumAbsDiffUint8x64",
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "TruncFloat32x4",
                 argLen:  1,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go

index 924fc2ecf696f772c8c431061b772ebb6f0a4acd..70c773bc1cbe73feded41c2757e6b8a0827f2ad2 100644 (file)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -4123,6 +4123,15 @@ func rewriteValueAMD64(v *Value) bool {
         case OpSubUint8x64:
                 v.Op = OpAMD64VPSUBB512
                 return true
+       case OpSumAbsDiffUint8x16:
+               v.Op = OpAMD64VPSADBW128
+               return true
+       case OpSumAbsDiffUint8x32:
+               v.Op = OpAMD64VPSADBW256
+               return true
+       case OpSumAbsDiffUint8x64:
+               v.Op = OpAMD64VPSADBW512
+               return true
         case OpTailCall:
                 v.Op = OpAMD64CALLtail
                 return true
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go

index 0fd330779eab1f66fee8cf6f4b9d5791553931ad..676cfa9032f3fafe990463bf04d39ca6725cd035 100644 (file)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -1024,6 +1024,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Uint16x8.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x8, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Uint16x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x16, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Uint16x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x16.SumAbsDiff", opLen2(ssa.OpSumAbsDiffUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x32.SumAbsDiff", opLen2(ssa.OpSumAbsDiffUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x64.SumAbsDiff", opLen2(ssa.OpSumAbsDiffUint8x64, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Float32x4.Trunc", opLen1(ssa.OpTruncFloat32x4, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Float32x8.Trunc", opLen1(ssa.OpTruncFloat32x8, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Float64x2.Trunc", opLen1(ssa.OpTruncFloat64x2, types.TypeVec128), sys.AMD64)
diff --git a/src/simd/_gen/simdgen/ops/MLOps/categories.yaml b/src/simd/_gen/simdgen/ops/MLOps/categories.yaml

index 97381e1e34786562a87dd0a4b02ade830ac1fb1f..8e1ffeb131ac6c315fdd53df0ded848ff44a0e76 100644 (file)
--- a/src/simd/_gen/simdgen/ops/MLOps/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/MLOps/categories.yaml
@@ -45,3 +45,9 @@
    commutative: false
    documentation: !string |-
      // NAME performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
+- go: SumAbsDiff
+  commutative: false
+  documentation: !string |-
+    // NAME sums the absolute distance of the two input vectors, each adjacent 8 bytes as a group. The output sum will
+    // be a vector of word-sized elements in which element 4*n holds the sum of the n-th input group and all other elements are zero.
+    // This method could be seen as the norm of the L1 distance of each adjacent 8-byte vector group of the two input vectors.
diff --git a/src/simd/_gen/simdgen/ops/MLOps/go.yaml b/src/simd/_gen/simdgen/ops/MLOps/go.yaml

index f6b6f135b8c338481f3fa4df51e77149e7d3d28e..5c2009dcf81ffa94c34f39e1853b2380bfbebbdc 100644 (file)
--- a/src/simd/_gen/simdgen/ops/MLOps/go.yaml
+++ b/src/simd/_gen/simdgen/ops/MLOps/go.yaml
@@ -110,4 +110,14 @@
    - *fma_op
    - *fma_op
    out:
-  - *fma_op
-\ No newline at end of file
+  - *fma_op
+- go: SumAbsDiff
+  asm: "VPSADBW"
+  in:
+  - go: $t
+    base: uint
+  - go: $t
+    base: uint
+  out:
+  - go: $t2
+    base: uint
+\ No newline at end of file
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go

index 019f9df1ed62df651c5cada6a0829c53ec33bcb8..4cfebb3a772fea37aee6e54d5cc38fb0afe7fdaf 100644 (file)
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -5842,6 +5842,29 @@ func (x Uint16x16) SubSaturated(y Uint16x16) Uint16x16
  // Asm: VPSUBUSW, CPU Feature: AVX512
  func (x Uint16x32) SubSaturated(y Uint16x32) Uint16x32
  
+/* SumAbsDiff */
+
+// SumAbsDiff sums the absolute distance of the two input vectors, each adjacent 8 bytes as a group. The output sum will
+// be a vector of word-sized elements in which element 4*n holds the sum of the n-th input group and all other elements are zero.
+// This method could be seen as the norm of the L1 distance of each adjacent 8-byte vector group of the two input vectors.
+//
+// Asm: VPSADBW, CPU Feature: AVX
+func (x Uint8x16) SumAbsDiff(y Uint8x16) Uint16x8
+
+// SumAbsDiff sums the absolute distance of the two input vectors, each adjacent 8 bytes as a group. The output sum will
+// be a vector of word-sized elements in which element 4*n holds the sum of the n-th input group and all other elements are zero.
+// This method could be seen as the norm of the L1 distance of each adjacent 8-byte vector group of the two input vectors.
+//
+// Asm: VPSADBW, CPU Feature: AVX2
+func (x Uint8x32) SumAbsDiff(y Uint8x32) Uint16x16
+
+// SumAbsDiff sums the absolute distance of the two input vectors, each adjacent 8 bytes as a group. The output sum will
+// be a vector of word-sized elements in which element 4*n holds the sum of the n-th input group and all other elements are zero.
+// This method could be seen as the norm of the L1 distance of each adjacent 8-byte vector group of the two input vectors.
+//
+// Asm: VPSADBW, CPU Feature: AVX512
+func (x Uint8x64) SumAbsDiff(y Uint8x64) Uint16x32
+
  /* Trunc */
  
  // Trunc truncates elements towards zero.
author	Junyang Shao <shaojunyang@google.com>
	Wed, 20 Aug 2025 18:42:52 +0000 (18:42 +0000)
committer	Junyang Shao <shaojunyang@google.com>
	Thu, 21 Aug 2025 17:01:46 +0000 (10:01 -0700)
src/cmd/compile/internal/amd64/simdssa.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go		patch \| blob \| history
src/cmd/compile/internal/ssa/opGen.go		patch \| blob \| history
src/cmd/compile/internal/ssa/rewriteAMD64.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/simdintrinsics.go		patch \| blob \| history
src/simd/_gen/simdgen/ops/MLOps/categories.yaml		patch \| blob \| history
src/simd/_gen/simdgen/ops/MLOps/go.yaml		patch \| blob \| history
src/simd/ops_amd64.go		patch \| blob \| history