From: David Chase Date: Thu, 4 Dec 2025 22:51:04 +0000 (-0500) Subject: [dev.simd] simd: add carryless multiply X-Git-Tag: go1.26rc1~1^2~45^2~4 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=3417b48b17d01cf170317d679aef10984cc1a4d0;p=gostls13.git [dev.simd] simd: add carryless multiply now with comments, and also a test. choice of data types, method names, etc, are all up for comment. It's NOT commutative, because of the immediate operand (unless we swap the bits of the immediate). Change-Id: I730a6938c6803d0b93544445db65eadc51783e42 Reviewed-on: https://go-review.googlesource.com/c/go/+/726963 Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI --- diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 465fb980a5..f6deba3ec1 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1232,6 +1232,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSHRDQ128, ssa.OpAMD64VPSHRDQ256, ssa.OpAMD64VPSHRDQ512, + ssa.OpAMD64VPCLMULQDQ128, + ssa.OpAMD64VPCLMULQDQ256, + ssa.OpAMD64VPCLMULQDQ512, ssa.OpAMD64VSHUFPS128, ssa.OpAMD64VSHUFPD128, ssa.OpAMD64VSHUFPS256, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 752fb9d9b0..649940497c 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -1333,6 +1333,9 @@ (blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM mask)) (blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM mask)) (blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM mask)) +(carrylessMultiplyUint64x2 ...) => (VPCLMULQDQ128 ...) +(carrylessMultiplyUint64x4 ...) => (VPCLMULQDQ256 ...) +(carrylessMultiplyUint64x8 ...) => (VPCLMULQDQ512 ...) (concatSelectedConstantFloat32x4 ...) => (VSHUFPS128 ...) 
(concatSelectedConstantFloat64x2 ...) => (VSHUFPD128 ...) (concatSelectedConstantInt32x4 ...) => (VSHUFPS128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 0727f626fb..f38d24fde7 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -1269,6 +1269,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPALIGNRMasked128", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPALIGNRMasked256", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPALIGNRMasked512", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPCLMULQDQ128", argLength: 2, reg: v21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPCLMULQDQ256", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPCLMULQDQ512", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPBMasked256", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 53bd208e34..a68d8c4122 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ 
-1301,6 +1301,9 @@ func simdGenericOps() []opData { {name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"}, {name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "carrylessMultiplyUint64x2", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "carrylessMultiplyUint64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "carrylessMultiplyUint64x8", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantFloat32x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantFloat64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantGroupedFloat32x8", argLength: 2, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 46a986a35a..83e7959218 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2510,6 +2510,9 @@ const ( OpAMD64VPALIGNRMasked128 OpAMD64VPALIGNRMasked256 OpAMD64VPALIGNRMasked512 + OpAMD64VPCLMULQDQ128 + OpAMD64VPCLMULQDQ256 + OpAMD64VPCLMULQDQ512 OpAMD64VPCMPB512 OpAMD64VPCMPBMasked128 OpAMD64VPCMPBMasked256 @@ -7448,6 +7451,9 @@ const ( OpTruncScaledResidueFloat64x2 OpTruncScaledResidueFloat64x4 OpTruncScaledResidueFloat64x8 + OpcarrylessMultiplyUint64x2 + OpcarrylessMultiplyUint64x4 + OpcarrylessMultiplyUint64x8 OpconcatSelectedConstantFloat32x4 OpconcatSelectedConstantFloat64x2 OpconcatSelectedConstantGroupedFloat32x8 @@ -39211,6 +39217,51 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPCLMULQDQ128", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPCLMULQDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 
2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPCLMULQDQ256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPCLMULQDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPCLMULQDQ512", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPCLMULQDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPCMPB512", auxType: auxUInt8, @@ -95848,6 +95899,24 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "carrylessMultiplyUint64x2", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "carrylessMultiplyUint64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "carrylessMultiplyUint64x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, { name: "concatSelectedConstantFloat32x4", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 99956c56a0..19f16e1cbb 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ 
b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -6307,6 +6307,15 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpblendMaskedInt64x8(v) case OpblendMaskedInt8x64: return rewriteValueAMD64_OpblendMaskedInt8x64(v) + case OpcarrylessMultiplyUint64x2: + v.Op = OpAMD64VPCLMULQDQ128 + return true + case OpcarrylessMultiplyUint64x4: + v.Op = OpAMD64VPCLMULQDQ256 + return true + case OpcarrylessMultiplyUint64x8: + v.Op = OpAMD64VPCLMULQDQ512 + return true case OpconcatSelectedConstantFloat32x4: v.Op = OpAMD64VSHUFPS128 return true diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 607311d3b2..8aa7fa4552 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -1309,6 +1309,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x2, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint64x4.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint64x8.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x8, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Float32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Float64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantInt32x4, 
types.TypeVec128, 0), sys.AMD64) diff --git a/src/simd/_gen/simdgen/ops/GaloisField/categories.yaml b/src/simd/_gen/simdgen/ops/GaloisField/categories.yaml index 2582462534..bf394ee1c7 100644 --- a/src/simd/_gen/simdgen/ops/GaloisField/categories.yaml +++ b/src/simd/_gen/simdgen/ops/GaloisField/categories.yaml @@ -19,3 +19,5 @@ documentation: !string |- // NAME computes element-wise GF(2^8) multiplication with // reduction polynomial x^8 + x^4 + x^3 + x + 1. +- go: carrylessMultiply + commutative: false diff --git a/src/simd/_gen/simdgen/ops/GaloisField/go.yaml b/src/simd/_gen/simdgen/ops/GaloisField/go.yaml index e86211cb46..6684bf76d0 100644 --- a/src/simd/_gen/simdgen/ops/GaloisField/go.yaml +++ b/src/simd/_gen/simdgen/ops/GaloisField/go.yaml @@ -30,3 +30,63 @@ - *uint8 out: - *uint8 + +- go: carrylessMultiply + documentation: !string |- + // NAME computes one of four possible Galois polynomial + // products of selected high and low halves of x and y, + // depending on the value of xyHiLo, returning the 128-bit + // product in the concatenated two elements of the result. + // Bit 0 selects the low (0) or high (1) element of x and + // bit 4 selects the low (0x00) or high (0x10) element of y. + asm: V?PCLMULQDQ + in: + - go: Uint64x2 + - go: Uint64x2 + - class: immediate + immOffset: 0 + name: xyHiLo + out: + - go: Uint64x2 + overwriteElementBits: 64 + hideMaskMethods: true + +- go: carrylessMultiply + documentation: !string |- + // NAME computes one of four possible Galois polynomial + // products of selected high and low halves of each of the two + // 128-bit lanes of x and y, depending on the value of xyHiLo, + // and returns the two 128-bit products in the result's lanes. + // Bit 0 selects the low (0) or high (1) elements of x's lanes and + // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes. 
+ asm: V?PCLMULQDQ + in: + - go: Uint64x4 + - go: Uint64x4 + - class: immediate + immOffset: 0 + name: xyHiLo + out: + - go: Uint64x4 + overwriteElementBits: 64 + hideMaskMethods: true + +- go: carrylessMultiply + documentation: !string |- + // NAME computes one of four possible Galois polynomial + // products of selected high and low halves of each of the four + // 128-bit lanes of x and y, depending on the value of xyHiLo, + // and returns the four 128-bit products in the result's lanes. + // Bit 0 selects the low (0) or high (1) elements of x's lanes and + // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes. + asm: V?PCLMULQDQ + in: + - go: Uint64x8 + - go: Uint64x8 + - class: immediate + immOffset: 0 + name: xyHiLo + out: + - go: Uint64x8 + overwriteElementBits: 64 + hideMaskMethods: true diff --git a/src/simd/_gen/simdgen/types.yaml b/src/simd/_gen/simdgen/types.yaml index 9dccd1e764..54b08c8fb1 100644 --- a/src/simd/_gen/simdgen/types.yaml +++ b/src/simd/_gen/simdgen/types.yaml @@ -83,6 +83,9 @@ in: !repeat - {class: vreg, go: Int64x4, base: "int", elemBits: 128, bits: 256, lanes: 4} - {class: vreg, go: Uint64x4, base: "uint", elemBits: 128, bits: 256, lanes: 4} +# Special for carryless multiply + - {class: vreg, go: Uint64x8, base: "uint", elemBits: 128, bits: 512, lanes: 8} + # Special shapes just to make VAES(ENC|DEC)(LAST)?512 work. # The elemBits field of these shapes are wrong, it would be overwritten by overwriteElemBits. - {class: vreg, go: Int8x32, base: "int", elemBits: 128, bits: 512, lanes: 32} diff --git a/src/simd/_gen/simdgen/xed.go b/src/simd/_gen/simdgen/xed.go index 9e9b67e77d..31a147a839 100644 --- a/src/simd/_gen/simdgen/xed.go +++ b/src/simd/_gen/simdgen/xed.go @@ -808,13 +808,14 @@ var cpuFeatureMap = map[cpuFeatureKey]string{ // the vector length suffix. 
// AVX-512 extension features - {"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG", - {"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI", - {"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2", - {"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI", - {"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI", - {"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ", - {"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES", + {"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG", + {"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI", + {"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2", + {"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI", + {"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI", + {"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ", + {"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES", + {"AVX512EVEX", "AVX512_VPCLMULQDQ"}: "AVX512VPCLMULQDQ", // AVX 10.2 (not yet supported) {"AVX512EVEX", "AVX10_2_RC"}: "ignore", diff --git a/src/simd/cpu.go b/src/simd/cpu.go index 7c348baedc..b115910fbe 100644 --- a/src/simd/cpu.go +++ b/src/simd/cpu.go @@ -95,6 +95,14 @@ func (X86Features) AVX512VNNI() bool { return cpu.X86.HasAVX512VNNI } +// AVX512VPCLMULQDQ returns whether the CPU supports the AVX512VPCLMULQDQ feature. +// +// AVX512VPCLMULQDQ is defined on all GOARCHes, but will only return true on +// GOARCH amd64. +func (X86Features) AVX512VPCLMULQDQ() bool { + return cpu.X86.HasAVX512VPCLMULQDQ +} + // AVX512VPOPCNTDQ returns whether the CPU supports the AVX512VPOPCNTDQ feature. 
// // AVX512VPOPCNTDQ is defined on all GOARCHes, but will only return true on diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index 0655f68c4e..f7538b8003 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -1194,3 +1194,21 @@ func TestPermuteScalarsLoGrouped(t *testing.T) { simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got) checkSlices(t, got, want) } + +func TestClMul(t *testing.T) { + var x = simd.LoadUint64x2Slice([]uint64{1, 5}) + var y = simd.LoadUint64x2Slice([]uint64{3, 9}) + + foo := func(v simd.Uint64x2, s []uint64) { + r := make([]uint64, 2, 2) + v.StoreSlice(r) + checkSlices[uint64](t, r, s) + } + + foo(x.CarrylessMultiply(0, 0, y), []uint64{3, 0}) + foo(x.CarrylessMultiply(0, 1, y), []uint64{9, 0}) + foo(x.CarrylessMultiply(1, 0, y), []uint64{15, 0}) + foo(x.CarrylessMultiply(1, 1, y), []uint64{45, 0}) + foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0}) + +} diff --git a/src/simd/ops_internal_amd64.go b/src/simd/ops_internal_amd64.go index e54c3b2006..6d6e84ffff 100644 --- a/src/simd/ops_internal_amd64.go +++ b/src/simd/ops_internal_amd64.go @@ -52,6 +52,44 @@ func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16 // Asm: VPBLENDMQ, CPU Feature: AVX512 func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8 +/* carrylessMultiply */ + +// carrylessMultiply computes one of four possible Galois polynomial +// products of selected high and low halves of x and y, +// depending on the value of xyHiLo, returning the 128-bit +// product in the concatenated two elements of the result. +// Bit 0 selects the low (0) or high (1) element of x and +// bit 4 selects the low (0x00) or high (0x10) element of y. +// +// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPCLMULQDQ, CPU Feature: AVX +func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2 + +// carrylessMultiply computes one of four possible Galois polynomial +// products of selected high and low halves of each of the two +// 128-bit lanes of x and y, depending on the value of xyHiLo, +// and returns the two 128-bit products in the result's lanes. +// Bit 0 selects the low (0) or high (1) elements of x's lanes and +// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes. +// +// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ +func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4 + +// carrylessMultiply computes one of four possible Galois polynomial +// products of selected high and low halves of each of the four +// 128-bit lanes of x and y, depending on the value of xyHiLo, +// and returns the four 128-bit products in the result's lanes. +// Bit 0 selects the low (0) or high (1) elements of x's lanes and +// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes. +// +// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ +func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8 + /* concatSelectedConstant */ // concatSelectedConstant concatenates selected elements from x and y into the lower and upper diff --git a/src/simd/shuffles_amd64.go b/src/simd/shuffles_amd64.go index b7472f7020..96323002a4 100644 --- a/src/simd/shuffles_amd64.go +++ b/src/simd/shuffles_amd64.go @@ -1266,3 +1266,75 @@ func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 { func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 { return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) } + +// CarrylessMultiply computes one of four possible carryless +// multiplications of selected high and low halves of x and y, +// depending on the values of a and b, returning the 128-bit +// product in the concatenated two elements of the result. +// a selects the low (0) or high (1) element of x and +// b selects the low (0) or high (1) element of y. +// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +// +// constant values of a and b will result in better performance, +// otherwise the intrinsic may translate into a jump table. 
+// +// Asm: VPCLMULQDQ, CPU Feature: AVX +func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 { + return x.carrylessMultiply(a&1+((b&1)<<4), y) +} + +// CarrylessMultiplyGrouped computes one of four possible carryless +// multiplications of selected high and low halves of each of the two +// 128-bit lanes of x and y, depending on the values of a and b, +// and returns the two 128-bit products in the result's lanes. +// a selects the low (0) or high (1) elements of x's lanes and +// b selects the low (0) or high (1) elements of y's lanes. +// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +// +// constant values of a and b will result in better performance, +// otherwise the intrinsic may translate into a jump table. +// +// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ +func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 { + return x.carrylessMultiply(a&1+((b&1)<<4), y) +} + +// CarrylessMultiplyGrouped computes one of four possible carryless +// multiplications of selected high and low halves of each of the four +// 128-bit lanes of x and y, depending on the values of a and b, +// and returns the four 128-bit products in the result's lanes. +// a selects the low (0) or high (1) elements of x's lanes and +// b selects the low (0) or high (1) elements of y's lanes. 
+// +// A carryless multiplication uses bitwise XOR instead of +// add-with-carry, for example (in base two): +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// This also models multiplication of polynomials with coefficients +// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = +// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds +// polynomial terms, but coefficients "add" with XOR.) +// +// constant values of a and b will result in better performance, +// otherwise the intrinsic may translate into a jump table. +// +// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ +func (x Uint64x8) CarrylessMultiplyGrouped(a, b uint8, y Uint64x8) Uint64x8 { + return x.carrylessMultiply(a&1+((b&1)<<4), y) +}