v.l2 = a.l2 + b.l2
v.l3 = a.l3 + b.l3
v.l4 = a.l4 + b.l4
- // Using the generic implementation here is actually faster than the
- // assembly. Probably because the body of this function is so simple that
- // the compiler can figure out better optimizations by inlining the carry
- // propagation.
- return v.carryPropagateGeneric()
+ return v.carryPropagate()
}
// Subtract sets v = a - b, and returns v.
+++ /dev/null
-// Copyright (c) 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !purego
-
-package field
-
-//go:noescape
-func carryPropagate(v *Element)
-
-func (v *Element) carryPropagate() *Element {
- carryPropagate(v)
- return v
-}
+++ /dev/null
-// Copyright (c) 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !purego
-
-#include "textflag.h"
-
-// carryPropagate works exactly like carryPropagateGeneric and uses the
-// same AND, ADD, and LSR+MADD instructions emitted by the compiler, but
-// avoids loading R0-R4 twice and uses LDP and STP.
-//
-// See https://golang.org/issues/43145 for the main compiler issue.
-//
-// func carryPropagate(v *Element)
-TEXT ·carryPropagate(SB),NOFRAME|NOSPLIT,$0-8
- MOVD v+0(FP), R20
-
- LDP 0(R20), (R0, R1)
- LDP 16(R20), (R2, R3)
- MOVD 32(R20), R4
-
- AND $0x7ffffffffffff, R0, R10
- AND $0x7ffffffffffff, R1, R11
- AND $0x7ffffffffffff, R2, R12
- AND $0x7ffffffffffff, R3, R13
- AND $0x7ffffffffffff, R4, R14
-
- ADD R0>>51, R11, R11
- ADD R1>>51, R12, R12
- ADD R2>>51, R13, R13
- ADD R3>>51, R14, R14
- // R4>>51 * 19 + R10 -> R10
- LSR $51, R4, R21
- MOVD $19, R22
- MADD R22, R10, R21, R10
-
- STP (R10, R11), 0(R20)
- STP (R12, R13), 16(R20)
- MOVD R14, 32(R20)
-
- RET
+++ /dev/null
-// Copyright (c) 2021 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !arm64 || purego
-
-package field
-
-func (v *Element) carryPropagate() *Element {
- return v.carryPropagateGeneric()
-}
c3 := shiftRightBy51(r3)
c4 := shiftRightBy51(r4)
- rr0 := r0.lo&maskLow51Bits + mul19(c4)
- rr1 := r1.lo&maskLow51Bits + c0
- rr2 := r2.lo&maskLow51Bits + c1
- rr3 := r3.lo&maskLow51Bits + c2
- rr4 := r4.lo&maskLow51Bits + c3
+	// Write the reduced limbs straight into v: from this point on only the
+	// precomputed r limbs and carries are read, so updating v's fields one at
+	// a time (instead of building rr0..rr4 and a temporary Element) cannot
+	// clobber a value that is still needed.
+	v.l0 = r0.lo&maskLow51Bits + mul19(c4)
+ v.l1 = r1.lo&maskLow51Bits + c0
+ v.l2 = r2.lo&maskLow51Bits + c1
+ v.l3 = r3.lo&maskLow51Bits + c2
+ v.l4 = r4.lo&maskLow51Bits + c3
// Now all coefficients fit into 64-bit registers but are still too large to
// be passed around as an Element. We therefore do one last carry chain,
// where the carries will be small enough to fit in the wiggle room above 2⁵¹.
- *v = Element{rr0, rr1, rr2, rr3, rr4}
v.carryPropagate()
}
c3 := shiftRightBy51(r3)
c4 := shiftRightBy51(r4)
- rr0 := r0.lo&maskLow51Bits + mul19(c4)
- rr1 := r1.lo&maskLow51Bits + c0
- rr2 := r2.lo&maskLow51Bits + c1
- rr3 := r3.lo&maskLow51Bits + c2
- rr4 := r4.lo&maskLow51Bits + c3
+	// As in feMulGeneric, store directly into v: only the precomputed r limbs
+	// and carries are read below, so no temporary rr0..rr4 Element is needed.
+	v.l0 = r0.lo&maskLow51Bits + mul19(c4)
+ v.l1 = r1.lo&maskLow51Bits + c0
+ v.l2 = r2.lo&maskLow51Bits + c1
+ v.l3 = r3.lo&maskLow51Bits + c2
+ v.l4 = r4.lo&maskLow51Bits + c3
- *v = Element{rr0, rr1, rr2, rr3, rr4}
v.carryPropagate()
}
-// carryPropagateGeneric brings the limbs below 52 bits by applying the reduction
+// carryPropagate brings the limbs below 52 bits by applying the reduction
// identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry.
-func (v *Element) carryPropagateGeneric() *Element {
- c0 := v.l0 >> 51
- c1 := v.l1 >> 51
- c2 := v.l2 >> 51
- c3 := v.l3 >> 51
- c4 := v.l4 >> 51
-
- // c4 is at most 64 - 51 = 13 bits, so c4*19 is at most 18 bits, and
+func (v *Element) carryPropagate() *Element {
+ // (l4>>51) is at most 64 - 51 = 13 bits, so (l4>>51)*19 is at most 18 bits, and
// the final l0 will be at most 52 bits. Similarly for the rest.
- v.l0 = v.l0&maskLow51Bits + mul19(c4)
- v.l1 = v.l1&maskLow51Bits + c0
- v.l2 = v.l2&maskLow51Bits + c1
- v.l3 = v.l3&maskLow51Bits + c2
- v.l4 = v.l4&maskLow51Bits + c3
+	// v.l0 is overwritten first, but its original carry (l0 >> 51) is still
+	// needed at the end of the chain to update v.l1, so stash it.
+	l0 := v.l0
+	v.l0 = v.l0&maskLow51Bits + mul19(v.l4>>51)
+ v.l4 = v.l4&maskLow51Bits + v.l3>>51
+ v.l3 = v.l3&maskLow51Bits + v.l2>>51
+ v.l2 = v.l2&maskLow51Bits + v.l1>>51
+ v.l1 = v.l1&maskLow51Bits + l0>>51
return v
}
}
}
-func TestCarryPropagate(t *testing.T) {
- asmLikeGeneric := func(a [5]uint64) bool {
- t1 := &Element{a[0], a[1], a[2], a[3], a[4]}
- t2 := &Element{a[0], a[1], a[2], a[3], a[4]}
-
- t1.carryPropagate()
- t2.carryPropagateGeneric()
-
- if *t1 != *t2 {
- t.Logf("got: %#v,\nexpected: %#v", t1, t2)
- }
-
- return *t1 == *t2 && isInBounds(t2)
- }
-
- if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
- t.Error(err)
- }
-
- if !asmLikeGeneric([5]uint64{0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}) {
- t.Errorf("failed for {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}")
- }
-}
-
func TestFeSquare(t *testing.T) {
asmLikeGeneric := func(a Element) bool {
t1 := a