benchmark old ns/op new ns/op delta
BenchmarkAddVV_1 18.7 14.8 -20.86%
BenchmarkAddVV_2 21.8 16.6 -23.85%
BenchmarkAddVV_3 26.1 17.1 -34.48%
BenchmarkAddVV_4 30.4 21.9 -27.96%
BenchmarkAddVV_5 35.5 19.8 -44.23%
BenchmarkAddVV_1e1 63.0 28.3 -55.08%
BenchmarkAddVV_1e2 593 178 -69.98%
BenchmarkAddVV_1e3 5691 1490 -73.82%
BenchmarkAddVV_1e4 56868 20761 -63.49%
BenchmarkAddVV_1e5 569062 207679 -63.51%
BenchmarkAddVW_1 15.8 12.6 -20.25%
BenchmarkAddVW_2 17.8 13.1 -26.40%
BenchmarkAddVW_3 21.2 13.9 -34.43%
BenchmarkAddVW_4 23.6 14.7 -37.71%
BenchmarkAddVW_5 26.0 15.8 -39.23%
BenchmarkAddVW_1e1 41.3 21.6 -47.70%
BenchmarkAddVW_1e2 383 145 -62.14%
BenchmarkAddVW_1e3 3703 1264 -65.87%
BenchmarkAddVW_1e4 36920 14359 -61.11%
BenchmarkAddVW_1e5 370345 143046 -61.37%
BenchmarkAddMulVVW_1 33.2 32.5 -2.11%
BenchmarkAddMulVVW_2 58.0 57.2 -1.38%
BenchmarkAddMulVVW_3 95.2 93.9 -1.37%
BenchmarkAddMulVVW_4 108 106 -1.85%
BenchmarkAddMulVVW_5 159 156 -1.89%
BenchmarkAddMulVVW_1e1 344 340 -1.16%
BenchmarkAddMulVVW_1e2 3644 3624 -0.55%
BenchmarkAddMulVVW_1e3 37344 37208 -0.36%
BenchmarkAddMulVVW_1e4 373295 372170 -0.30%
BenchmarkAddMulVVW_1e5
3438116 3425606 -0.36%
BenchmarkBitLen0 7.21 4.32 -40.08%
BenchmarkBitLen1 6.49 4.32 -33.44%
BenchmarkBitLen2 7.23 4.32 -40.25%
BenchmarkBitLen3 6.49 4.32 -33.44%
BenchmarkBitLen4 7.22 4.32 -40.17%
BenchmarkBitLen5 6.52 4.33 -33.59%
BenchmarkBitLen8 7.22 4.32 -40.17%
BenchmarkBitLen9 6.49 4.32 -33.44%
BenchmarkBitLen16 8.66 4.32 -50.12%
BenchmarkBitLen17 7.95 4.32 -45.66%
BenchmarkBitLen31 8.69 4.32 -50.29%
BenchmarkGCD10x10 5021 5033 +0.24%
BenchmarkGCD10x100 5571 5572 +0.02%
BenchmarkGCD10x1000 6707 6729 +0.33%
BenchmarkGCD10x10000 13526 13419 -0.79%
BenchmarkGCD10x100000 85668 83242 -2.83%
BenchmarkGCD100x100 24196 23936 -1.07%
BenchmarkGCD100x1000 28802 27309 -5.18%
BenchmarkGCD100x10000 64111 51704 -19.35%
BenchmarkGCD100x100000 385840 274385 -28.89%
BenchmarkGCD1000x1000 262892 236269 -10.13%
BenchmarkGCD1000x10000 371393 277883 -25.18%
BenchmarkGCD1000x100000
1311795 589055 -55.10%
BenchmarkGCD10000x10000
9596740 6123930 -36.19%
BenchmarkGCD10000x100000
16404000 7269610 -55.68%
BenchmarkGCD100000x100000
776660000 419270000 -46.02%
BenchmarkHilbert
13478980 13402270 -0.57%
BenchmarkBinomial 9802 9440 -3.69%
BenchmarkBitset 142 142 +0.00%
BenchmarkBitsetNeg 328 279 -14.94%
BenchmarkBitsetOrig 853 861 +0.94%
BenchmarkBitsetNegOrig 1489 1444 -3.02%
BenchmarkMul
420949000 410481000 -2.49%
BenchmarkExp3Power0x10 1148 1229 +7.06%
BenchmarkExp3Power0x40 1322 1376 +4.08%
BenchmarkExp3Power0x100 2437 2486 +2.01%
BenchmarkExp3Power0x400 9456 9346 -1.16%
BenchmarkExp3Power0x1000 113623 108701 -4.33%
BenchmarkExp3Power0x4000
1134933 1101481 -2.95%
BenchmarkExp3Power0x10000
10773570 10396160 -3.50%
BenchmarkExp3Power0x40000
101362100 97788300 -3.53%
BenchmarkExp3Power0x100000
921114000 885249000 -3.89%
BenchmarkExp3Power0x400000
8323094000 7969020000 -4.25%
BenchmarkFibo
322021600 92554450 -71.26%
BenchmarkScanPi
1264583 321065 -74.61%
BenchmarkStringPiParallel
1644661 554216 -66.30%
BenchmarkScan10Base2 1111 1080 -2.79%
BenchmarkScan100Base2 6645 6345 -4.51%
BenchmarkScan1000Base2 84084 62405 -25.78%
BenchmarkScan10000Base2
3105998 932551 -69.98%
BenchmarkScan100000Base2
257234800 40113333 -84.41%
BenchmarkScan10Base8 571 573 +0.35%
BenchmarkScan100Base8 2810 2543 -9.50%
BenchmarkScan1000Base8 47383 25834 -45.48%
BenchmarkScan10000Base8
2739518 567203 -79.30%
BenchmarkScan100000Base8
253952400 36495680 -85.63%
BenchmarkScan10Base10 553 556 +0.54%
BenchmarkScan100Base10 2640 2385 -9.66%
BenchmarkScan1000Base10 50865 24049 -52.72%
BenchmarkScan10000Base10
3279916 549313 -83.25%
BenchmarkScan100000Base10
309121000 36213140 -88.29%
BenchmarkScan10Base16 478 483 +1.05%
BenchmarkScan100Base16 2353 2144 -8.88%
BenchmarkScan1000Base16 48091 24246 -49.58%
BenchmarkScan10000Base16
2858886 586475 -79.49%
BenchmarkScan100000Base16
266320000 38190500 -85.66%
BenchmarkString10Base2 736 730 -0.82%
BenchmarkString100Base2 2695 2707 +0.45%
BenchmarkString1000Base2 20549 20388 -0.78%
BenchmarkString10000Base2 212638 210782 -0.87%
BenchmarkString100000Base2
1944963 1938033 -0.36%
BenchmarkString10Base8 524 517 -1.34%
BenchmarkString100Base8 1326 1320 -0.45%
BenchmarkString1000Base8 8213 8249 +0.44%
BenchmarkString10000Base8 72204 72092 -0.16%
BenchmarkString100000Base8 769068 765993 -0.40%
BenchmarkString10Base10 1018 982 -3.54%
BenchmarkString100Base10 3485 3206 -8.01%
BenchmarkString1000Base10 37102 18935 -48.97%
BenchmarkString10000Base10 188633 88637 -53.01%
BenchmarkString100000Base10
124490300 19700940 -84.17%
BenchmarkString10Base16 509 502 -1.38%
BenchmarkString100Base16 1084 1098 +1.29%
BenchmarkString1000Base16 5641 5650 +0.16%
BenchmarkString10000Base16 46900 46745 -0.33%
BenchmarkString100000Base16 508957 505840 -0.61%
BenchmarkLeafSize0
8934320 8149465 -8.78%
BenchmarkLeafSize1 237666 118381 -50.19%
BenchmarkLeafSize2 237807 117854 -50.44%
BenchmarkLeafSize3
1688640 353494 -79.07%
BenchmarkLeafSize4 235676 116196 -50.70%
BenchmarkLeafSize5
2121896 430325 -79.72%
BenchmarkLeafSize6
1682306 351775 -79.09%
BenchmarkLeafSize7
1051847 251436 -76.10%
BenchmarkLeafSize8 232697 115674 -50.29%
BenchmarkLeafSize9
2403616 488443 -79.68%
BenchmarkLeafSize10
2120975 429545 -79.75%
BenchmarkLeafSize11
2023789 426525 -78.92%
BenchmarkLeafSize12
1684830 351985 -79.11%
BenchmarkLeafSize13
1465529 337906 -76.94%
BenchmarkLeafSize14
1050498 253872 -75.83%
BenchmarkLeafSize15 683228 197384 -71.11%
BenchmarkLeafSize16 232496 116026 -50.10%
BenchmarkLeafSize32 245841 126671 -48.47%
BenchmarkLeafSize64 301728 190285 -36.93%
Change-Id: I63e63297896d96b89c9a275b893c2b405a7e105d
Reviewed-on: https://go-review.googlesource.com/9260
Reviewed-by: David Crawshaw <crawshaw@golang.org>
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
+// TODO: Consider re-implementing using Advanced SIMD
+// once the assembler supports those instructions.
+
+// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),NOSPLIT,$0
- B ·mulWW_g(SB)
+ MOVD x+0(FP), R0
+ MOVD y+8(FP), R1
+ MUL R0, R1, R2
+ UMULH R0, R1, R3
+ MOVD R3, z1+16(FP)
+ MOVD R2, z0+24(FP)
+ RET
+
+// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB),NOSPLIT,$0
- B ·divWW_g(SB)
+ B ·divWW_g(SB) // ARM64 has no multiword division
+
+// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),NOSPLIT,$0
- B ·addVV_g(SB)
+ MOVD z+0(FP), R3
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R1
+ MOVD y+48(FP), R2
+ ADDS $0, R0 // clear carry flag
+loop:
+ CBZ R0, done // careful not to touch the carry flag
+ MOVD.P 8(R1), R4
+ MOVD.P 8(R2), R5
+ ADCS R4, R5
+ MOVD.P R5, 8(R3)
+ SUB $1, R0
+ B loop
+done:
+ CSET HS, R0 // extract carry flag
+ MOVD R0, c+72(FP)
+ RET
+
+// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB),NOSPLIT,$0
- B ·subVV_g(SB)
+ MOVD z+0(FP), R3
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R1
+ MOVD y+48(FP), R2
+ CMP R0, R0 // set carry flag
+loop:
+ CBZ R0, done // careful not to touch the carry flag
+ MOVD.P 8(R1), R4
+ MOVD.P 8(R2), R5
+ SBCS R5, R4
+ MOVD.P R4, 8(R3)
+ SUB $1, R0
+ B loop
+done:
+ CSET LO, R0 // extract carry flag
+ MOVD R0, c+72(FP)
+ RET
+
+// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),NOSPLIT,$0
- B ·addVW_g(SB)
+ MOVD z+0(FP), R3
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R1
+ MOVD y+48(FP), R2
+ CBZ R0, return_y
+ MOVD.P 8(R1), R4
+ ADDS R2, R4
+ MOVD.P R4, 8(R3)
+ SUB $1, R0
+loop:
+ CBZ R0, done // careful not to touch the carry flag
+ MOVD.P 8(R1), R4
+ ADCS $0, R4
+ MOVD.P R4, 8(R3)
+ SUB $1, R0
+ B loop
+done:
+ CSET HS, R0 // extract carry flag
+ MOVD R0, c+56(FP)
+ RET
+return_y: // z is empty; copy y to c
+ MOVD R2, c+56(FP)
+ RET
+
+// func subVW(z, x []Word, y Word) (c Word)
TEXT ·subVW(SB),NOSPLIT,$0
- B ·subVW_g(SB)
+ MOVD z+0(FP), R3
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R1
+ MOVD y+48(FP), R2
+ CBZ R0, rety
+ MOVD.P 8(R1), R4
+ SUBS R2, R4
+ MOVD.P R4, 8(R3)
+ SUB $1, R0
+loop:
+ CBZ R0, done // careful not to touch the carry flag
+ MOVD.P 8(R1), R4
+ SBCS $0, R4
+ MOVD.P R4, 8(R3)
+ SUB $1, R0
+ B loop
+done:
+ CSET LO, R0 // extract carry flag
+ MOVD R0, c+56(FP)
+ RET
+rety: // z is empty; copy y to c
+ MOVD R2, c+56(FP)
+ RET
+
+// func shlVU(z, x []Word, s uint) (c Word)
TEXT ·shlVU(SB),NOSPLIT,$0
B ·shlVU_g(SB)
+
+// func shrVU(z, x []Word, s uint) (c Word)
TEXT ·shrVU(SB),NOSPLIT,$0
B ·shrVU_g(SB)
+
+// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
- B ·mulAddVWW_g(SB)
+ MOVD z+0(FP), R1
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R2
+ MOVD y+48(FP), R3
+ MOVD r+56(FP), R4
+loop:
+ CBZ R0, done
+ MOVD.P 8(R2), R5
+ UMULH R5, R3, R7
+ MUL R5, R3, R6
+ ADDS R4, R6
+ ADC $0, R7
+ MOVD.P R6, 8(R1)
+ MOVD R7, R4
+ SUB $1, R0
+ B loop
+done:
+ MOVD R4, c+64(FP)
+ RET
+
+// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB),NOSPLIT,$0
B ·addMulVVW_g(SB)
+
+// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),NOSPLIT,$0
B ·divWVW_g(SB)
+
+// func bitLen(x Word) (n int)
TEXT ·bitLen(SB),NOSPLIT,$0
- B ·bitLen_g(SB)
+ MOVD x+0(FP), R0
+ CLZ R0, R0
+ MOVD $64, R1
+ SUB R0, R1, R0
+ MOVD R0, n+8(FP)
+ RET