Benchmarks run on 3.06GHz Intel Core 2 Duo,
4GB 800MHz DDR2 SDRAM ("iMac").
benchmark old ns/op new ns/op delta
BenchmarkAddVV_1 6 6 +2.75%
BenchmarkAddVV_2 9 7 -19.71%
BenchmarkAddVV_3 9 9 +2.25%
BenchmarkAddVV_4 10 8 -20.46%
BenchmarkAddVV_5 12 10 -19.53%
BenchmarkAddVV_1e1 23 15 -32.48%
BenchmarkAddVV_1e2 213 107 -49.77%
BenchmarkAddVV_1e3 2088 993 -52.44%
BenchmarkAddVV_1e4 20874 12027 -42.38%
BenchmarkAddVV_1e5 209858 121480 -42.11%
BenchmarkAddVW_1 5 5 +0.90%
BenchmarkAddVW_2 11 11 -3.51%
BenchmarkAddVW_3 7 7 -0.27%
BenchmarkAddVW_4 8 7 -6.32%
BenchmarkAddVW_5 9 8 -10.89%
BenchmarkAddVW_1e1 17 12 -26.01%
BenchmarkAddVW_1e2 155 89 -42.32%
BenchmarkAddVW_1e3 1479 873 -40.97%
BenchmarkAddVW_1e4 13838 8764 -36.67%
BenchmarkAddVW_1e5 147353 89560 -39.22%
benchmark old MB/s new MB/s speedup
BenchmarkAddVV_1 9765.57 9508.55 0.97x
BenchmarkAddVV_2 13077.63 16284.97 1.25x
BenchmarkAddVV_3 20599.58 20156.67 0.98x
BenchmarkAddVV_4 23591.58 29516.02 1.25x
BenchmarkAddVV_5 24920.95 31194.10 1.25x
BenchmarkAddVV_1e1 27393.76 40621.71 1.48x
BenchmarkAddVV_1e2 29911.96 59592.99 1.99x
BenchmarkAddVV_1e3 30650.73 64429.84 2.10x
BenchmarkAddVV_1e4 30660.09 53213.08 1.74x
BenchmarkAddVV_1e5 30496.74 52683.46 1.73x
BenchmarkAddVW_1 11503.39 11405.98 0.99x
BenchmarkAddVW_2 11203.56 11586.92 1.03x
BenchmarkAddVW_3 26173.45 26224.75 1.00x
BenchmarkAddVW_4 30560.30 32621.94 1.07x
BenchmarkAddVW_5 33183.81 37269.94 1.12x
BenchmarkAddVW_1e1 36991.75 50098.53 1.35x
BenchmarkAddVW_1e2 41087.14 71549.93 1.74x
BenchmarkAddVW_1e3 43266.42 73279.83 1.69x
BenchmarkAddVW_1e4 46246.74 73021.97 1.58x
BenchmarkAddVW_1e5 43433.00 71459.96 1.65x
Benchmarks run on 2.8GHz Quad-Code Intel Xeon,
4GB 800MHz DDR2 FB-DIMM ("PowerMac").
benchmark old ns/op new ns/op delta
BenchmarkAddVV_1 7 7 +2.51%
BenchmarkAddVV_2 8 8 +3.70%
BenchmarkAddVV_3 10 10 +4.00%
BenchmarkAddVV_4 11 9 -19.49%
BenchmarkAddVV_5 14 11 -18.44%
BenchmarkAddVV_1e1 23 17 -27.00%
BenchmarkAddVV_1e2 234 117 -50.00%
BenchmarkAddVV_1e3 2284 1095 -52.06%
BenchmarkAddVV_1e4 22906 13149 -42.60%
BenchmarkAddVV_1e5 229860 135133 -41.21%
BenchmarkAddVW_1 6 6 +1.15%
BenchmarkAddVW_2 7 7 +1.37%
BenchmarkAddVW_3 7 8 +1.00%
BenchmarkAddVW_4 9 8 -6.93%
BenchmarkAddVW_5 10 9 -13.21%
BenchmarkAddVW_1e1 18 14 -24.32%
BenchmarkAddVW_1e2 170 97 -42.41%
BenchmarkAddVW_1e3 1619 953 -41.14%
BenchmarkAddVW_1e4 15142 9776 -35.44%
BenchmarkAddVW_1e5 160835 102396 -36.33%
benchmark old MB/s new MB/s speedup
BenchmarkAddVV_1 8928.95 8702.84 0.97x
BenchmarkAddVV_2 15298.84 14739.60 0.96x
BenchmarkAddVV_3 19116.52 18375.37 0.96x
BenchmarkAddVV_4 21644.30 26935.44 1.24x
BenchmarkAddVV_5 22771.64 27754.04 1.22x
BenchmarkAddVV_1e1 27017.62 37050.89 1.37x
BenchmarkAddVV_1e2 27326.09 54289.15 1.99x
BenchmarkAddVV_1e3 28016.84 58428.83 2.09x
BenchmarkAddVV_1e4 27939.38 48670.55 1.74x
BenchmarkAddVV_1e5 27843.00 47360.54 1.70x
BenchmarkAddVW_1 10510.97 10397.27 0.99x
BenchmarkAddVW_2 17499.71 17279.03 0.99x
BenchmarkAddVW_3 24093.93 23858.39 0.99x
BenchmarkAddVW_4 27733.08 29799.42 1.07x
BenchmarkAddVW_5 30267.17 34781.83 1.15x
BenchmarkAddVW_1e1 34566.78 45629.88 1.32x
BenchmarkAddVW_1e2 37521.89 65341.93 1.74x
BenchmarkAddVW_1e3 39513.18 67153.67 1.70x
BenchmarkAddVW_1e4 42263.80 65464.60 1.55x
BenchmarkAddVW_1e5 39792.21 62501.88 1.57x
R=iant, remyoudompheng, nightlyone, minux.ma
CC=golang-dev
https://golang.org/cl/
6482062
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
-// TODO(gri) - experiment with unrolled loops for faster execution
-
// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),7,$0
MOVQ x+0(FP), AX
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),7,$0
- MOVQ z+0(FP), R10
+ MOVL n+8(FP), DI
MOVQ x+16(FP), R8
MOVQ y+32(FP), R9
- MOVL n+8(FP), R11
- MOVQ $0, BX // i = 0
- MOVQ $0, DX // c = 0
- JMP E1
-
-L1: MOVQ (R8)(BX*8), AX
- RCRQ $1, DX
- ADCQ (R9)(BX*8), AX
- RCLQ $1, DX
- MOVQ AX, (R10)(BX*8)
- ADDL $1, BX // i++
+ MOVQ z+0(FP), R10
-E1: CMPQ BX, R11 // i < n
- JL L1
+ MOVQ $0, CX // c = 0
+ MOVQ $0, SI // i = 0
- MOVQ DX, c+48(FP)
+ // uncomment the next line to disable the unrolled loop
+ // JMP V1
+
+ CMPQ DI, $4
+ JL V1 // if n < 4 goto V1
+
+U1: // n >= 4
+ // regular loop body unrolled 4x
+ MOVQ 0(R8)(SI*8), R11
+ MOVQ 8(R8)(SI*8), R12
+ MOVQ 16(R8)(SI*8), R13
+ MOVQ 24(R8)(SI*8), R14
+ RCRQ $1, CX // restore CF
+ ADCQ 0(R9)(SI*8), R11
+ ADCQ 8(R9)(SI*8), R12
+ ADCQ 16(R9)(SI*8), R13
+ ADCQ 24(R9)(SI*8), R14
+ RCLQ $1, CX // save CF
+ MOVQ R11, 0(R10)(SI*8)
+ MOVQ R12, 8(R10)(SI*8)
+ MOVQ R13, 16(R10)(SI*8)
+ MOVQ R14, 24(R10)(SI*8)
+
+ ADDQ $4, SI // i += 4
+ SUBQ $4, DI // n -= 4
+ CMPQ DI, $4
+ JGE U1 // if n >= 4 goto U1
+
+V1: CMPQ DI, $0
+ JLE E1 // if n <= 0 goto E1
+
+L1: // n > 0
+ MOVQ 0(R8)(SI*8), R11
+ RCRQ $1, CX // restore CF
+ ADCQ 0(R9)(SI*8), R11
+ RCLQ $1, CX // save CF
+ MOVQ R11, 0(R10)(SI*8)
+
+ ADDQ $1, SI // i++
+ SUBQ $1, DI // n--
+ JG L1 // if n > 0 goto L1
+
+E1: MOVQ CX, c+48(FP) // return c
RET
// func subVV(z, x, y []Word) (c Word)
-// (same as addVV_s except for SBBQ instead of ADCQ and label names)
+// (same as addVV except for SBBQ instead of ADCQ and label names)
TEXT ·subVV(SB),7,$0
- MOVQ z+0(FP), R10
+ MOVL n+8(FP), DI
MOVQ x+16(FP), R8
MOVQ y+32(FP), R9
- MOVL n+8(FP), R11
- MOVQ $0, BX // i = 0
- MOVQ $0, DX // c = 0
- JMP E2
-
-L2: MOVQ (R8)(BX*8), AX
- RCRQ $1, DX
- SBBQ (R9)(BX*8), AX
- RCLQ $1, DX
- MOVQ AX, (R10)(BX*8)
- ADDL $1, BX // i++
+ MOVQ z+0(FP), R10
-E2: CMPQ BX, R11 // i < n
- JL L2
+ MOVQ $0, CX // c = 0
+ MOVQ $0, SI // i = 0
- MOVQ DX, c+48(FP)
+ // uncomment the next line to disable the unrolled loop
+ // JMP V2
+
+ CMPQ DI, $4
+ JL V2 // if n < 4 goto V2
+
+U2: // n >= 4
+ // regular loop body unrolled 4x
+ MOVQ 0(R8)(SI*8), R11
+ MOVQ 8(R8)(SI*8), R12
+ MOVQ 16(R8)(SI*8), R13
+ MOVQ 24(R8)(SI*8), R14
+ RCRQ $1, CX // restore CF
+ SBBQ 0(R9)(SI*8), R11
+ SBBQ 8(R9)(SI*8), R12
+ SBBQ 16(R9)(SI*8), R13
+ SBBQ 24(R9)(SI*8), R14
+ RCLQ $1, CX // save CF
+ MOVQ R11, 0(R10)(SI*8)
+ MOVQ R12, 8(R10)(SI*8)
+ MOVQ R13, 16(R10)(SI*8)
+ MOVQ R14, 24(R10)(SI*8)
+
+ ADDQ $4, SI // i += 4
+ SUBQ $4, DI // n -= 4
+ CMPQ DI, $4
+ JGE U2 // if n >= 4 goto U2
+
+V2: CMPQ DI, $0
+ JLE E2 // if n <= 0 goto E2
+
+L2: // n > 0
+ MOVQ 0(R8)(SI*8), R11
+ RCRQ $1, CX // restore CF
+ SBBQ 0(R9)(SI*8), R11
+ RCLQ $1, CX // save CF
+ MOVQ R11, 0(R10)(SI*8)
+
+ ADDQ $1, SI // i++
+ SUBQ $1, DI // n--
+ JG L2 // if n > 0 goto L2
+
+E2: MOVQ CX, c+48(FP) // return c
RET
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),7,$0
- MOVQ z+0(FP), R10
+ MOVL n+8(FP), DI
MOVQ x+16(FP), R8
- MOVQ y+32(FP), AX // c = y
- MOVL n+8(FP), R11
- MOVQ $0, BX // i = 0
- JMP E3
-
-L3: ADDQ (R8)(BX*8), AX
- MOVQ AX, (R10)(BX*8)
- RCLQ $1, AX
- ANDQ $1, AX
- ADDL $1, BX // i++
-
-E3: CMPQ BX, R11 // i < n
- JL L3
+ MOVQ y+32(FP), CX // c = y
+ MOVQ z+0(FP), R10
+
+ MOVQ $0, SI // i = 0
- MOVQ AX, c+40(FP)
+ // uncomment the next line to disable the unrolled loop
+ // JMP V3
+
+ CMPQ DI, $4
+ JL V3 // if n < 4 goto V3
+
+U3: // n >= 4
+ // regular loop body unrolled 4x
+ MOVQ 0(R8)(SI*8), R11
+ MOVQ 8(R8)(SI*8), R12
+ MOVQ 16(R8)(SI*8), R13
+ MOVQ 24(R8)(SI*8), R14
+ ADDQ CX, R11
+ ADCQ $0, R12
+ ADCQ $0, R13
+ ADCQ $0, R14
+ RCLQ $1, CX
+ ANDQ $1, CX
+ MOVQ R11, 0(R10)(SI*8)
+ MOVQ R12, 8(R10)(SI*8)
+ MOVQ R13, 16(R10)(SI*8)
+ MOVQ R14, 24(R10)(SI*8)
+
+ ADDQ $4, SI // i += 4
+ SUBQ $4, DI // n -= 4
+ CMPQ DI, $4
+ JGE U3 // if n >= 4 goto U3
+
+V3: CMPQ DI, $0
+ JLE E3 // if n <= 0 goto E3
+
+L3: // n > 0
+ ADDQ 0(R8)(SI*8), CX
+ MOVQ CX, 0(R10)(SI*8)
+ RCLQ $1, CX
+ ANDQ $1, CX
+
+ ADDQ $1, SI // i++
+ SUBQ $1, DI // n--
+ JG L3 // if n > 0 goto L3
+
+E3: MOVQ CX, c+40(FP) // return c
RET
// func subVW(z, x []Word, y Word) (c Word)
+// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
TEXT ·subVW(SB),7,$0
- MOVQ z+0(FP), R10
+ MOVL n+8(FP), DI
MOVQ x+16(FP), R8
- MOVQ y+32(FP), AX // c = y
- MOVL n+8(FP), R11
- MOVQ $0, BX // i = 0
- JMP E4
-
-L4: MOVQ (R8)(BX*8), DX // TODO(gri) is there a reverse SUBQ?
- SUBQ AX, DX
- MOVQ DX, (R10)(BX*8)
- RCLQ $1, AX
- ANDQ $1, AX
- ADDL $1, BX // i++
-
-E4: CMPQ BX, R11 // i < n
- JL L4
+ MOVQ y+32(FP), CX // c = y
+ MOVQ z+0(FP), R10
+
+ MOVQ $0, SI // i = 0
- MOVQ AX, c+40(FP)
+ // uncomment the next line to disable the unrolled loop
+ // JMP V4
+
+ CMPQ DI, $4
+ JL V4 // if n < 4 goto V4
+
+U4: // n >= 4
+ // regular loop body unrolled 4x
+ MOVQ 0(R8)(SI*8), R11
+ MOVQ 8(R8)(SI*8), R12
+ MOVQ 16(R8)(SI*8), R13
+ MOVQ 24(R8)(SI*8), R14
+ SUBQ CX, R11
+ SBBQ $0, R12
+ SBBQ $0, R13
+ SBBQ $0, R14
+ RCLQ $1, CX
+ ANDQ $1, CX
+ MOVQ R11, 0(R10)(SI*8)
+ MOVQ R12, 8(R10)(SI*8)
+ MOVQ R13, 16(R10)(SI*8)
+ MOVQ R14, 24(R10)(SI*8)
+
+ ADDQ $4, SI // i += 4
+ SUBQ $4, DI // n -= 4
+ CMPQ DI, $4
+ JGE U4 // if n >= 4 goto U4
+
+V4: CMPQ DI, $0
+ JLE E4 // if n <= 0 goto E4
+
+L4: // n > 0
+ MOVQ 0(R8)(SI*8), R11
+ SUBQ CX, R11
+ MOVQ R11, 0(R10)(SI*8)
+ RCLQ $1, CX
+ ANDQ $1, CX
+
+ ADDQ $1, SI // i++
+ SUBQ $1, DI // n--
+ JG L4 // if n > 0 goto L4
+
+E4: MOVQ CX, c+40(FP) // return c
RET