Benchmarks run on 2.8GHz Quad-Code Intel Xeon,
4GB 800MHz DDR2 FB-DIMM ("PowerMac").
benchmark old ns/op new ns/op delta
BenchmarkAddVV_1 7 7 -0.82%
BenchmarkAddVV_2 8 8 -3.46%
BenchmarkAddVV_3 10 9 -4.81%
BenchmarkAddVV_4 9 9 -1.89%
BenchmarkAddVV_5 11 10 -5.22%
BenchmarkAddVV_1e1 17 18 +4.05%
BenchmarkAddVV_1e2 117 115 -1.71%
BenchmarkAddVV_1e3 1095 1090 -0.46%
BenchmarkAddVV_1e4 13149 12679 -3.57%
BenchmarkAddVV_1e5 135133 129482 -4.18%
BenchmarkAddVW_1 6 6 -1.14%
BenchmarkAddVW_2 7 7 +3.78%
BenchmarkAddVW_3 8 8 +0.12%
BenchmarkAddVW_4 8 8 -6.52%
BenchmarkAddVW_5 9 8 -3.70%
BenchmarkAddVW_1e1 14 13 -4.29%
BenchmarkAddVW_1e2 97 96 -1.33%
BenchmarkAddVW_1e3 953 940 -1.36%
BenchmarkAddVW_1e4 9776 9527 -2.55%
BenchmarkAddVW_1e5 102396 97738 -4.55%
benchmark old MB/s new MB/s speedup
BenchmarkAddVV_1 8702.84 8774.56 1.01x
BenchmarkAddVV_2 14739.60 15277.82 1.04x
BenchmarkAddVV_3 18375.37 19398.16 1.06x
BenchmarkAddVV_4 26935.44 27464.68 1.02x
BenchmarkAddVV_5 27754.04 29423.30 1.06x
BenchmarkAddVV_1e1 37050.89 35629.72 0.96x
BenchmarkAddVV_1e2 54289.15 55533.24 1.02x
BenchmarkAddVV_1e3 58428.83 58682.53 1.00x
BenchmarkAddVV_1e4 48670.55 50475.99 1.04x
BenchmarkAddVV_1e5 47360.54 49427.66 1.04x
BenchmarkAddVW_1 10397.27 10502.23 1.01x
BenchmarkAddVW_2 17279.03 16654.13 0.96x
BenchmarkAddVW_3 23858.39 23825.89 1.00x
BenchmarkAddVW_4 29799.42 31895.06 1.07x
BenchmarkAddVW_5 34781.83 36105.11 1.04x
BenchmarkAddVW_1e1 45629.88 47597.42 1.04x
BenchmarkAddVW_1e2 65341.93 66240.04 1.01x
BenchmarkAddVW_1e3 67153.67 68069.83 1.01x
BenchmarkAddVW_1e4 65464.60 67173.83 1.03x
BenchmarkAddVW_1e5 62501.88 65480.66 1.05x
R=iant
CC=golang-dev
https://golang.org/cl/
6484056
MOVQ z+0(FP), R10
MOVQ $0, CX // c = 0
- MOVQ $0, SI // i = 0
+ MOVQ $0, SI // i = 0
// uncomment the next line to disable the unrolled loop
// JMP V1
- CMPQ DI, $4
- JL V1 // if n < 4 goto V1
+ SUBQ $4, DI // n -= 4
+ JL V1 // if n < 0 goto V1
-U1: // n >= 4
+U1: // n >= 0
// regular loop body unrolled 4x
+ RCRQ $1, CX // CF = c
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
- RCRQ $1, CX // restore CF
ADCQ 0(R9)(SI*8), R11
ADCQ 8(R9)(SI*8), R12
ADCQ 16(R9)(SI*8), R13
ADCQ 24(R9)(SI*8), R14
- RCLQ $1, CX // save CF
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
+ RCLQ $1, CX // c = CF
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
- CMPQ DI, $4
- JGE U1 // if n >= 4 goto U1
+ JGE U1 // if n >= 0 goto U1
-V1: CMPQ DI, $0
+V1: ADDQ $4, DI // n += 4
JLE E1 // if n <= 0 goto E1
L1: // n > 0
+ RCRQ $1, CX // CF = c
MOVQ 0(R8)(SI*8), R11
- RCRQ $1, CX // restore CF
ADCQ 0(R9)(SI*8), R11
- RCLQ $1, CX // save CF
MOVQ R11, 0(R10)(SI*8)
+ RCLQ $1, CX // c = CF
ADDQ $1, SI // i++
SUBQ $1, DI // n--
// uncomment the next line to disable the unrolled loop
// JMP V2
- CMPQ DI, $4
- JL V2 // if n < 4 goto V2
+ SUBQ $4, DI // n -= 4
+ JL V2 // if n < 0 goto V2
-U2: // n >= 4
+U2: // n >= 0
// regular loop body unrolled 4x
+ RCRQ $1, CX // CF = c
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
- RCRQ $1, CX // restore CF
SBBQ 0(R9)(SI*8), R11
SBBQ 8(R9)(SI*8), R12
SBBQ 16(R9)(SI*8), R13
SBBQ 24(R9)(SI*8), R14
- RCLQ $1, CX // save CF
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
+ RCLQ $1, CX // c = CF
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
- CMPQ DI, $4
- JGE U2 // if n >= 4 goto U2
+ JGE U2 // if n >= 0 goto U2
-V2: CMPQ DI, $0
+V2: ADDQ $4, DI // n += 4
JLE E2 // if n <= 0 goto E2
L2: // n > 0
+ RCRQ $1, CX // CF = c
MOVQ 0(R8)(SI*8), R11
- RCRQ $1, CX // restore CF
SBBQ 0(R9)(SI*8), R11
- RCLQ $1, CX // save CF
MOVQ R11, 0(R10)(SI*8)
+ RCLQ $1, CX // c = CF
ADDQ $1, SI // i++
SUBQ $1, DI // n--
// uncomment the next line to disable the unrolled loop
// JMP V3
- CMPQ DI, $4
+ SUBQ $4, DI // n -= 4
JL V3 // if n < 4 goto V3
-U3: // n >= 4
+U3: // n >= 0
// regular loop body unrolled 4x
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
ADCQ $0, R12
ADCQ $0, R13
ADCQ $0, R14
- RCLQ $1, CX
- ANDQ $1, CX
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
+ RCLQ $1, CX // c = CF
+ ANDQ $1, CX
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
- CMPQ DI, $4
- JGE U3 // if n >= 4 goto U3
+ JGE U3 // if n >= 0 goto U3
-V3: CMPQ DI, $0
+V3: ADDQ $4, DI // n += 4
JLE E3 // if n <= 0 goto E3
L3: // n > 0
ADDQ 0(R8)(SI*8), CX
MOVQ CX, 0(R10)(SI*8)
- RCLQ $1, CX
+ RCLQ $1, CX // c = CF
ANDQ $1, CX
ADDQ $1, SI // i++
// uncomment the next line to disable the unrolled loop
// JMP V4
- CMPQ DI, $4
+ SUBQ $4, DI // n -= 4
JL V4 // if n < 4 goto V4
-U4: // n >= 4
+U4: // n >= 0
// regular loop body unrolled 4x
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
SBBQ $0, R12
SBBQ $0, R13
SBBQ $0, R14
- RCLQ $1, CX
- ANDQ $1, CX
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
+ RCLQ $1, CX // c = CF
+ ANDQ $1, CX
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
- CMPQ DI, $4
- JGE U4 // if n >= 4 goto U4
+ JGE U4 // if n >= 0 goto U4
-V4: CMPQ DI, $0
+V4: ADDQ $4, DI // n += 4
JLE E4 // if n <= 0 goto E4
L4: // n > 0
MOVQ 0(R8)(SI*8), R11
SUBQ CX, R11
MOVQ R11, 0(R10)(SI*8)
- RCLQ $1, CX
+ RCLQ $1, CX // c = CF
ANDQ $1, CX
ADDQ $1, SI // i++