benchmark old ns/op new ns/op delta
BenchmarkAddVW_1 8 8 +0.60%
BenchmarkAddVW_2 10 9 -8.64%
BenchmarkAddVW_3 10 10 -4.63%
BenchmarkAddVW_4 10 11 +3.67%
BenchmarkAddVW_5 11 12 +5.98%
BenchmarkAddVW_1e1 18 20 +6.38%
BenchmarkAddVW_1e2 129 115 -10.85%
BenchmarkAddVW_1e3 1270 1089 -14.25%
BenchmarkAddVW_1e4 13376 12145 -9.20%
BenchmarkAddVW_1e5 130392 125260 -3.94%
benchmark old MB/s new MB/s speedup
BenchmarkAddVW_1 7709.10 7661.92 0.99x
BenchmarkAddVW_2 12451.10 13604.00 1.09x
BenchmarkAddVW_3 17727.81 18721.54 1.06x
BenchmarkAddVW_4 23552.64 22708.81 0.96x
BenchmarkAddVW_5 27411.40 25816.22 0.94x
BenchmarkAddVW_1e1 34063.19 32023.06 0.94x
BenchmarkAddVW_1e2 49529.97 55360.55 1.12x
BenchmarkAddVW_1e3 50380.44 58764.18 1.17x
BenchmarkAddVW_1e4 47843.59 52696.10 1.10x
BenchmarkAddVW_1e5 49082.60 51093.66 1.04x
R=gri, rsc, r
CC=golang-dev
https://golang.org/cl/
6480063
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
+// Literal instruction for MOVQ $0, CX.
+// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
+#define ZERO_CX BYTE $0x48; \
+ BYTE $0xc7; \
+ BYTE $0xc1; \
+ BYTE $0x00; \
+ BYTE $0x00; \
+ BYTE $0x00; \
+ BYTE $0x00
+
// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),7,$0
MOVQ x+0(FP), AX
MOVQ x+16(FP), R8
MOVQ y+32(FP), CX // c = y
MOVQ z+0(FP), R10
-
+
MOVQ $0, SI // i = 0
// s/JL/JMP/ below to disable the unrolled loop
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
ADDQ CX, R11
+ ZERO_CX
ADCQ $0, R12
ADCQ $0, R13
ADCQ $0, R14
+ SETCS CX // c = CF
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
- RCLQ $1, CX // c = CF
- ANDQ $1, CX
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
L3: // n > 0
ADDQ 0(R8)(SI*8), CX
MOVQ CX, 0(R10)(SI*8)
+ ZERO_CX
RCLQ $1, CX // c = CF
- ANDQ $1, CX
ADDQ $1, SI // i++
SUBQ $1, DI // n--
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
SUBQ CX, R11
+ ZERO_CX
SBBQ $0, R12
SBBQ $0, R13
SBBQ $0, R14
+ SETCS CX // c = CF
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
- RCLQ $1, CX // c = CF
- ANDQ $1, CX
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
MOVQ 0(R8)(SI*8), R11
SUBQ CX, R11
MOVQ R11, 0(R10)(SI*8)
+ ZERO_CX
RCLQ $1, CX // c = CF
- ANDQ $1, CX
ADDQ $1, SI // i++
SUBQ $1, DI // n--