(analog to Change-Id: Ia473e9ab9c63a955c252426684176bca566645ae)
Fixes #9243.
benchmark old ns/op new ns/op delta
BenchmarkAddVV_1 5.76 5.60 -2.78%
BenchmarkAddVV_2 7.17 6.98 -2.65%
BenchmarkAddVV_3 8.69 8.57 -1.38%
BenchmarkAddVV_4 10.5 10.5 +0.00%
BenchmarkAddVV_5 13.3 11.6 -12.78%
BenchmarkAddVV_1e1 20.4 19.3 -5.39%
BenchmarkAddVV_1e2 166 140 -15.66%
BenchmarkAddVV_1e3 1588 1278 -19.52%
BenchmarkAddVV_1e4 16138 12657 -21.57%
BenchmarkAddVV_1e5 167608 127836 -23.73%
BenchmarkAddVW_1 4.87 4.76 -2.26%
BenchmarkAddVW_2 6.10 6.07 -0.49%
BenchmarkAddVW_3 7.75 7.65 -1.29%
BenchmarkAddVW_4 9.30 9.39 +0.97%
BenchmarkAddVW_5 10.8 10.9 +0.93%
BenchmarkAddVW_1e1 18.8 18.8 +0.00%
BenchmarkAddVW_1e2 143 134 -6.29%
BenchmarkAddVW_1e3 1390 1266 -8.92%
BenchmarkAddVW_1e4 13877 12545 -9.60%
BenchmarkAddVW_1e5 155330 125432 -19.25%
benchmark old MB/s new MB/s speedup
BenchmarkAddVV_1 5556.09 5715.12 1.03x
BenchmarkAddVV_2 8926.55 9170.64 1.03x
BenchmarkAddVV_3 11042.15 11201.77 1.01x
BenchmarkAddVV_4 12168.21 12245.50 1.01x
BenchmarkAddVV_5 12041.39 13805.73 1.15x
BenchmarkAddVV_1e1 15659.65 16548.18 1.06x
BenchmarkAddVV_1e2 19268.57 22728.64 1.18x
BenchmarkAddVV_1e3 20141.45 25033.36 1.24x
BenchmarkAddVV_1e4 19827.86 25281.92 1.28x
BenchmarkAddVV_1e5 19092.06 25031.92 1.31x
BenchmarkAddVW_1 822.12 840.92 1.02x
BenchmarkAddVW_2 1310.89 1317.89 1.01x
BenchmarkAddVW_3 1549.31 1568.26 1.01x
BenchmarkAddVW_4 1720.45 1703.77 0.99x
BenchmarkAddVW_5 1857.12 1828.66 0.98x
BenchmarkAddVW_1e1 2126.39 2132.38 1.00x
BenchmarkAddVW_1e2 2784.49 2969.21 1.07x
BenchmarkAddVW_1e3 2876.89 3157.35 1.10x
BenchmarkAddVW_1e4 2882.32 3188.51 1.11x
BenchmarkAddVW_1e5 2575.16 3188.96 1.24x
(measured on OS X 10.9.5, 2.3 GHz Intel Core i7, 8GB 1333 MHz DDR3)
Change-Id: I46698729d5e0bc3e277aa0146a9d7a086c0c26f1
Reviewed-on: https://go-review.googlesource.com/2560
Reviewed-by: Keith Randall <khr@golang.org>
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
-// TODO(gri) Replace uses of RCRL/RCLL with ADDL/SBBL respectively.
-
// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),NOSPLIT,$0
MOVL x+0(FP), AX
JMP E1
L1: MOVL (SI)(BX*4), AX
- RCRL $1, DX
+ ADDL DX, DX // restore CF
ADCL (CX)(BX*4), AX
- RCLL $1, DX
+ SBBL DX, DX // save CF
MOVL AX, (DI)(BX*4)
ADDL $1, BX // i++
E1: CMPL BX, BP // i < n
JL L1
+ NEGL DX
MOVL DX, c+36(FP)
RET
JMP E2
L2: MOVL (SI)(BX*4), AX
- RCRL $1, DX
+ ADDL DX, DX // restore CF
SBBL (CX)(BX*4), AX
- RCLL $1, DX
+ SBBL DX, DX // save CF
MOVL AX, (DI)(BX*4)
ADDL $1, BX // i++
E2: CMPL BX, BP // i < n
JL L2
+ NEGL DX
MOVL DX, c+36(FP)
RET
L3: ADDL (SI)(BX*4), AX
MOVL AX, (DI)(BX*4)
- RCLL $1, AX
- ANDL $1, AX
+ SBBL AX, AX // save CF
+ NEGL AX
ADDL $1, BX // i++
E3: CMPL BX, BP // i < n
MOVL $0, BX // i = 0
JMP E4
-L4: MOVL (SI)(BX*4), DX // TODO(gri) is there a reverse SUBL?
+L4: MOVL (SI)(BX*4), DX
SUBL AX, DX
MOVL DX, (DI)(BX*4)
- RCLL $1, AX
- ANDL $1, AX
+ SBBL AX, AX // save CF
+ NEGL AX
ADDL $1, BX // i++
E4: CMPL BX, BP // i < n