Cypherpunks repositories - gostls13.git/commitdiff
math: optimize ceil/floor functions on amd64
author: Ilya Tocar <ilya.tocar@intel.com>
Mon, 14 Sep 2015 15:42:39 +0000 (18:42 +0300)
committer: Brad Fitzpatrick <bradfitz@golang.org>
Sat, 3 Oct 2015 15:55:08 +0000 (15:55 +0000)
Use the SSE4.1 ROUNDSD instruction to perform rounding in Floor and Ceil.
Results (haswell):

name      old time/op  new time/op  delta
Floor-48  2.71ns ± 0%  1.87ns ± 1%  -31.17%  (p=0.000 n=16+19)
Ceil-48   3.09ns ± 3%  2.16ns ± 0%  -30.16%  (p=0.000 n=19+12)

Change-Id: If63715879eed6530b1eb4fc96132d827f8f43909
Reviewed-on: https://go-review.googlesource.com/14561
Reviewed-by: Klaus Post <klauspost@gmail.com>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/internal/obj/x86/a.out.go
src/cmd/internal/obj/x86/anames.go
src/cmd/internal/obj/x86/asm6.go
src/math/floor_amd64.s
src/math/floor_asm.go [new file with mode: 0644]

index c7f46e1801c7ffcb7d4d355eeb94e18cf89c836c..95868a8ba9b686aed05c0a662269242fe72771ca 100644 (file)
@@ -734,6 +734,11 @@ const (
        AAESIMC
        AAESKEYGENASSIST
 
+       AROUNDPS
+       AROUNDSS
+       AROUNDPD
+       AROUNDSD
+
        APSHUFD
        APCLMULQDQ
 
index 7f7708cdccc91995668da1427b5ce68d5e4f7fb1..330e816119e2c5b9f7f77646f72cb3886d59ad70 100644 (file)
@@ -677,6 +677,10 @@ var Anames = []string{
        "AESDECLAST",
        "AESIMC",
        "AESKEYGENASSIST",
+       "ROUNDPS",
+       "ROUNDSS",
+       "ROUNDPD",
+       "ROUNDSD",
        "PSHUFD",
        "PCLMULQDQ",
        "JCXZW",
index 0d2e869df3e79ca38c1b748e61c5bc6132b89704..495b35df187378a09e4f24a0a8da322d6a726cce 100644 (file)
@@ -1474,6 +1474,10 @@ var optab =
        {AAESDECLAST, yaes, Pq, [23]uint8{0x38, 0xdf, 0}},
        {AAESIMC, yaes, Pq, [23]uint8{0x38, 0xdb, 0}},
        {AAESKEYGENASSIST, yaes2, Pq, [23]uint8{0x3a, 0xdf, 0}},
+       {AROUNDPD, yaes2, Pq, [23]uint8{0x3a, 0x09, 0}},
+       {AROUNDPS, yaes2, Pq, [23]uint8{0x3a, 0x08, 0}},
+       {AROUNDSD, yaes2, Pq, [23]uint8{0x3a, 0x0b, 0}},
+       {AROUNDSS, yaes2, Pq, [23]uint8{0x3a, 0x0a, 0}},
        {APSHUFD, yxshuf, Pq, [23]uint8{0x70, 0}},
        {APCLMULQDQ, yxshuf, Pq, [23]uint8{0x3a, 0x44, 0}},
        {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
index 67b7cdec04e880c73a514e4384c3ed8a7b56c6b3..7f512e7c2250ae5685b3378486edac7ceddcb1b1 100644 (file)
@@ -6,8 +6,25 @@
 
 #define Big            0x4330000000000000 // 2**52
 
+// func hasSSE4() bool
+// hasSSE4 reports whether the CPU supports SSE4.1 (CPUID leaf 1, CX bit 19).
+TEXT ·hasSSE4(SB),NOSPLIT,$0
+       XORQ AX, AX // AX = 0
+       INCL AX // AX = 1, selecting CPUID leaf 1 (feature information)
+       CPUID // feature flags are returned in CX; AX, BX and DX are overwritten too
+       SHRQ $19, CX // move bit 19 (the SSE4.1 flag) down to bit 0
+       ANDQ $1, CX // isolate that bit, leaving CX = 0 or 1
+       MOVB CX, ret+0(FP) // store the low byte as the bool result
+       RET
+
 // func Floor(x float64) float64
 TEXT ·Floor(SB),NOSPLIT,$0
+       CMPB    math·useSSE4(SB), $1
+       JNE     nosse4
+       ROUNDSD $1, x+0(FP), X0
+       MOVQ X0, ret+8(FP)
+       RET
+nosse4:
        MOVQ    x+0(FP), AX
        MOVQ    $~(1<<63), DX // sign bit mask
        ANDQ    AX,DX // DX = |x|
@@ -30,6 +47,12 @@ isBig_floor:
 
 // func Ceil(x float64) float64
 TEXT ·Ceil(SB),NOSPLIT,$0
+       CMPB    math·useSSE4(SB), $1
+       JNE     nosse4
+       ROUNDSD $2, x+0(FP), X0
+       MOVQ X0, ret+8(FP)
+       RET
+nosse4:
        MOVQ    x+0(FP), AX
        MOVQ    $~(1<<63), DX // sign bit mask
        MOVQ    AX, BX // BX = copy of x
diff --git a/src/math/floor_asm.go b/src/math/floor_asm.go
new file mode 100644 (file)
index 0000000..28e56a5
--- /dev/null
@@ -0,0 +1,12 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 amd64p32
+
+package math
+
+// hasSSE4 reports whether SSE4.1 is supported. It is defined in floor_amd64.s.
+func hasSSE4() bool
+
+var useSSE4 = hasSSE4() // computed once at package init; checked by the assembly Floor and Ceil