From: Ilya Tocar Date: Mon, 14 Sep 2015 15:42:39 +0000 (+0300) Subject: math: optimize ceil/floor functions on amd64 X-Git-Tag: go1.6beta1~920 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=37cfb2e07e9e9e8b11f21ea462856aeb1f6ec0c0;p=gostls13.git math: optimize ceil/floor functions on amd64 Use SSE 4.1 rounding instruction to perform rounding Results (haswell): name old time/op new time/op delta Floor-48 2.71ns ± 0% 1.87ns ± 1% -31.17% (p=0.000 n=16+19) Ceil-48 3.09ns ± 3% 2.16ns ± 0% -30.16% (p=0.000 n=19+12) Change-Id: If63715879eed6530b1eb4fc96132d827f8f43909 Reviewed-on: https://go-review.googlesource.com/14561 Reviewed-by: Klaus Post Run-TryBot: Brad Fitzpatrick TryBot-Result: Gobot Gobot Reviewed-by: Keith Randall --- diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go index c7f46e1801..95868a8ba9 100644 --- a/src/cmd/internal/obj/x86/a.out.go +++ b/src/cmd/internal/obj/x86/a.out.go @@ -734,6 +734,11 @@ const ( AAESIMC AAESKEYGENASSIST + AROUNDPS + AROUNDSS + AROUNDPD + AROUNDSD + APSHUFD APCLMULQDQ diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go index 7f7708cdcc..330e816119 100644 --- a/src/cmd/internal/obj/x86/anames.go +++ b/src/cmd/internal/obj/x86/anames.go @@ -677,6 +677,10 @@ var Anames = []string{ "AESDECLAST", "AESIMC", "AESKEYGENASSIST", + "ROUNDPS", + "ROUNDSS", + "ROUNDPD", + "ROUNDSD", "PSHUFD", "PCLMULQDQ", "JCXZW", diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index 0d2e869df3..495b35df18 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -1474,6 +1474,10 @@ var optab = {AAESDECLAST, yaes, Pq, [23]uint8{0x38, 0xdf, 0}}, {AAESIMC, yaes, Pq, [23]uint8{0x38, 0xdb, 0}}, {AAESKEYGENASSIST, yaes2, Pq, [23]uint8{0x3a, 0xdf, 0}}, + {AROUNDPD, yaes2, Pq, [23]uint8{0x3a, 0x09, 0}}, + {AROUNDPS, yaes2, Pq, [23]uint8{0x3a, 0x08, 0}}, + {AROUNDSD, yaes2, Pq, [23]uint8{0x3a, 0x0b, 0}}, + {AROUNDSS, yaes2, Pq, [23]uint8{0x3a, 0x0a, 0}}, {APSHUFD, yxshuf, Pq, [23]uint8{0x70, 0}}, {APCLMULQDQ, yxshuf, Pq, [23]uint8{0x3a, 0x44, 0}}, {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}}, diff --git a/src/math/floor_amd64.s b/src/math/floor_amd64.s index 67b7cdec04..7f512e7c22 100644 --- a/src/math/floor_amd64.s +++ b/src/math/floor_amd64.s @@ -6,8 +6,25 @@ #define Big 0x4330000000000000 // 2**52 +// func hasSSE4() bool +// returns whether SSE4.1 is supported +TEXT ·hasSSE4(SB),NOSPLIT,$0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $19, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + // func Floor(x float64) float64 TEXT ·Floor(SB),NOSPLIT,$0 + CMPB math·useSSE4(SB), $1 + JNE nosse4 + ROUNDSD $1, x+0(FP), X0 + MOVQ X0, ret+8(FP) + RET +nosse4: MOVQ x+0(FP), AX MOVQ $~(1<<63), DX // sign bit mask ANDQ AX,DX // DX = |x| @@ -30,6 +47,12 @@ isBig_floor: // func Ceil(x float64) float64 TEXT ·Ceil(SB),NOSPLIT,$0 + CMPB math·useSSE4(SB), $1 + JNE nosse4 + ROUNDSD $2, x+0(FP), X0 + MOVQ X0, ret+8(FP) + RET +nosse4: MOVQ x+0(FP), AX MOVQ $~(1<<63), DX // sign bit mask MOVQ AX, BX // BX = copy of x diff --git a/src/math/floor_asm.go b/src/math/floor_asm.go new file mode 100644 index 0000000000..28e56a5d51 --- /dev/null +++ b/src/math/floor_asm.go @@ -0,0 +1,12 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 amd64p32 + +package math + +//defined in floor_amd64.s +func hasSSE4() bool + +var useSSE4 = hasSSE4()