From 3436f0776f4f373b5ba1aacf9f66689c833168b0 Mon Sep 17 00:00:00 2001 From: Michael Munday Date: Tue, 30 Aug 2016 15:25:28 -0400 Subject: [PATCH] reflect, runtime: optimize Value.Call on s390x and add benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Use an MVC loop to copy arguments in runtime.call* rather than copying bytes individually. I've added the benchmark CallArgCopy to test the speed of Value.Call for various argument sizes. name old speed new speed delta CallArgCopy/size=128 439MB/s ± 1% 582MB/s ± 1% +32.41% (p=0.000 n=10+10) CallArgCopy/size=256 695MB/s ± 1% 1172MB/s ± 1% +68.67% (p=0.000 n=10+10) CallArgCopy/size=1024 573MB/s ± 8% 4175MB/s ± 2% +628.11% (p=0.000 n=10+10) CallArgCopy/size=4096 1.46GB/s ± 2% 10.19GB/s ± 1% +600.52% (p=0.000 n=10+10) CallArgCopy/size=65536 1.51GB/s ± 0% 12.30GB/s ± 1% +716.30% (p=0.000 n=9+10) Change-Id: I87dae4809330e7964f6cb4a9e40e5b3254dd519d Reviewed-on: https://go-review.googlesource.com/28096 Run-TryBot: Michael Munday TryBot-Result: Gobot Gobot Reviewed-by: Bill O'Farrell Reviewed-by: Brad Fitzpatrick --- src/reflect/all_test.go | 28 +++++++++++++ src/runtime/asm_s390x.s | 88 ++++++++++++++++++++++------------------- 2 files changed, 75 insertions(+), 41 deletions(-) diff --git a/src/reflect/all_test.go b/src/reflect/all_test.go index 780799cc6d..3bf24d2250 100644 --- a/src/reflect/all_test.go +++ b/src/reflect/all_test.go @@ -1535,6 +1535,34 @@ func BenchmarkCall(b *testing.B) { }) } +func BenchmarkCallArgCopy(b *testing.B) { + byteArray := func(n int) Value { + return Zero(ArrayOf(n, TypeOf(byte(0)))) + } + sizes := [...]struct { + fv Value + arg Value + }{ + {ValueOf(func(a [128]byte) {}), byteArray(128)}, + {ValueOf(func(a [256]byte) {}), byteArray(256)}, + {ValueOf(func(a [1024]byte) {}), byteArray(1024)}, + {ValueOf(func(a [4096]byte) {}), byteArray(4096)}, + {ValueOf(func(a [65536]byte) {}), byteArray(65536)}, + } + for _, size := range sizes { + bench := func(b *testing.B) { + args := []Value{size.arg} + b.SetBytes(int64(size.arg.Len())) + b.ResetTimer() + for i := 0; i < b.N; i++ { + size.fv.Call(args) + } + } + name := fmt.Sprintf("size=%v", size.arg.Len()) + b.Run(name, bench) + } +} + func TestMakeFunc(t *testing.T) { f := dummy fv := MakeFunc(TypeOf(f), func(in []Value) []Value { return in }) diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s index 6d0533a3dc..70e3b5e859 100644 --- a/src/runtime/asm_s390x.s +++ b/src/runtime/asm_s390x.s @@ -387,53 +387,55 @@ TEXT ·reflectcall(SB), NOSPLIT, $-8-32 TEXT NAME(SB), WRAPPER, $MAXSIZE-24; \ NO_LOCAL_POINTERS; \ /* copy arguments to stack */ \ - MOVD arg+16(FP), R3; \ - MOVWZ argsize+24(FP), R4; \ - MOVD R15, R5; \ - ADD $(8-1), R5; \ - SUB $1, R3; \ - ADD R5, R4; \ - CMP R5, R4; \ - BEQ 6(PC); \ - ADD $1, R3; \ - ADD $1, R5; \ - MOVBZ 0(R3), R6; \ - MOVBZ R6, 0(R5); \ - BR -6(PC); \ - /* call function */ \ + MOVD arg+16(FP), R4; \ + MOVWZ argsize+24(FP), R5; \ + MOVD $stack-MAXSIZE(SP), R6; \ +loopArgs: /* copy 256 bytes at a time */ \ + CMP R5, $256; \ + BLT tailArgs; \ + SUB $256, R5; \ + MVC $256, 0(R4), 0(R6); \ + MOVD $256(R4), R4; \ + MOVD $256(R6), R6; \ + BR loopArgs; \ +tailArgs: /* copy remaining bytes */ \ + CMP R5, $0; \ + BEQ callFunction; \ + SUB $1, R5; \ + EXRL $callfnMVC<>(SB), R5; \ +callFunction: \ MOVD f+8(FP), R12; \ MOVD (R12), R8; \ PCDATA $PCDATA_StackMapIndex, $0; \ BL (R8); \ /* copy return values back */ \ - MOVD arg+16(FP), R3; \ - MOVWZ n+24(FP), R4; \ - MOVWZ retoffset+28(FP), R6; \ - MOVD R15, R5; \ - ADD R6, R5; \ - ADD R6, R3; \ - SUB R6, R4; \ - ADD $(8-1), R5; \ - SUB $1, R3; \ - ADD R5, R4; \ -loop: \ - CMP R5, R4; \ - BEQ end; \ - ADD $1, R5; \ - ADD $1, R3; \ - MOVBZ 0(R5), R6; \ - MOVBZ R6, 0(R3); \ - BR loop; \ -end: \ + MOVD arg+16(FP), R6; \ + MOVWZ n+24(FP), R5; \ + MOVD $stack-MAXSIZE(SP), R4; \ + MOVWZ retoffset+28(FP), R1; \ + ADD R1, R4; \ + ADD R1, R6; \ + SUB R1, R5; \ +loopRets: /* copy 256 bytes at a time */ \ + CMP R5, $256; \ + BLT tailRets; \ + SUB $256, R5; \ + MVC $256, 0(R4), 0(R6); \ + MOVD $256(R4), R4; \ + MOVD $256(R6), R6; \ + BR loopRets; \ +tailRets: /* copy remaining bytes */ \ + CMP R5, $0; \ + BEQ writeBarrierUpdates; \ + SUB $1, R5; \ + EXRL $callfnMVC<>(SB), R5; \ +writeBarrierUpdates: \ /* execute write barrier updates */ \ - MOVD argtype+0(FP), R7; \ - MOVD arg+16(FP), R3; \ - MOVWZ n+24(FP), R4; \ - MOVWZ retoffset+28(FP), R6; \ - MOVD R7, 8(R15); \ - MOVD R3, 16(R15); \ - MOVD R4, 24(R15); \ - MOVD R6, 32(R15); \ + MOVD argtype+0(FP), R1; \ + MOVD arg+16(FP), R2; \ + MOVWZ n+24(FP), R3; \ + MOVWZ retoffset+28(FP), R4; \ + STMG R1, R4, stack-MAXSIZE(SP); \ BL runtime·callwritebarrier(SB); \ RET @@ -464,6 +466,10 @@ CALLFN(·call268435456, 268435456) CALLFN(·call536870912, 536870912) CALLFN(·call1073741824, 1073741824) +// Not a function: target for EXRL (execute relative long) instruction. +TEXT callfnMVC<>(SB),NOSPLIT|NOFRAME,$0-0 + MVC $1, 0(R4), 0(R6) + TEXT runtime·procyield(SB),NOSPLIT,$0-0 RET -- 2.48.1