]> Cypherpunks repositories - gostls13.git/commitdiff
reflect, runtime: optimize Value.Call on s390x and add benchmark
authorMichael Munday <munday@ca.ibm.com>
Tue, 30 Aug 2016 19:25:28 +0000 (15:25 -0400)
committerMichael Munday <munday@ca.ibm.com>
Tue, 27 Sep 2016 16:00:19 +0000 (16:00 +0000)
Use an MVC loop to copy arguments in runtime.call* rather than copying
bytes individually.

I've added the benchmark CallArgCopy to test the speed of Value.Call
for various argument sizes.

name                    old speed      new speed       delta
CallArgCopy/size=128     439MB/s ± 1%    582MB/s ± 1%   +32.41%  (p=0.000 n=10+10)
CallArgCopy/size=256     695MB/s ± 1%   1172MB/s ± 1%   +68.67%  (p=0.000 n=10+10)
CallArgCopy/size=1024    573MB/s ± 8%   4175MB/s ± 2%  +628.11%  (p=0.000 n=10+10)
CallArgCopy/size=4096   1.46GB/s ± 2%  10.19GB/s ± 1%  +600.52%  (p=0.000 n=10+10)
CallArgCopy/size=65536  1.51GB/s ± 0%  12.30GB/s ± 1%  +716.30%   (p=0.000 n=9+10)

Change-Id: I87dae4809330e7964f6cb4a9e40e5b3254dd519d
Reviewed-on: https://go-review.googlesource.com/28096
Run-TryBot: Michael Munday <munday@ca.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Bill O'Farrell <billotosyr@gmail.com>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
src/reflect/all_test.go
src/runtime/asm_s390x.s

index 780799cc6dc9d69477ad33ff629b2cce93733a23..3bf24d2250bc54d4df522dc0372204a3a3c33200 100644 (file)
@@ -1535,6 +1535,34 @@ func BenchmarkCall(b *testing.B) {
        })
 }
 
+func BenchmarkCallArgCopy(b *testing.B) {
+       byteArray := func(n int) Value {
+               return Zero(ArrayOf(n, TypeOf(byte(0))))
+       }
+       sizes := [...]struct {
+               fv  Value
+               arg Value
+       }{
+               {ValueOf(func(a [128]byte) {}), byteArray(128)},
+               {ValueOf(func(a [256]byte) {}), byteArray(256)},
+               {ValueOf(func(a [1024]byte) {}), byteArray(1024)},
+               {ValueOf(func(a [4096]byte) {}), byteArray(4096)},
+               {ValueOf(func(a [65536]byte) {}), byteArray(65536)},
+       }
+       for _, size := range sizes {
+               bench := func(b *testing.B) {
+                       args := []Value{size.arg}
+                       b.SetBytes(int64(size.arg.Len()))
+                       b.ResetTimer()
+                       for i := 0; i < b.N; i++ {
+                               size.fv.Call(args)
+                       }
+               }
+               name := fmt.Sprintf("size=%v", size.arg.Len())
+               b.Run(name, bench)
+       }
+}
+
 func TestMakeFunc(t *testing.T) {
        f := dummy
        fv := MakeFunc(TypeOf(f), func(in []Value) []Value { return in })
index 6d0533a3dc8f8609b554440ee2bd22c7ef1af90b..70e3b5e85925968841e2b114828a3fb28c7cb977 100644 (file)
@@ -387,53 +387,55 @@ TEXT ·reflectcall(SB), NOSPLIT, $-8-32
 TEXT NAME(SB), WRAPPER, $MAXSIZE-24;           \
        NO_LOCAL_POINTERS;                      \
        /* copy arguments to stack */           \
-       MOVD    arg+16(FP), R3;                 \
-       MOVWZ   argsize+24(FP), R4;                     \
-       MOVD    R15, R5;                                \
-       ADD     $(8-1), R5;                     \
-       SUB     $1, R3;                         \
-       ADD     R5, R4;                         \
-       CMP     R5, R4;                         \
-       BEQ     6(PC);                          \
-       ADD     $1, R3;                         \
-       ADD     $1, R5;                         \
-       MOVBZ   0(R3), R6;                      \
-       MOVBZ   R6, 0(R5);                      \
-       BR      -6(PC);                         \
-       /* call function */                     \
+       MOVD    arg+16(FP), R4;                 \
+       MOVWZ   argsize+24(FP), R5;             \
+       MOVD    $stack-MAXSIZE(SP), R6;         \
+loopArgs: /* copy 256 bytes at a time */       \
+       CMP     R5, $256;                       \
+       BLT     tailArgs;                       \
+       SUB     $256, R5;                       \
+       MVC     $256, 0(R4), 0(R6);             \
+       MOVD    $256(R4), R4;                   \
+       MOVD    $256(R6), R6;                   \
+       BR      loopArgs;                       \
+tailArgs: /* copy remaining bytes */           \
+       CMP     R5, $0;                         \
+       BEQ     callFunction;                   \
+       SUB     $1, R5;                         \
+       EXRL    $callfnMVC<>(SB), R5;           \
+callFunction:                                  \
        MOVD    f+8(FP), R12;                   \
        MOVD    (R12), R8;                      \
        PCDATA  $PCDATA_StackMapIndex, $0;      \
        BL      (R8);                           \
        /* copy return values back */           \
-       MOVD    arg+16(FP), R3;                 \
-       MOVWZ   n+24(FP), R4;                   \
-       MOVWZ   retoffset+28(FP), R6;           \
-       MOVD    R15, R5;                                \
-       ADD     R6, R5;                         \
-       ADD     R6, R3;                         \
-       SUB     R6, R4;                         \
-       ADD     $(8-1), R5;                     \
-       SUB     $1, R3;                         \
-       ADD     R5, R4;                         \
-loop:                                          \
-       CMP     R5, R4;                         \
-       BEQ     end;                            \
-       ADD     $1, R5;                         \
-       ADD     $1, R3;                         \
-       MOVBZ   0(R5), R6;                      \
-       MOVBZ   R6, 0(R3);                      \
-       BR      loop;                           \
-end:                                           \
+       MOVD    arg+16(FP), R6;                 \
+       MOVWZ   n+24(FP), R5;                   \
+       MOVD    $stack-MAXSIZE(SP), R4;         \
+       MOVWZ   retoffset+28(FP), R1;           \
+       ADD     R1, R4;                         \
+       ADD     R1, R6;                         \
+       SUB     R1, R5;                         \
+loopRets: /* copy 256 bytes at a time */       \
+       CMP     R5, $256;                       \
+       BLT     tailRets;                       \
+       SUB     $256, R5;                       \
+       MVC     $256, 0(R4), 0(R6);             \
+       MOVD    $256(R4), R4;                   \
+       MOVD    $256(R6), R6;                   \
+       BR      loopRets;                       \
+tailRets: /* copy remaining bytes */           \
+       CMP     R5, $0;                         \
+       BEQ     writeBarrierUpdates;            \
+       SUB     $1, R5;                         \
+       EXRL    $callfnMVC<>(SB), R5;           \
+writeBarrierUpdates:                           \
        /* execute write barrier updates */     \
-       MOVD    argtype+0(FP), R7;              \
-       MOVD    arg+16(FP), R3;                 \
-       MOVWZ   n+24(FP), R4;                   \
-       MOVWZ   retoffset+28(FP), R6;           \
-       MOVD    R7, 8(R15);                     \
-       MOVD    R3, 16(R15);                    \
-       MOVD    R4, 24(R15);                    \
-       MOVD    R6, 32(R15);                    \
+       MOVD    argtype+0(FP), R1;              \
+       MOVD    arg+16(FP), R2;                 \
+       MOVWZ   n+24(FP), R3;                   \
+       MOVWZ   retoffset+28(FP), R4;           \
+       STMG    R1, R4, stack-MAXSIZE(SP);      \
        BL      runtime·callwritebarrier(SB);  \
        RET
 
@@ -464,6 +466,10 @@ CALLFN(·call268435456, 268435456)
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
+// Not a function: target for EXRL (execute relative long) instruction.
+TEXT callfnMVC<>(SB),NOSPLIT|NOFRAME,$0-0
+       MVC     $1, 0(R4), 0(R6)
+
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
        RET