]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: optimize Equal on arm64
authorerifan01 <eric.fang@arm.com>
Mon, 7 May 2018 08:08:30 +0000 (08:08 +0000)
committerCherry Zhang <cherryyz@google.com>
Wed, 12 Sep 2018 19:56:55 +0000 (19:56 +0000)
Currently the 16-byte loop chunk16_loop is implemented with NEON instructions LD1, VMOV and VCMEQ.
Using scalar instructions LDP and CMP to achieve this loop can reduce the number of clock cycles.
For cases where the length of strings are between 4 to 15 bytes, loading the last 8 or 4 bytes at
a time to reduce the number of comparisons.

Benchmarks:
name                 old time/op    new time/op    delta
Equal/0-8              5.51ns ± 0%    5.84ns ±14%     ~     (p=0.246 n=7+8)
Equal/1-8              10.5ns ± 0%    10.5ns ± 0%     ~     (all equal)
Equal/6-8              14.0ns ± 0%    12.5ns ± 0%  -10.71%  (p=0.000 n=8+8)
Equal/9-8              13.5ns ± 0%    12.5ns ± 0%   -7.41%  (p=0.000 n=8+8)
Equal/15-8             15.5ns ± 0%    12.5ns ± 0%  -19.35%  (p=0.000 n=8+8)
Equal/16-8             14.0ns ± 0%    13.0ns ± 0%   -7.14%  (p=0.000 n=8+8)
Equal/20-8             16.5ns ± 0%    16.0ns ± 0%   -3.03%  (p=0.000 n=8+8)
Equal/32-8             16.5ns ± 0%    15.3ns ± 0%   -7.27%  (p=0.000 n=8+8)
Equal/4K-8              552ns ± 0%     553ns ± 0%     ~     (p=0.315 n=8+8)
Equal/4M-8             1.13ms ±23%    1.20ms ±27%     ~     (p=0.442 n=8+8)
Equal/64M-8            32.9ms ± 0%    32.6ms ± 0%   -1.15%  (p=0.000 n=8+8)
CompareBytesEqual-8    12.0ns ± 0%    12.0ns ± 0%     ~     (all equal)

Change-Id: If317ecdcc98e31883d37fd7d42b113b548c5bd2a
Reviewed-on: https://go-review.googlesource.com/112496
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>

src/internal/bytealg/equal_arm64.s

index 30abd980c5166c9523b2cab7eb92948b9507638d..dd4840dae1339a7d5acffff4750e612929ec8084 100644 (file)
@@ -67,6 +67,7 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
        CMP     R3, R4
        BEQ     eq
        MOVD    8(R26), R5    // compiler stores size at offset 8 in the closure
+       CBZ     R5, eq
        MOVD    R3, 8(RSP)
        MOVD    R4, 16(RSP)
        MOVD    R5, 24(RSP)
@@ -119,30 +120,41 @@ chunk16:
        CBZ     R3, tail
        ADD     R3, R0, R6      // end of chunks
 chunk16_loop:
-       VLD1.P  (R0), [V0.D2]
-       VLD1.P  (R2), [V1.D2]
-       VCMEQ   V0.D2, V1.D2, V2.D2
+       LDP.P   16(R0), (R4, R5)
+       LDP.P   16(R2), (R7, R9)
+       EOR     R4, R7
+       CBNZ    R7, not_equal
+       EOR     R5, R9
+       CBNZ    R9, not_equal
        CMP     R0, R6
-       VMOV    V2.D[0], R4
-       VMOV    V2.D[1], R5
-       CBZ     R4, not_equal
-       CBZ     R5, not_equal
        BNE     chunk16_loop
        AND     $0xf, R1, R1
        CBZ     R1, equal
 tail:
        // special compare of tail with length < 16
        TBZ     $3, R1, lt_8
-       MOVD.P  8(R0), R4
-       MOVD.P  8(R2), R5
-       CMP     R4, R5
-       BNE     not_equal
+       MOVD    (R0), R4
+       MOVD    (R2), R5
+       EOR     R4, R5
+       CBNZ    R5, not_equal
+       SUB     $8, R1, R6      // offset of the last 8 bytes
+       MOVD    (R0)(R6), R4
+       MOVD    (R2)(R6), R5
+       EOR     R4, R5
+       CBNZ    R5, not_equal
+       B       equal
 lt_8:
        TBZ     $2, R1, lt_4
-       MOVWU.P 4(R0), R4
-       MOVWU.P 4(R2), R5
-       CMP     R4, R5
-       BNE     not_equal
+       MOVWU   (R0), R4
+       MOVWU   (R2), R5
+       EOR     R4, R5
+       CBNZ    R5, not_equal
+       SUB     $4, R1, R6      // offset of the last 4 bytes
+       MOVWU   (R0)(R6), R4
+       MOVWU   (R2)(R6), R5
+       EOR     R4, R5
+       CBNZ    R5, not_equal
+       B       equal
 lt_4:
        TBZ     $1, R1, lt_2
        MOVHU.P 2(R0), R4
@@ -150,7 +162,7 @@ lt_4:
        CMP     R4, R5
        BNE     not_equal
 lt_2:
-       TBZ     $0, R1, equal
+       TBZ     $0, R1, equal
 one:
        MOVBU   (R0), R4
        MOVBU   (R2), R5