]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: optimize the function memmove on loong64
authorXiaolin Zhao <zhaoxiaolin@loongson.cn>
Tue, 14 May 2024 02:59:26 +0000 (10:59 +0800)
committerGopher Robot <gobot@golang.org>
Wed, 11 Sep 2024 21:43:27 +0000 (21:43 +0000)
benchmarck on 3A6000:
goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                                 │      old      │                 new                  │
                                 │    sec/op     │    sec/op     vs base                │
Memmove/0                           0.6003n ± 0%   0.6003n ± 0%        ~ (p=0.487 n=20)
Memmove/1                            4.402n ± 0%    2.815n ± 0%  -36.05% (p=0.000 n=20)
Memmove/2                            5.202n ± 0%    3.202n ± 0%  -38.45% (p=0.000 n=20)
Memmove/3                            6.003n ± 0%    2.820n ± 0%  -53.02% (p=0.000 n=20)
Memmove/4                            6.803n ± 0%    3.202n ± 0%  -52.93% (p=0.000 n=20)
Memmove/5                            7.604n ± 0%    3.202n ± 0%  -57.89% (p=0.000 n=20)
Memmove/6                            8.404n ± 0%    3.202n ± 0%  -61.90% (p=0.000 n=20)
Memmove/7                            9.204n ± 0%    3.202n ± 0%  -65.21% (p=0.000 n=20)
Memmove/8                            4.802n ± 0%    3.602n ± 0%  -24.99% (p=0.000 n=20)
Memmove/9                            6.003n ± 0%    3.202n ± 0%  -46.66% (p=0.000 n=20)
Memmove/10                           6.803n ± 0%    3.202n ± 0%  -52.93% (p=0.000 n=20)
Memmove/11                           7.604n ± 0%    3.202n ± 0%  -57.89% (p=0.000 n=20)
Memmove/12                           8.404n ± 0%    3.202n ± 0%  -61.90% (p=0.000 n=20)
Memmove/13                           9.204n ± 0%    3.202n ± 0%  -65.21% (p=0.000 n=20)
Memmove/14                          10.000n ± 0%    3.202n ± 0%  -67.98% (p=0.000 n=20)
Memmove/15                          10.810n ± 0%    3.202n ± 0%  -70.38% (p=0.000 n=20)
Memmove/16                           6.003n ± 0%    3.202n ± 0%  -46.66% (p=0.000 n=20)
Memmove/32                           7.604n ± 0%    3.602n ± 0%  -52.63% (p=0.000 n=20)
Memmove/64                          10.810n ± 0%    4.402n ± 0%  -59.28% (p=0.000 n=20)
Memmove/128                         17.210n ± 0%    8.004n ± 0%  -53.49% (p=0.000 n=20)
Memmove/256                          30.41n ± 0%    10.81n ± 0%  -64.45% (p=0.000 n=20)
Memmove/512                          56.03n ± 0%    17.81n ± 0%  -68.21% (p=0.000 n=20)
Memmove/1024                        107.30n ± 0%    30.62n ± 0%  -71.46% (p=0.000 n=20)
Memmove/2048                        209.70n ± 0%    56.23n ± 0%  -73.19% (p=0.000 n=20)
Memmove/4096                         414.6n ± 0%    107.5n ± 0%  -74.07% (p=0.000 n=20)
MemmoveOverlap/32                    8.404n ± 0%    4.402n ± 0%  -47.62% (p=0.000 n=20)
MemmoveOverlap/64                   11.610n ± 0%    5.003n ± 0%  -56.91% (p=0.000 n=20)
MemmoveOverlap/128                  18.010n ± 0%    9.005n ± 0%  -50.00% (p=0.000 n=20)
MemmoveOverlap/256                   31.22n ± 0%    12.41n ± 0%  -60.25% (p=0.000 n=20)
MemmoveOverlap/512                   56.83n ± 0%    19.08n ± 0%  -66.43% (p=0.000 n=20)
MemmoveOverlap/1024                 108.10n ± 0%    32.00n ± 0%  -70.40% (p=0.000 n=20)
MemmoveOverlap/2048                 210.50n ± 0%    57.94n ± 0%  -72.48% (p=0.000 n=20)
MemmoveOverlap/4096                  415.4n ± 0%    108.9n ± 0%  -73.78% (p=0.000 n=20)
MemmoveUnalignedDst/0                2.448n ± 0%    2.942n ± 0%  +20.16% (p=0.000 n=20)
MemmoveUnalignedDst/1                4.802n ± 0%    3.202n ± 0%  -33.32% (p=0.000 n=20)
MemmoveUnalignedDst/2                5.603n ± 0%    3.602n ± 0%  -35.71% (p=0.000 n=20)
MemmoveUnalignedDst/3                6.403n ± 0%    3.202n ± 0%  -49.99% (p=0.000 n=20)
MemmoveUnalignedDst/4                7.203n ± 0%    3.602n ± 0%  -49.99% (p=0.000 n=20)
MemmoveUnalignedDst/5                8.004n ± 0%    3.602n ± 0%  -55.00% (p=0.000 n=20)
MemmoveUnalignedDst/6                8.804n ± 0%    3.602n ± 0%  -59.09% (p=0.000 n=20)
MemmoveUnalignedDst/7                9.605n ± 0%    3.602n ± 0%  -62.50% (p=0.000 n=20)
MemmoveUnalignedDst/8               10.400n ± 0%    4.002n ± 0%  -61.52% (p=0.000 n=20)
MemmoveUnalignedDst/9               11.210n ± 0%    3.802n ± 0%  -66.08% (p=0.000 n=20)
MemmoveUnalignedDst/10              12.010n ± 0%    3.802n ± 0%  -68.34% (p=0.000 n=20)
MemmoveUnalignedDst/11              12.810n ± 0%    3.802n ± 0%  -70.32% (p=0.000 n=20)
MemmoveUnalignedDst/12              13.610n ± 0%    3.802n ± 0%  -72.06% (p=0.000 n=20)
MemmoveUnalignedDst/13              14.410n ± 0%    3.802n ± 0%  -73.62% (p=0.000 n=20)
MemmoveUnalignedDst/14              15.210n ± 0%    3.802n ± 0%  -75.00% (p=0.000 n=20)
MemmoveUnalignedDst/15              16.010n ± 0%    3.802n ± 0%  -76.25% (p=0.000 n=20)
MemmoveUnalignedDst/16              17.210n ± 0%    3.802n ± 0%  -77.91% (p=0.000 n=20)
MemmoveUnalignedDst/32              30.020n ± 0%    4.202n ± 0%  -86.00% (p=0.000 n=20)
MemmoveUnalignedDst/64              56.030n ± 0%    6.804n ± 0%  -87.86% (p=0.000 n=20)
MemmoveUnalignedDst/128             106.90n ± 0%    13.61n ± 0%  -87.27% (p=0.000 n=20)
MemmoveUnalignedDst/256             209.50n ± 0%    17.07n ± 1%  -91.85% (p=0.000 n=20)
MemmoveUnalignedDst/512             414.60n ± 0%    24.95n ± 0%  -93.98% (p=0.000 n=20)
MemmoveUnalignedDst/1024            828.40n ± 0%    42.82n ± 0%  -94.83% (p=0.000 n=20)
MemmoveUnalignedDst/2048           1648.00n ± 0%    78.04n ± 0%  -95.26% (p=0.000 n=20)
MemmoveUnalignedDst/4096            3287.0n ± 0%    148.4n ± 0%  -95.49% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/32       30.810n ± 0%    5.603n ± 0%  -81.81% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/64       56.430n ± 0%    7.604n ± 0%  -86.52% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/128     107.700n ± 0%    9.812n ± 0%  -90.89% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/256      210.10n ± 0%    13.50n ± 0%  -93.57% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/512      415.00n ± 0%    21.21n ± 0%  -94.89% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/1024     828.80n ± 0%    41.02n ± 0%  -95.05% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/2048    1648.00n ± 0%    80.23n ± 0%  -95.13% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/4096     3288.0n ± 0%    162.4n ± 0%  -95.06% (p=0.000 n=20)
MemmoveUnalignedSrc/0                2.468n ± 1%    2.913n ± 0%  +18.01% (p=0.000 n=20)
MemmoveUnalignedSrc/1                4.802n ± 0%    3.202n ± 0%  -33.32% (p=0.000 n=20)
MemmoveUnalignedSrc/2                5.603n ± 0%    3.603n ± 0%  -35.70% (p=0.000 n=20)
MemmoveUnalignedSrc/3                6.403n ± 0%    3.207n ± 0%  -49.91% (p=0.000 n=20)
MemmoveUnalignedSrc/4                7.203n ± 0%    3.603n ± 0%  -49.98% (p=0.000 n=20)
MemmoveUnalignedSrc/5                8.004n ± 0%    3.602n ± 0%  -55.00% (p=0.000 n=20)
MemmoveUnalignedSrc/6                8.804n ± 0%    3.602n ± 0%  -59.09% (p=0.000 n=20)
MemmoveUnalignedSrc/7                9.605n ± 0%    3.602n ± 0%  -62.50% (p=0.000 n=20)
MemmoveUnalignedSrc/8               10.410n ± 0%    4.002n ± 0%  -61.56% (p=0.000 n=20)
MemmoveUnalignedSrc/9               11.210n ± 0%    3.802n ± 0%  -66.08% (p=0.000 n=20)
MemmoveUnalignedSrc/10              12.010n ± 0%    3.802n ± 0%  -68.34% (p=0.000 n=20)
MemmoveUnalignedSrc/11              12.810n ± 0%    3.802n ± 0%  -70.32% (p=0.000 n=20)
MemmoveUnalignedSrc/12              13.610n ± 0%    3.802n ± 0%  -72.06% (p=0.000 n=20)
MemmoveUnalignedSrc/13              14.410n ± 0%    3.802n ± 0%  -73.62% (p=0.000 n=20)
MemmoveUnalignedSrc/14              15.210n ± 0%    3.802n ± 0%  -75.00% (p=0.000 n=20)
MemmoveUnalignedSrc/15              16.010n ± 0%    3.802n ± 0%  -76.25% (p=0.000 n=20)
MemmoveUnalignedSrc/16              16.810n ± 0%    3.802n ± 0%  -77.38% (p=0.000 n=20)
MemmoveUnalignedSrc/32              30.410n ± 0%    4.301n ± 0%  -85.86% (p=0.000 n=20)
MemmoveUnalignedSrc/64              55.630n ± 0%    5.203n ± 0%  -90.65% (p=0.000 n=20)
MemmoveUnalignedSrc/128            107.300n ± 0%    8.805n ± 0%  -91.79% (p=0.000 n=20)
MemmoveUnalignedSrc/256             209.50n ± 0%    12.41n ± 6%  -94.08% (p=0.000 n=20)
MemmoveUnalignedSrc/512             414.20n ± 0%    20.41n ± 0%  -95.07% (p=0.000 n=20)
MemmoveUnalignedSrc/1024            828.00n ± 0%    36.92n ± 0%  -95.54% (p=0.000 n=20)
MemmoveUnalignedSrc/2048           1648.00n ± 0%    71.41n ± 0%  -95.67% (p=0.000 n=20)
MemmoveUnalignedSrc/4096            3287.0n ± 0%    132.2n ± 0%  -95.98% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_16_0        7.203n ± 0%    5.002n ± 0%  -30.56% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_16_0        7.604n ± 0%    5.002n ± 0%  -34.22% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_16_1       13.210n ± 0%    5.002n ± 0%  -62.13% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_16_1       13.210n ± 0%    5.002n ± 0%  -62.13% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_16_4       13.210n ± 0%    5.002n ± 0%  -62.13% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_16_4       13.610n ± 0%    5.002n ± 0%  -63.24% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_16_7       12.810n ± 0%    5.002n ± 0%  -60.95% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_16_7       13.610n ± 0%    5.002n ± 0%  -63.24% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_0       12.010n ± 0%    7.191n ± 0%  -40.12% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_0       12.410n ± 0%    7.194n ± 0%  -42.03% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_1       18.410n ± 0%    7.604n ± 0%  -58.70% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_1       18.410n ± 0%    7.604n ± 0%  -58.70% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_4       18.410n ± 0%    7.604n ± 0%  -58.70% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_4       18.810n ± 0%    7.604n ± 0%  -59.57% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_7       18.010n ± 0%    7.604n ± 0%  -57.78% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_7       18.810n ± 0%    7.604n ± 0%  -59.57% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_0       31.62n ± 0%    14.19n ± 0%  -55.12% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_256_0       32.02n ± 0%    13.61n ± 0%  -57.50% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_1       38.02n ± 0%    18.20n ± 0%  -52.13% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_256_1       38.02n ± 0%    18.41n ± 0%  -51.58% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_4       38.02n ± 0%    17.21n ± 0%  -54.73% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_256_4       38.42n ± 0%    16.81n ± 0%  -56.25% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_7       37.62n ± 0%    15.61n ± 0%  -58.51% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_256_7       38.42n ± 0%    15.01n ± 0%  -60.93% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_4096_0      415.8n ± 0%    111.1n ± 0%  -73.28% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_4096_0      416.2n ± 0%    110.5n ± 0%  -73.45% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_4096_1      422.2n ± 0%    114.3n ± 0%  -72.93% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_4096_1      422.2n ± 0%    114.7n ± 0%  -72.83% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_4096_4      422.2n ± 0%    113.3n ± 0%  -73.16% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_4096_4      422.6n ± 0%    113.1n ± 0%  -73.24% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_4096_7      421.8n ± 0%    111.7n ± 0%  -73.52% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_4096_7      422.6n ± 0%    111.7n ± 0%  -73.57% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_0     6.568µ ± 0%    4.869µ ± 0%  -25.88% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_0     6.568µ ± 0%    5.009µ ± 0%  -23.74% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_1     6.574µ ± 0%    4.743µ ± 0%  -27.85% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_1     6.574µ ± 0%    4.770µ ± 0%  -27.44% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_4     6.574µ ± 0%    4.758µ ± 0%  -27.63% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_4     6.574µ ± 0%    4.768µ ± 0%  -27.48% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_7     6.574µ ± 0%    4.757µ ± 0%  -27.64% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_7     6.574µ ± 0%    4.583µ ± 0%  -30.29% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/32       30.410n ± 0%    6.804n ± 0%  -77.63% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/64        56.03n ± 0%    10.01n ± 0%  -82.13% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/128      107.30n ± 0%    14.01n ± 0%  -86.94% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/256      209.70n ± 0%    13.43n ± 1%  -93.60% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/512      414.60n ± 0%    22.23n ± 0%  -94.64% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/1024     828.40n ± 0%    37.62n ± 0%  -95.46% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/2048    1648.00n ± 0%    68.04n ± 0%  -95.87% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/4096     3287.0n ± 0%    128.9n ± 0%  -96.08% (p=0.000 n=20)
geomean                              48.94n         13.58n       -72.26%

The relevant performance improved by 72.26%.

Change-Id: If2d3e09c3d687e733e6ff2c50feb8d6a8eb7e63b
Reviewed-on: https://go-review.googlesource.com/c/go/+/589537
Reviewed-by: Tim King <taking@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Auto-Submit: Tim King <taking@google.com>

src/runtime/memmove_loong64.s

index a94cf999bc4af9120478f0e6e8d7e60d9a962157..8827ca0742a4e39a17e65fd8af8b424fc81f83ae 100644 (file)
 
 // See memmove Go doc for important implementation constraints.
 
+// Register map
+//
+// to          R4
+// from                R5
+// n(aka count)        R6
+// to-end      R7
+// from-end    R8
+// data                R11-R18
+// tmp         R9
+
+// Algorithm:
+//
+// Memory alignment check is only performed for copy size greater
+// than 64 bytes to minimize overhead.
+//
+// when copy size <= 64 bytes, jump to label tail, according to the
+// copy size to select the appropriate case and copy directly.
+// Based on the common memory access instructions of loong64, the
+// currently implemented cases are:
+// move_0, move_1, move_2, move_3, move_4, move_5through7, move_8,
+// move_9through16, move_17through32, move_33through64
+//
+// when copy size > 64 bytes, use the destination-aligned copying,
+// adopt the following strategy to copy in 3 parts:
+// 1. Head: do the memory alignment
+// 2. Body: a 64-byte loop structure
+// 3. Tail: processing of the remaining part (<= 64 bytes)
+//
+// forward:
+//
+//    Dst           NewDst                           Dstend
+//     |               |<----count after correction---->|
+//     |<-------------count before correction---------->|
+//     |<--8-(Dst&7)-->|               |<---64 bytes--->|
+//     +------------------------------------------------+
+//     |   Head        |      Body     |      Tail      |
+//     +---------------+---------------+----------------+
+//    NewDst = Dst - (Dst & 7) + 8
+//    count = count - 8 + (Dst & 7)
+//    Src = Src - (Dst & 7) + 8
+//
+// backward:
+//
+//    Dst                             NewDstend          Dstend
+//     |<-----count after correction------>|                |
+//     |<------------count before correction--------------->|
+//     |<---64 bytes--->|                  |<---Dstend&7--->|
+//     +----------------------------------------------------+
+//     |   Tail         |      Body        |      Head      |
+//     +----------------+------------------+----------------+
+//    NewDstend = Dstend - (Dstend & 7)
+//    count = count - (Dstend & 7)
+//    Srcend = Srcend - (Dstend & 7)
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
-       BNE     R6, check
-       RET
+       BEQ     R4, R5, move_0
+       BEQ     R6, move_0
 
-check:
-       SGTU    R4, R5, R7
-       BNE     R7, backward
+       ADDV    R4, R6, R7 // to-end pointer
+       ADDV    R5, R6, R8 // from-end pointer
 
-       ADDV    R4, R6, R9 // end pointer
+tail:
+       //copy size <= 64 bytes, copy directly, not check aligned
 
-       // if the two pointers are not of same alignments, do byte copying
-       SUBVU   R5, R4, R7
-       AND     $7, R7
-       BNE     R7, out
+       // < 2 bytes
+       SGTU    $2, R6, R9
+       BNE     R9, move_1
 
-       // if less than 8 bytes, do byte copying
-       SGTU    $8, R6, R7
-       BNE     R7, out
+       // < 3 bytes
+       SGTU    $3, R6, R9
+       BNE     R9, move_2
 
-       // do one byte at a time until 8-aligned
-       AND     $7, R4, R8
-       BEQ     R8, words
-       MOVB    (R5), R7
-       ADDV    $1, R5
-       MOVB    R7, (R4)
-       ADDV    $1, R4
-       JMP     -6(PC)
-
-words:
-       // do 8 bytes at a time if there is room
-       ADDV    $-7, R9, R6 // R6 is end pointer-7
-
-       PCALIGN $16
-       SGTU    R6, R4, R8
-       BEQ     R8, out
-       MOVV    (R5), R7
-       ADDV    $8, R5
-       MOVV    R7, (R4)
-       ADDV    $8, R4
-       JMP     -6(PC)
-
-out:
-       BEQ     R4, R9, done
-       MOVB    (R5), R7
+       // < 4 bytes
+       SGTU    $4, R6, R9
+       BNE     R9, move_3
+
+       // < 5 bytes
+       SGTU    $5, R6, R9
+       BNE     R9, move_4
+
+       // >= 5 bytes and < 8 bytes
+       SGTU    $8, R6, R9
+       BNE     R9, move_5through7
+
+       // < 9 bytes
+       SGTU    $9, R6, R9
+       BNE     R9, move_8
+
+       // >= 9 bytes and < 17 bytes
+       SGTU    $17, R6, R9
+       BNE     R9, move_9through16
+
+       // >= 17 bytes and < 33 bytes
+       SGTU    $33, R6, R9
+       BNE     R9, move_17through32
+
+       // >= 33 bytes and < 65 bytes
+       SGTU    $65, R6, R9
+       BNE     R9, move_33through64
+
+       // if (dst > src) && (dst < src + count), regarded as memory
+       // overlap, jump to backward
+       // else, jump to forward
+       BGEU    R5, R4, forward
+       ADDV    R5, R6, R10
+       BLTU    R4, R10, backward
+
+forward:
+       AND     $7, R4, R9      // dst & 7
+       BEQ     R9, body
+head:
+       MOVV    $8, R10
+       SUBV    R9, R10         // head = 8 - (dst & 7)
+       MOVB    (R5), R11
+       SUBV    $1, R10
        ADDV    $1, R5
-       MOVB    R7, (R4)
+       MOVB    R11, (R4)
        ADDV    $1, R4
-       JMP     -5(PC)
-done:
-       RET
+       BNE     R10, -5(PC)
+       ADDV    R9, R6
+       ADDV    $-8, R6         // newcount = count + (dst & 7) - 8
+       // if newcount < 65 bytes, use move_33through64 to copy is enough
+       SGTU    $65, R6, R9
+       BNE     R9, move_33through64
 
+body:
+       MOVV    (R5), R11
+       MOVV    8(R5), R12
+       MOVV    16(R5), R13
+       MOVV    24(R5), R14
+       MOVV    32(R5), R15
+       MOVV    40(R5), R16
+       MOVV    48(R5), R17
+       MOVV    56(R5), R18
+       MOVV    R11, (R4)
+       MOVV    R12, 8(R4)
+       MOVV    R13, 16(R4)
+       MOVV    R14, 24(R4)
+       MOVV    R15, 32(R4)
+       MOVV    R16, 40(R4)
+       MOVV    R17, 48(R4)
+       MOVV    R18, 56(R4)
+       ADDV    $-64, R6
+       ADDV    $64, R4
+       ADDV    $64, R5
+       SGTU    $64, R6, R9
+       // if the remaining part >= 64 bytes, jmp to body
+       BEQ     R9, body
+       // if the remaining part == 0 bytes, use move_0 to return
+       BEQ     R6, move_0
+       // if the remaining part in (0, 63] bytes, jmp to tail
+       JMP     tail
+
+// The backward copy algorithm is the same as the forward copy,
+// except for the direction.
 backward:
-       ADDV    R6, R5 // from-end pointer
-       ADDV    R4, R6, R9 // to-end pointer
-
-       // if the two pointers are not of same alignments, do byte copying
-       SUBVU   R9, R5, R7
-       AND     $7, R7
-       BNE     R7, out1
-
-       // if less than 8 bytes, do byte copying
-       SGTU    $8, R6, R7
-       BNE     R7, out1
-
-       // do one byte at a time until 8-aligned
-       AND     $7, R9, R8
-       BEQ     R8, words1
-       ADDV    $-1, R5
-       MOVB    (R5), R7
-       ADDV    $-1, R9
-       MOVB    R7, (R9)
-       JMP     -6(PC)
-
-words1:
-       // do 8 bytes at a time if there is room
-       ADDV    $7, R4, R6 // R6 is start pointer+7
-
-       PCALIGN $16
-       SGTU    R9, R6, R8
-       BEQ     R8, out1
-       ADDV    $-8, R5
-       MOVV    (R5), R7
-       ADDV    $-8, R9
-       MOVV    R7, (R9)
-       JMP     -6(PC)
-
-out1:
-       BEQ     R4, R9, done1
-       ADDV    $-1, R5
-       MOVB    (R5), R7
-       ADDV    $-1, R9
-       MOVB    R7, (R9)
-       JMP     -5(PC)
-done1:
+       AND     $7, R7, R9       // dstend & 7
+       BEQ     R9, b_body
+b_head:
+       MOVV    -8(R8), R11
+       SUBV    R9, R6          // newcount = count - (dstend & 7)
+       SUBV    R9, R8          // newsrcend = srcend - (dstend & 7)
+       MOVV    -8(R8), R12
+       MOVV    R11, -8(R7)
+       SUBV    R9, R7          // newdstend = dstend - (dstend & 7)
+       MOVV    R12, -8(R7)
+       SUBV    $8, R6
+       SUBV    $8, R7
+       SUBV    $8, R8
+       SGTU    $65, R6, R9
+       BNE     R9, move_33through64
+
+b_body:
+       MOVV    -8(R8), R11
+       MOVV    -16(R8), R12
+       MOVV    -24(R8), R13
+       MOVV    -32(R8), R14
+       MOVV    -40(R8), R15
+       MOVV    -48(R8), R16
+       MOVV    -56(R8), R17
+       MOVV    -64(R8), R18
+       MOVV    R11, -8(R7)
+       MOVV    R12, -16(R7)
+       MOVV    R13, -24(R7)
+       MOVV    R14, -32(R7)
+       MOVV    R15, -40(R7)
+       MOVV    R16, -48(R7)
+       MOVV    R17, -56(R7)
+       MOVV    R18, -64(R7)
+       ADDV    $-64, R6
+       ADDV    $-64, R7
+       ADDV    $-64, R8
+       SGTU    $64, R6, R9
+       BEQ     R9, b_body
+       BEQ     R6, move_0
+       JMP     tail
+
+move_0:
+       RET
+
+move_1:
+       MOVB    (R5), R11
+       MOVB    R11, (R4)
+       RET
+move_2:
+       MOVH    (R5), R11
+       MOVH    R11, (R4)
+       RET
+move_3:
+       MOVH    (R5), R11
+       MOVB    -1(R8), R12
+       MOVH    R11, (R4)
+       MOVB    R12, -1(R7)
+       RET
+move_4:
+       MOVW    (R5), R11
+       MOVW    R11, (R4)
+       RET
+move_5through7:
+       MOVW    (R5), R11
+       MOVW    -4(R8), R12
+       MOVW    R11, (R4)
+       MOVW    R12, -4(R7)
+       RET
+move_8:
+       MOVV    (R5), R11
+       MOVV    R11, (R4)
+       RET
+move_9through16:
+       MOVV    (R5), R11
+       MOVV    -8(R8), R12
+       MOVV    R11, (R4)
+       MOVV    R12, -8(R7)
+       RET
+move_17through32:
+       MOVV    (R5), R11
+       MOVV    8(R5), R12
+       MOVV    -16(R8), R13
+       MOVV    -8(R8), R14
+       MOVV    R11, (R4)
+       MOVV    R12, 8(R4)
+       MOVV    R13, -16(R7)
+       MOVV    R14, -8(R7)
+       RET
+move_33through64:
+       MOVV    (R5), R11
+       MOVV    8(R5), R12
+       MOVV    16(R5), R13
+       MOVV    24(R5), R14
+       MOVV    -32(R8), R15
+       MOVV    -24(R8), R16
+       MOVV    -16(R8), R17
+       MOVV    -8(R8), R18
+       MOVV    R11, (R4)
+       MOVV    R12, 8(R4)
+       MOVV    R13, 16(R4)
+       MOVV    R14, 24(R4)
+       MOVV    R15, -32(R7)
+       MOVV    R16, -24(R7)
+       MOVV    R17, -16(R7)
+       MOVV    R18, -8(R7)
        RET