]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: optimize Index/IndexString in amd64
authorMauri de Souza Meneguzzo <mauri870@gmail.com>
Sun, 6 Aug 2023 23:34:18 +0000 (23:34 +0000)
committerGopher Robot <gobot@golang.org>
Mon, 7 Aug 2023 00:20:48 +0000 (00:20 +0000)
Now with PCALIGN available on amd64 we can start optimizing some routines that benefit from instruction alignment.

```
                         │    sec/op     │    sec/op     vs base               │
IndexByte/4K-16             69.89n ± ∞ ¹   45.88n ± ∞ ¹  -34.35% (p=0.008 n=5)
IndexByte/4M-16             65.36µ ± ∞ ¹   47.32µ ± ∞ ¹  -27.60% (p=0.008 n=5)
IndexByte/64M-16            1.435m ± ∞ ¹   1.140m ± ∞ ¹  -20.57% (p=0.008 n=5)
                         │      B/s      │      B/s       vs base               │
IndexByte/4K-16            54.58Gi ± ∞ ¹   83.14Gi ± ∞ ¹  +52.32% (p=0.008 n=5)
IndexByte/4M-16            59.76Gi ± ∞ ¹   82.54Gi ± ∞ ¹  +38.12% (p=0.008 n=5)
IndexByte/64M-16           43.56Gi ± ∞ ¹   54.84Gi ± ∞ ¹  +25.89% (p=0.008 n=5)
```

Change-Id: Iff3dfd542c55e7569242be81f38b2887b9e04e87
GitHub-Last-Rev: f309f898b13ad8fdf88a21f2f105382db9ada2f5
GitHub-Pull-Request: golang/go#61792
Reviewed-on: https://go-review.googlesource.com/c/go/+/516435
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
src/internal/bytealg/index_amd64.s

index 04314917b8918441df9baf13a0e7da0af57c4f1d..31730e539415b6ee1aaa40f4ce98b203c9653dbe 100644 (file)
@@ -39,6 +39,7 @@ no_sse42:
        JA   _3_or_more
        MOVW (R8), R8
        LEAQ -1(DI)(DX*1), DX
+       PCALIGN $16
 loop2:
        MOVW (DI), SI
        CMPW SI,R8
@@ -250,6 +251,7 @@ sse42:
        LEAQ -15(DI)(DX*1), SI
        MOVQ $16, R9
        SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+       PCALIGN $16
 loop_sse42:
        // 0x0c means: unsigned byte compare (bits 0,1 are 00)
        // for equality (bits 2,3 are 11)