From c6a11f0dd279f374602794af60c7cde4585a1e6f Mon Sep 17 00:00:00 2001
From: Keith Randall <khr@golang.org>
Date: Tue, 11 Aug 2020 13:04:48 -0700
Subject: [PATCH] crypto,internal/bytealg: fix assembly that clobbers BP
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

BP should be callee-save. It will be saved automatically if
there is a nonzero frame size. Otherwise, we need to avoid this register.

Change-Id: If3f551efa42d830c8793d9f0183cb8daad7a2ab5
Reviewed-on: https://go-review.googlesource.com/c/go/+/248260
Run-TryBot: Keith Randall <khr@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
---
 src/crypto/elliptic/p256_asm_amd64.s |  5 ++-
 src/crypto/md5/md5block_amd64.s      |  2 +-
 src/internal/bytealg/index_amd64.s   | 52 ++++++++++++++--------------
 src/runtime/sys_linux_amd64.s        |  8 ++---
 4 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/src/crypto/elliptic/p256_asm_amd64.s b/src/crypto/elliptic/p256_asm_amd64.s
index 7afa54a58c..c77b11bcf2 100644
--- a/src/crypto/elliptic/p256_asm_amd64.s
+++ b/src/crypto/elliptic/p256_asm_amd64.s
@@ -1336,7 +1336,7 @@ TEXT p256SubInternal(SB),NOSPLIT,$0
 
 	RET
 /* ---------------------------------------*/
-TEXT p256MulInternal(SB),NOSPLIT,$0
+TEXT p256MulInternal(SB),NOSPLIT,$8
 	MOVQ acc4, mul0
 	MULQ t0
 	MOVQ mul0, acc0
@@ -1519,7 +1519,7 @@ TEXT p256MulInternal(SB),NOSPLIT,$0
 
 	RET
 /* ---------------------------------------*/
-TEXT p256SqrInternal(SB),NOSPLIT,$0
+TEXT p256SqrInternal(SB),NOSPLIT,$8
 
 	MOVQ acc4, mul0
 	MULQ acc5
@@ -2345,4 +2345,3 @@ TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
 
 	RET
 /* ---------------------------------------*/
-
diff --git a/src/crypto/md5/md5block_amd64.s b/src/crypto/md5/md5block_amd64.s
index 90d932b146..7c7d92d7e8 100644
--- a/src/crypto/md5/md5block_amd64.s
+++ b/src/crypto/md5/md5block_amd64.s
@@ -13,7 +13,7 @@
 // Licence: I hereby disclaim the copyright on this code and place it
 // in the public domain.
 
-TEXT	·block(SB),NOSPLIT,$0-32
+TEXT	·block(SB),NOSPLIT,$8-32
 	MOVQ	dig+0(FP),	BP
 	MOVQ	p+8(FP),	SI
 	MOVQ	p_len+16(FP), DX
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
index 4459820801..6193b57239 100644
--- a/src/internal/bytealg/index_amd64.s
+++ b/src/internal/bytealg/index_amd64.s
@@ -8,7 +8,7 @@
 TEXT ·Index(SB),NOSPLIT,$0-56
 	MOVQ a_base+0(FP), DI
 	MOVQ a_len+8(FP), DX
-	MOVQ b_base+24(FP), BP
+	MOVQ b_base+24(FP), R8
 	MOVQ b_len+32(FP), AX
 	MOVQ DI, R10
 	LEAQ ret+48(FP), R11
@@ -17,7 +17,7 @@ TEXT ·Index(SB),NOSPLIT,$0-56
 TEXT ·IndexString(SB),NOSPLIT,$0-40
 	MOVQ a_base+0(FP), DI
 	MOVQ a_len+8(FP), DX
-	MOVQ b_base+16(FP), BP
+	MOVQ b_base+16(FP), R8
 	MOVQ b_len+24(FP), AX
 	MOVQ DI, R10
 	LEAQ ret+32(FP), R11
@@ -26,7 +26,7 @@ TEXT ·IndexString(SB),NOSPLIT,$0-40
 // AX: length of string, that we are searching for
 // DX: length of string, in which we are searching
 // DI: pointer to string, in which we are searching
-// BP: pointer to string, that we are searching for
+// R8: pointer to string, that we are searching for
 // R11: address, where to put return value
 // Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
 TEXT indexbody<>(SB),NOSPLIT,$0
@@ -37,11 +37,11 @@ TEXT indexbody<>(SB),NOSPLIT,$0
 no_sse42:
 	CMPQ AX, $2
 	JA   _3_or_more
-	MOVW (BP), BP
+	MOVW (R8), R8
 	LEAQ -1(DI)(DX*1), DX
 loop2:
 	MOVW (DI), SI
-	CMPW SI,BP
+	CMPW SI,R8
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -50,12 +50,12 @@ loop2:
 _3_or_more:
 	CMPQ AX, $3
 	JA   _4_or_more
-	MOVW 1(BP), BX
-	MOVW (BP), BP
+	MOVW 1(R8), BX
+	MOVW (R8), R8
 	LEAQ -2(DI)(DX*1), DX
 loop3:
 	MOVW (DI), SI
-	CMPW SI,BP
+	CMPW SI,R8
 	JZ   partial_success3
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -72,11 +72,11 @@ partial_success3:
 _4_or_more:
 	CMPQ AX, $4
 	JA   _5_or_more
-	MOVL (BP), BP
+	MOVL (R8), R8
 	LEAQ -3(DI)(DX*1), DX
 loop4:
 	MOVL (DI), SI
-	CMPL SI,BP
+	CMPL SI,R8
 	JZ   success
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -87,11 +87,11 @@ _5_or_more:
 	JA   _8_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
-	MOVL -4(BP)(AX*1), BX
-	MOVL (BP), BP
+	MOVL -4(R8)(AX*1), BX
+	MOVL (R8), R8
 loop5to7:
 	MOVL (DI), SI
-	CMPL SI,BP
+	CMPL SI,R8
 	JZ   partial_success5to7
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -108,11 +108,11 @@ partial_success5to7:
 _8_or_more:
 	CMPQ AX, $8
 	JA   _9_or_more
-	MOVQ (BP), BP
+	MOVQ (R8), R8
 	LEAQ -7(DI)(DX*1), DX
 loop8:
 	MOVQ (DI), SI
-	CMPQ SI,BP
+	CMPQ SI,R8
 	JZ   success
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -123,11 +123,11 @@ _9_or_more:
 	JA   _16_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
-	MOVQ -8(BP)(AX*1), BX
-	MOVQ (BP), BP
+	MOVQ -8(R8)(AX*1), BX
+	MOVQ (R8), R8
 loop9to15:
 	MOVQ (DI), SI
-	CMPQ SI,BP
+	CMPQ SI,R8
 	JZ   partial_success9to15
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -144,7 +144,7 @@ partial_success9to15:
 _16_or_more:
 	CMPQ AX, $16
 	JA   _17_or_more
-	MOVOU (BP), X1
+	MOVOU (R8), X1
 	LEAQ -15(DI)(DX*1), DX
 loop16:
 	MOVOU (DI), X2
@@ -161,8 +161,8 @@ _17_or_more:
 	JA   _32_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
-	MOVOU -16(BP)(AX*1), X0
-	MOVOU (BP), X1
+	MOVOU -16(R8)(AX*1), X0
+	MOVOU (R8), X1
 loop17to31:
 	MOVOU (DI), X2
 	PCMPEQB X1,X2
@@ -188,7 +188,7 @@ partial_success17to31:
 _32_or_more:
 	CMPQ AX, $32
 	JA   _33_to_63
-	VMOVDQU (BP), Y1
+	VMOVDQU (R8), Y1
 	LEAQ -31(DI)(DX*1), DX
 loop32:
 	VMOVDQU (DI), Y2
@@ -203,8 +203,8 @@ loop32:
 _33_to_63:
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
-	VMOVDQU -32(BP)(AX*1), Y0
-	VMOVDQU (BP), Y1
+	VMOVDQU -32(R8)(AX*1), Y0
+	VMOVDQU (R8), Y1
 loop33to63:
 	VMOVDQU (DI), Y2
 	VPCMPEQB Y1, Y2, Y3
@@ -241,10 +241,10 @@ sse42:
 	// This value was determined experimentally and is the ~same
 	// on Nehalem (first with SSE42) and Haswell.
 	JAE _9_or_more
-	LEAQ 16(BP), SI
+	LEAQ 16(R8), SI
 	TESTW $0xff0, SI
 	JEQ no_sse42
-	MOVOU (BP), X1
+	MOVOU (R8), X1
 	LEAQ -15(DI)(DX*1), SI
 	MOVQ $16, R9
 	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index b60057ce83..621c01b365 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -212,7 +212,7 @@ TEXT runtime·walltime1(SB),NOSPLIT,$16-12
 	// due to stack probes inserted to avoid stack/heap collisions.
 	// See issue #20427.
 
-	MOVQ	SP, BP	// Save old SP; BP unchanged by C code.
+	MOVQ	SP, R12	// Save old SP; R12 unchanged by C code.
 
 	get_tls(CX)
 	MOVQ	g(CX), AX
@@ -250,7 +250,7 @@ noswitch:
 	MOVQ	0(SP), AX	// sec
 	MOVQ	8(SP), DX	// nsec
 ret:
-	MOVQ	BP, SP		// Restore real SP
+	MOVQ	R12, SP		// Restore real SP
 	// Restore vdsoPC, vdsoSP
 	// We don't worry about being signaled between the two stores.
 	// If we are not in a signal handler, we'll restore vdsoSP to 0,
@@ -277,7 +277,7 @@ fallback:
 TEXT runtime·nanotime1(SB),NOSPLIT,$16-8
 	// Switch to g0 stack. See comment above in runtime·walltime.
 
-	MOVQ	SP, BP	// Save old SP; BP unchanged by C code.
+	MOVQ	SP, R12	// Save old SP; R12 unchanged by C code.
 
 	get_tls(CX)
 	MOVQ	g(CX), AX
@@ -315,7 +315,7 @@ noswitch:
 	MOVQ	0(SP), AX	// sec
 	MOVQ	8(SP), DX	// nsec
 ret:
-	MOVQ	BP, SP		// Restore real SP
+	MOVQ	R12, SP		// Restore real SP
 	// Restore vdsoPC, vdsoSP
 	// We don't worry about being signaled between the two stores.
 	// If we are not in a signal handler, we'll restore vdsoSP to 0,
-- 
2.50.0