From c6a11f0dd279f374602794af60c7cde4585a1e6f Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 11 Aug 2020 13:04:48 -0700 Subject: [PATCH] crypto,internal/bytealg: fix assembly that clobbers BP MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit BP should be callee-save. It will be saved automatically if there is a nonzero frame size. Otherwise, we need to avoid this register. Change-Id: If3f551efa42d830c8793d9f0183cb8daad7a2ab5 Reviewed-on: https://go-review.googlesource.com/c/go/+/248260 Run-TryBot: Keith Randall Reviewed-by: Michael Knyszek Reviewed-by: Martin Möhrmann TryBot-Result: Gobot Gobot --- src/crypto/elliptic/p256_asm_amd64.s | 5 ++- src/crypto/md5/md5block_amd64.s | 2 +- src/internal/bytealg/index_amd64.s | 52 ++++++++++++++-------------- src/runtime/sys_linux_amd64.s | 8 ++--- 4 files changed, 33 insertions(+), 34 deletions(-) diff --git a/src/crypto/elliptic/p256_asm_amd64.s b/src/crypto/elliptic/p256_asm_amd64.s index 7afa54a58c..c77b11bcf2 100644 --- a/src/crypto/elliptic/p256_asm_amd64.s +++ b/src/crypto/elliptic/p256_asm_amd64.s @@ -1336,7 +1336,7 @@ TEXT p256SubInternal(SB),NOSPLIT,$0 RET /* ---------------------------------------*/ -TEXT p256MulInternal(SB),NOSPLIT,$0 +TEXT p256MulInternal(SB),NOSPLIT,$8 MOVQ acc4, mul0 MULQ t0 MOVQ mul0, acc0 @@ -1519,7 +1519,7 @@ TEXT p256MulInternal(SB),NOSPLIT,$0 RET /* ---------------------------------------*/ -TEXT p256SqrInternal(SB),NOSPLIT,$0 +TEXT p256SqrInternal(SB),NOSPLIT,$8 MOVQ acc4, mul0 MULQ acc5 @@ -2345,4 +2345,3 @@ TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48 RET /* ---------------------------------------*/ - diff --git a/src/crypto/md5/md5block_amd64.s b/src/crypto/md5/md5block_amd64.s index 90d932b146..7c7d92d7e8 100644 --- a/src/crypto/md5/md5block_amd64.s +++ b/src/crypto/md5/md5block_amd64.s @@ -13,7 +13,7 @@ // Licence: I hereby disclaim the copyright on this code and place it // in the public domain. -TEXT ·block(SB),NOSPLIT,$0-32 +TEXT ·block(SB),NOSPLIT,$8-32 MOVQ dig+0(FP), BP MOVQ p+8(FP), SI MOVQ p_len+16(FP), DX diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s index 4459820801..6193b57239 100644 --- a/src/internal/bytealg/index_amd64.s +++ b/src/internal/bytealg/index_amd64.s @@ -8,7 +8,7 @@ TEXT ·Index(SB),NOSPLIT,$0-56 MOVQ a_base+0(FP), DI MOVQ a_len+8(FP), DX - MOVQ b_base+24(FP), BP + MOVQ b_base+24(FP), R8 MOVQ b_len+32(FP), AX MOVQ DI, R10 LEAQ ret+48(FP), R11 @@ -17,7 +17,7 @@ TEXT ·Index(SB),NOSPLIT,$0-56 TEXT ·IndexString(SB),NOSPLIT,$0-40 MOVQ a_base+0(FP), DI MOVQ a_len+8(FP), DX - MOVQ b_base+16(FP), BP + MOVQ b_base+16(FP), R8 MOVQ b_len+24(FP), AX MOVQ DI, R10 LEAQ ret+32(FP), R11 @@ -26,7 +26,7 @@ TEXT ·IndexString(SB),NOSPLIT,$0-40 // AX: length of string, that we are searching for // DX: length of string, in which we are searching // DI: pointer to string, in which we are searching -// BP: pointer to string, that we are searching for +// R8: pointer to string, that we are searching for // R11: address, where to put return value // Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them TEXT indexbody<>(SB),NOSPLIT,$0 @@ -37,11 +37,11 @@ TEXT indexbody<>(SB),NOSPLIT,$0 no_sse42: CMPQ AX, $2 JA _3_or_more - MOVW (BP), BP + MOVW (R8), R8 LEAQ -1(DI)(DX*1), DX loop2: MOVW (DI), SI - CMPW SI,BP + CMPW SI,R8 JZ success ADDQ $1,DI CMPQ DI,DX @@ -50,12 +50,12 @@ loop2: _3_or_more: CMPQ AX, $3 JA _4_or_more - MOVW 1(BP), BX - MOVW (BP), BP + MOVW 1(R8), BX + MOVW (R8), R8 LEAQ -2(DI)(DX*1), DX loop3: MOVW (DI), SI - CMPW SI,BP + CMPW SI,R8 JZ partial_success3 ADDQ $1,DI CMPQ DI,DX @@ -72,11 +72,11 @@ partial_success3: _4_or_more: CMPQ AX, $4 JA _5_or_more - MOVL (BP), BP + MOVL (R8), R8 LEAQ -3(DI)(DX*1), DX loop4: MOVL (DI), SI - CMPL SI,BP + CMPL SI,R8 JZ success ADDQ $1,DI CMPQ DI,DX @@ -87,11 +87,11 @@ _5_or_more: JA _8_or_more LEAQ 1(DI)(DX*1), DX SUBQ AX, DX - MOVL -4(BP)(AX*1), BX - MOVL (BP), BP + MOVL -4(R8)(AX*1), BX + MOVL (R8), R8 loop5to7: MOVL (DI), SI - CMPL SI,BP + CMPL SI,R8 JZ partial_success5to7 ADDQ $1,DI CMPQ DI,DX @@ -108,11 +108,11 @@ partial_success5to7: _8_or_more: CMPQ AX, $8 JA _9_or_more - MOVQ (BP), BP + MOVQ (R8), R8 LEAQ -7(DI)(DX*1), DX loop8: MOVQ (DI), SI - CMPQ SI,BP + CMPQ SI,R8 JZ success ADDQ $1,DI CMPQ DI,DX @@ -123,11 +123,11 @@ _9_or_more: JA _16_or_more LEAQ 1(DI)(DX*1), DX SUBQ AX, DX - MOVQ -8(BP)(AX*1), BX - MOVQ (BP), BP + MOVQ -8(R8)(AX*1), BX + MOVQ (R8), R8 loop9to15: MOVQ (DI), SI - CMPQ SI,BP + CMPQ SI,R8 JZ partial_success9to15 ADDQ $1,DI CMPQ DI,DX @@ -144,7 +144,7 @@ partial_success9to15: _16_or_more: CMPQ AX, $16 JA _17_or_more - MOVOU (BP), X1 + MOVOU (R8), X1 LEAQ -15(DI)(DX*1), DX loop16: MOVOU (DI), X2 @@ -161,8 +161,8 @@ _17_or_more: JA _32_or_more LEAQ 1(DI)(DX*1), DX SUBQ AX, DX - MOVOU -16(BP)(AX*1), X0 - MOVOU (BP), X1 + MOVOU -16(R8)(AX*1), X0 + MOVOU (R8), X1 loop17to31: MOVOU (DI), X2 PCMPEQB X1,X2 @@ -188,7 +188,7 @@ partial_success17to31: _32_or_more: CMPQ AX, $32 JA _33_to_63 - VMOVDQU (BP), Y1 + VMOVDQU (R8), Y1 LEAQ -31(DI)(DX*1), DX loop32: VMOVDQU (DI), Y2 @@ -203,8 +203,8 @@ loop32: _33_to_63: LEAQ 1(DI)(DX*1), DX SUBQ AX, DX - VMOVDQU -32(BP)(AX*1), Y0 - VMOVDQU (BP), Y1 + VMOVDQU -32(R8)(AX*1), Y0 + VMOVDQU (R8), Y1 loop33to63: VMOVDQU (DI), Y2 VPCMPEQB Y1, Y2, Y3 @@ -241,10 +241,10 @@ sse42: // This value was determined experimentally and is the ~same // on Nehalem (first with SSE42) and Haswell. JAE _9_or_more - LEAQ 16(BP), SI + LEAQ 16(R8), SI TESTW $0xff0, SI JEQ no_sse42 - MOVOU (BP), X1 + MOVOU (R8), X1 LEAQ -15(DI)(DX*1), SI MOVQ $16, R9 SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index b60057ce83..621c01b365 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -212,7 +212,7 @@ TEXT runtime·walltime1(SB),NOSPLIT,$16-12 // due to stack probes inserted to avoid stack/heap collisions. // See issue #20427. - MOVQ SP, BP // Save old SP; BP unchanged by C code. + MOVQ SP, R12 // Save old SP; R12 unchanged by C code. get_tls(CX) MOVQ g(CX), AX @@ -250,7 +250,7 @@ noswitch: MOVQ 0(SP), AX // sec MOVQ 8(SP), DX // nsec ret: - MOVQ BP, SP // Restore real SP + MOVQ R12, SP // Restore real SP // Restore vdsoPC, vdsoSP // We don't worry about being signaled between the two stores. // If we are not in a signal handler, we'll restore vdsoSP to 0, @@ -277,7 +277,7 @@ fallback: TEXT runtime·nanotime1(SB),NOSPLIT,$16-8 // Switch to g0 stack. See comment above in runtime·walltime. - MOVQ SP, BP // Save old SP; BP unchanged by C code. + MOVQ SP, R12 // Save old SP; R12 unchanged by C code. get_tls(CX) MOVQ g(CX), AX @@ -315,7 +315,7 @@ noswitch: MOVQ 0(SP), AX // sec MOVQ 8(SP), DX // nsec ret: - MOVQ BP, SP // Restore real SP + MOVQ R12, SP // Restore real SP // Restore vdsoPC, vdsoSP // We don't worry about being signaled between the two stores. // If we are not in a signal handler, we'll restore vdsoSP to 0, -- 2.50.0