From 56c4422770c076b86c6c66d8d96bed31b66aaa28 Mon Sep 17 00:00:00 2001 From: Archana R Date: Thu, 23 Mar 2023 12:35:39 -0500 Subject: [PATCH] runtime: improve index on ppc64x/power10 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Rewrite index asm function to use the new power10 instruction lxvl, stxvl or the load, store vector with length which can specify the number of bytes to be stored in a register. This avoids the need to create a separator mask and extra AND instructions. It also allows us to process the tail end of the string using a lot fewer instructions as we can load bytes of separator length directly rather than loading 16 bytes and masking out bytes that are greater than separator length On power9 and power8 the code remains unchanged. The performance for smaller sizes improve the most, on larger sizes we see minimal improvement. name old time/op new time/op delta Index/10 10.6ns ± 3% 9.8ns ± 2% -7.20% Index/11 11.2ns ± 4% 10.6ns ± 0% -5.99% Index/12 12.7ns ± 3% 11.3ns ± 0% -11.21% Index/13 13.5ns ± 2% 11.7ns ± 0% -13.11% Index/14 14.1ns ± 1% 12.0ns ± 0% -14.43% Index/15 14.3ns ± 2% 12.4ns ± 0% -13.39% Index/16 14.5ns ± 1% 12.7ns ± 0% -12.57% Index/17 26.7ns ± 0% 25.9ns ± 0% -2.99% Index/18 27.3ns ± 0% 26.4ns ± 1% -3.35% Index/19 35.7ns ±16% 26.1ns ± 1% -26.87% Index/20 29.4ns ± 0% 27.3ns ± 1% -7.06% Index/21 29.3ns ± 0% 26.9ns ± 1% -8.37% Index/22 30.0ns ± 0% 27.4ns ± 0% -8.68% Index/23 29.9ns ± 0% 27.7ns ± 0% -7.15% Index/24 31.0ns ± 0% 28.0ns ± 0% -9.92% Index/25 31.7ns ± 0% 28.4ns ± 0% -10.54% Index/26 30.6ns ± 0% 28.9ns ± 1% -5.67% Index/27 31.4ns ± 0% 29.3ns ± 0% -6.71% Index/28 32.7ns ± 0% 29.6ns ± 1% -9.36% Index/29 33.3ns ± 0% 30.1ns ± 1% -9.70% Index/30 32.4ns ± 0% 30.7ns ± 0% -5.23% Index/31 33.2ns ± 0% 30.6ns ± 1% -7.83% Index/32 34.3ns ± 0% 30.9ns ± 0% -9.94% Index/64 46.8ns ± 0% 44.2ns ± 0% -5.66% Index/128 71.2ns ± 0% 67.3ns ± 0% -5.43% Index/256 129ns ± 0% 127ns ± 0% -1.67% Index/2K 838ns ± 0% 804ns ± 0% -4.03% Index/4K 1.65µs ± 0% 1.58µs ± 0% -4.25% Index/2M 829µs ± 0% 793µs ± 0% -4.42% Index/4M 1.65ms ± 0% 1.59ms ± 0% -4.19% Index/64M 26.5ms ± 0% 25.4ms ± 0% -4.18% IndexHard2 412µs ± 0% 396µs ± 0% -3.76% IndexEasy/10 10.0ns ± 0% 9.3ns ± 1% -7.20% IndexEasy/11 10.8ns ± 1% 11.0ns ± 1% +2.22% IndexEasy/12 12.3ns ± 2% 11.5ns ± 1% -6.37% IndexEasy/13 13.1ns ± 0% 11.7ns ± 2% -10.83% IndexEasy/14 13.8ns ± 2% 11.9ns ± 1% -13.52% IndexEasy/15 14.0ns ± 2% 12.4ns ± 2% -11.46% IndexEasy/16 14.3ns ± 1% 12.5ns ± 0% -12.40% CountHard2 415µs ± 0% 396µs ± 0% -4.48% Change-Id: Id3efa5ed9c662a29f58125c7f866a09f29a59b6c Reviewed-on: https://go-review.googlesource.com/c/go/+/478918 Reviewed-by: Dmitri Shuralyov Reviewed-by: Paul Murphy Run-TryBot: Archana Ravindar Reviewed-by: David Chase TryBot-Result: Gopher Robot Reviewed-by: Lynn Boger --- src/internal/bytealg/index_ppc64x.s | 63 +++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/src/internal/bytealg/index_ppc64x.s b/src/internal/bytealg/index_ppc64x.s index 26205cebaf..e98f96b715 100644 --- a/src/internal/bytealg/index_ppc64x.s +++ b/src/internal/bytealg/index_ppc64x.s @@ -434,41 +434,49 @@ TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0 CMP R6, $0 // Check sep len BEQ notfound // sep len 0 -- not found MOVD R3, R7 // Copy of string addr +#ifndef GOPPC64_power10 MOVD $16, R16 // Index value 16 MOVD $17, R17 // Index value 17 MOVD $18, R18 // Index value 18 - MOVD $1, R19 // Index value 1 VSPLTISB $0xFF, ONES // splat all 1s - - CMP R6, $16, CR4 // CR4 for len(sep) >= 16 VOR ONES, ONES, SEPMASK // Set up full SEPMASK +#else + SLD $56, R6, R14 // Set up separator length for LXVLL +#endif + MOVD $1, R19 // Index value 1 + CMP R6, $16, CR4 // CR4 for len(sep) >= 16 BGE CR4, loadge16 // Load for len(sep) >= 16 +#ifndef GOPPC64_power10 SUB R6, R16, R9 // 16-len of sep SLD $3, R9 // Set up for VSLO MTVSRD R9, V9 // Set up for VSLO VSLDOI $8, V9, V9, V9 // Set up for VSLO VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16 - +#endif loadge16: ANDCC $15, R5, R9 // Find byte offset of sep ADD R9, R6, R10 // Add sep len CMP R10, $16 // Check if sep len+offset > 16 BGT sepcross16 // Sep crosses 16 byte boundary - +#ifdef GOPPC64_power10 + LXVLL R5, R14, V0 // Load separator +#else RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0 SLD $3, R9 // Set up shift count for VSLO MTVSRD R9, V8 // Set up shift count for VSLO VSLDOI $8, V8, V8, V8 VSLO V0, V8, V0 // Shift by start byte - VAND V0, SEPMASK, V0 // Mask separator (< 16) - BR index2plus - +#endif + BR index2plus sepcross16: - LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0 - +#ifdef GOPPC64_power10 + LXVLL R5, R14, V0 // Load separator +#else + LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0\ VAND V0, SEPMASK, V0 // mask out separator +#endif BLE CR4, index2to16 BR index17plus // Handle sep > 16 @@ -659,9 +667,23 @@ index2to16: VSPLTISB $0, V10 // Clear BGT index2to16tail - MOVD $3, R17 // Number of bytes beyond 16 +#ifdef GOPPC64_power10 + ADD $3,R7, R17 // Base+3 + ADD $2,R7, R8 // Base+2 + ADD $1,R7, R10 // Base+1 +#else + MOVD $3, R17 // Number of bytes beyond 16 +#endif PCALIGN $32 + index2to16loop: + +#ifdef GOPPC64_power10 + LXVLL R7, R14, V8 // Load next 16 bytes of string from Base + LXVLL R10, R14, V9 // Load next 16 bytes of string from Base+1 + LXVLL R8, R14, V11 // Load next 16 bytes of string from Base+2 + LXVLL R17,R14, V12 // Load next 16 bytes of string from Base+3 +#else LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7 LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3 @@ -672,6 +694,7 @@ index2to16loop: VAND V3, SEPMASK, V9 // Mask out sep size 1st index VAND V4, SEPMASK, V11 // Mask out sep size 2nd index VAND V5, SEPMASK, V12 // Mask out sep size 3rd index +#endif VCMPEQUBCC V0, V8, V8 // compare masked string BLT CR6, found // All equal while comparing 0th index VCMPEQUBCC V0, V9, V9 // compare masked string @@ -682,14 +705,29 @@ index2to16loop: BLT CR6, found4 // All equal while comparing 3rd index ADD $4, R7 // Update ptr to next 4 bytes +#ifdef GOPPC64_power10 + ADD $4, R17 // Update ptr to next 4 bytes + ADD $4, R8 // Update ptr to next 4 bytes + ADD $4, R10 // Update ptr to next 4 bytes +#endif CMP R7, LASTSTR // Still less than last start byte BGT notfound // Not found ADD $19, R7, R9 // Verify remaining bytes CMP R9, LASTBYTE // length of string at least 19 BLE index2to16loop // Try again, else do post processing and jump to index2to16next - + PCALIGN $32 // <19 bytes left, post process the remaining string index2to16tail: +#ifdef GOPPC64_power10 +index2to16next_p10: + LXVLL R7,R14, V1 // Load 16 bytes @R7 into V1 + VCMPEQUBCC V1, V0, V3 // Compare sep and partial string + BLT CR6, found // Found + ADD $1, R7 // Not found, try next partial string + CMP R7, LASTSTR // Check for end of string + BLE index2to16next_p10 // If at end, then not found + BR notfound // go to remainder loop +#else ADD R3, R4, R9 // End of string SUB R7, R9, R9 // Number of bytes left ANDCC $15, R7, R10 // 16 byte offset @@ -748,6 +786,7 @@ index2to16next: BGT notfound // If at end, then not found VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte BR index2to16next // Check the next partial string +#endif // Tail processing if GOPPC64!=power10 index17plus: CMP R6, $32 // Check if 17 < len(sep) <= 32 -- 2.48.1