TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
MOVD b+0(FP), R0
- MOVD b_len+8(FP), R1
- MOVBU c+24(FP), R2 // byte to find
- MOVD R0, R4 // store base for later
- ADD R0, R1 // end
-loop:
- CMP R0, R1
- BEQ notfound
- MOVBU.P 1(R0), R3
- CMP R2, R3
- BNE loop
-
- SUB $1, R0 // R0 will be one beyond the position we want
- SUB R4, R0 // remove base
- MOVD R0, ret+32(FP)
- RET
-
-notfound:
- MOVD $-1, R0
- MOVD R0, ret+32(FP)
- RET
+ MOVD b_len+8(FP), R2
+ MOVBU c+24(FP), R1
+ MOVD $ret+32(FP), R8
+ B runtime·indexbytebody<>(SB)
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
MOVD s+0(FP), R0
- MOVD s_len+8(FP), R1
- MOVBU c+16(FP), R2 // byte to find
- MOVD R0, R4 // store base for later
- ADD R0, R1 // end
+ MOVD s_len+8(FP), R2
+ MOVBU c+16(FP), R1
+ MOVD $ret+24(FP), R8
+ B runtime·indexbytebody<>(SB)
+
+// input:
+// R0: data
+// R1: byte to search
+// R2: data len
+// R8: address to put result
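+//
+// output:
+// The index of the first occurrence of R1 in the data, or -1 if it
+// is not present, is stored to the address in R8.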
+TEXT runtime·indexbytebody<>(SB),NOSPLIT,$0
+ // Core algorithm:
+ // For each 32-byte chunk we calculate a 64-bit syndrome value,
+ // with two bits per byte. For each tuple, bit 0 is set if the
+ // relevant byte matched the requested character and bit 1 is
+ // not used (faster than using a 32-bit syndrome). Since the bits
+ // in the syndrome reflect exactly the order in which things occur
+ // in the original string, counting trailing zeros allows us to
+ // identify exactly which byte matched.
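+ //
+ // A rough scalar sketch of the same idea, for intuition only (the
+ // vector code below computes the syndrome 32 bytes at a time):
+ //
+ //	syndrome := uint64(0)
+ //	for i, x := range chunk { // chunk is the current 32-byte block
+ //		if x == c {
+ //			syndrome |= 1 << (2 * uint(i))
+ //		}
+ //	}
+ //	if syndrome != 0 {
+ //		return base + trailingZeros(syndrome)/2
+ //	}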
+
+ CBZ R2, fail
+ MOVD R0, R11 // save the data pointer to compute the result offset later
+ // Magic constant 0x40100401 allows us to identify
+ // which lane matches the requested byte.
+ // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
+ // Different bytes have different bit masks (i.e. 1, 4, 16, 64)
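+ // After the VAND below, each matching byte holds its mask; two
+ // pairwise VADDP additions then fold the 32 comparison bytes into
+ // 8 bytes, so each input byte maps to two syndrome bits: an even
+ // bit that is set on a match and an odd bit that stays zero.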
+ MOVD $0x40100401, R5
+ VMOV R1, V0.B16 // duplicate the target byte across all 16 lanes of V0
+ // Work with aligned 32-byte chunks
+ BIC $0x1f, R0, R3 // R3 = data rounded down to a 32-byte boundary
+ VMOV R5, V5.S4 // replicate the magic constant into every 32-bit lane of V5
+ ANDS $0x1f, R0, R9 // R9 = offset of the data within its 32-byte block
+ AND $0x1f, R2, R10 // R10 = length mod 32
+ BEQ loop // already 32-byte aligned
+
+ // Input string is not 32-byte aligned. We calculate the
+ // syndrome value for the aligned 32-byte block containing
+ // the first bytes and mask off the irrelevant part.
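+ // For example, if the data starts 5 bytes into its 32-byte block,
+ // the low 2*5 = 10 syndrome bits belong to bytes before the data
+ // and are cleared by the shift pair below.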
+ VLD1.P (R3), [V1.B16, V2.B16] // load the aligned 32-byte block, post-increment R3
+ SUB $0x20, R9, R4 // R4 = offset - 32 (negative)
+ ADDS R4, R2, R2 // R2 = length left after this block; sets flags for BLS below
+ VCMEQ V0.B16, V1.B16, V3.B16 // bytewise compare: 0xff where the byte matches
+ VCMEQ V0.B16, V2.B16, V4.B16
+ VAND V5.B16, V3.B16, V3.B16 // keep only the mask bits of matching bytes
+ VAND V5.B16, V4.B16, V4.B16
+ VADDP V4.B16, V3.B16, V6.B16 // 256->128
+ VADDP V6.B16, V6.B16, V6.B16 // 128->64
+ VMOV V6.D[0], R6
+ // Clear the irrelevant lower bits
+ LSL $1, R9, R4 // R4 = 2 * offset (two syndrome bits per byte)
+ LSR R4, R6, R6 // shift out the bits for bytes before the data...
+ LSL R4, R6, R6 // ...and shift back, leaving them zero
+ // The first block can also be the last
+ BLS masklast
+ // Have we found something already?
+ CBNZ R6, tail
+
loop:
- CMP R0, R1
- BEQ notfound
- MOVBU.P 1(R0), R3
- CMP R2, R3
- BNE loop
+ VLD1.P (R3), [V1.B16, V2.B16] // load the next 32 bytes, post-increment R3
+ SUBS $0x20, R2, R2 // 32 bytes consumed; flags used by BLS/BHS below
+ VCMEQ V0.B16, V1.B16, V3.B16
+ VCMEQ V0.B16, V2.B16, V4.B16
+ // If we're out of data, we finish regardless of the result
+ BLS end
+ // Use a fast check for the termination condition
+ VORR V4.B16, V3.B16, V6.B16 // a match in either half sets bits in V6
+ VADDP V6.D2, V6.D2, V6.D2 // fold 128 bits down to 64
+ VMOV V6.D[0], R6
+ // We're not out of data; loop if we haven't found the character
+ CBZ R6, loop
+
+end:
+ // Termination condition found, let's calculate the syndrome value
+ VAND V5.B16, V3.B16, V3.B16 // keep only the mask bits of matching bytes
+ VAND V5.B16, V4.B16, V4.B16
+ VADDP V4.B16, V3.B16, V6.B16 // 256->128
+ VADDP V6.B16, V6.B16, V6.B16 // 128->64
+ VMOV V6.D[0], R6
+ // Only clear the irrelevant bits if this is the last block and it
+ // holds fewer than 32 bytes of data.
+ // The condition flags come from the SUBS in the loop.
+ BHS tail
+
+masklast:
+ // Clear the irrelevant upper bits
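+ // For example, if the data ends 3 bytes into the final block, only
+ // the low 2*3 = 6 syndrome bits are valid and the top 58 bits are
+ // cleared by the shift pair below.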
+ ADD R9, R10, R4 // R4 = alignment offset + (length mod 32)
+ AND $0x1f, R4, R4 // offset of the end of the data within its 32-byte block
+ SUB $0x20, R4, R4 // R4 = end offset - 32 (negative, or -32 for a full block)
+ NEG R4<<1, R4 // R4 = 2 * (bytes past the end of the data)
+ LSL R4, R6, R6 // shift those bits out the top...
+ LSR R4, R6, R6 // ...and shift back, clearing them
- SUB $1, R0 // R0 will be one beyond the position we want
- SUB R4, R0 // remove base
- MOVD R0, ret+24(FP)
+tail:
+ // Check that we have found a character
+ CBZ R6, fail
+ // Count the trailing zeros by reversing the bits...
+ RBIT R6, R6
+ // Compensate for the final post-increment of R3
+ SUB $0x20, R3, R3
+ // ...and counting the leading zeros of the reversed value
+ CLZ R6, R6
+ // R6 is twice the byte offset into the 32-byte fragment
+ ADD R6>>1, R3, R0 // R0 = address of the matching byte
+ // Convert the address into an offset from the start of the data
+ SUB R11, R0, R0
+ MOVD R0, (R8)
RET
-notfound:
+fail:
MOVD $-1, R0
- MOVD R0, ret+24(FP)
+ MOVD R0, (R8)
RET
// Equal(a, b []byte) bool
// void runtime·memclrNoHeapPointers(void*, uintptr)
TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
- MOVD ptr+0(FP), R3
- MOVD n+8(FP), R4
- // TODO(mwhudson): this is written this way to avoid tickling
- // warnings from addpool when written as AND $7, R4, R6 (see
- // https://golang.org/issue/12708)
- AND $~7, R4, R5 // R5 is N&~7
- SUB R5, R4, R6 // R6 is N&7
-
- CMP $0, R5
- BEQ nowords
-
- ADD R3, R5, R5
-
-wordloop: // TODO: Optimize for unaligned ptr.
- MOVD.P $0, 8(R3)
- CMP R3, R5
- BNE wordloop
-nowords:
- CMP $0, R6
- BEQ done
-
- ADD R3, R6, R6
-
-byteloop:
- MOVBU.P $0, 1(R3)
- CMP R3, R6
- BNE byteloop
-done:
+ MOVD ptr+0(FP), R0
+ MOVD n+8(FP), R1
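+ // Strategy: buffers shorter than 16 bytes are zeroed directly by
+ // tail_zero; longer buffers are first brought to 16-byte alignment,
+ // zeroed 16 bytes at a time with STP, and any remainder (< 16 bytes)
+ // is finished by tail_zero.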
+ // If size is less than 16 bytes, use tail_zero to zero what remains
+ CMP $16, R1
+ BLT tail_zero
+ // If the buffer is not 16-byte aligned, align it first for better performance
+ ANDS $15, R0, ZR // test the low four bits of ptr; keep only the flags
+ BNE unaligned_to_16
+aligned_to_16:
+ LSR $4, R1, R2 // R2 = number of 16-byte chunks
+zero_by_16:
+ STP.P (ZR, ZR), 16(R0) // store 16 zero bytes, post-increment R0
+ SUBS $1, R2, R2
+ BNE zero_by_16
+
+ ANDS $15, R1, R1 // R1 = remaining bytes (< 16)
+ BEQ ending
+
+ // Zero buffer with size=R1 < 16
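+ // Each set bit of the remaining length selects one store:
+ // bit 3 -> 8 bytes, bit 2 -> 4 bytes, bit 1 -> 2 bytes, bit 0 -> 1 byte.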
+tail_zero:
+ TBZ $3, R1, tail_zero_4
+ MOVD.P ZR, 8(R0)
+
+tail_zero_4:
+ TBZ $2, R1, tail_zero_2
+ MOVW.P ZR, 4(R0)
+
+tail_zero_2:
+ TBZ $1, R1, tail_zero_1
+ MOVH.P ZR, 2(R0)
+
+tail_zero_1:
+ TBZ $0, R1, ending
+ MOVB ZR, (R0)
+
+ending:
RET
+
+unaligned_to_16:
+ MOVD R0, R2 // remember the starting pointer
+head_loop:
+ MOVBU.P ZR, 1(R0) // zero one byte, post-increment
+ ANDS $15, R0, ZR // until the pointer is 16-byte aligned
+ BNE head_loop
+ // Adjust the length for what remains
+ SUB R2, R0, R3 // R3 = number of bytes already zeroed
+ SUB R3, R1 // R1 = remaining length
+ // If size is less than 16 bytes, use tail_zero to zero what remains
+ CMP $16, R1
+ BLT tail_zero
+ B aligned_to_16