@@ -0,0 +1,722 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in encode_other.go, except
+// where marked with a "!!!".
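+//
+// The little-endian load helpers referenced in the comments below, load32 and
+// load64, look roughly like this in the pure Go code (a sketch, not the
+// verbatim encode_other.go source):
+//
+//    func load32(b []byte, i int) uint32 {
+//        b = b[i : i+4]
+//        return uint32(b[0]) | uint32(b[1])<<8 |
+//            uint32(b[2])<<16 | uint32(b[3])<<24
+//    }
+//
+// load64 is the analogous little-endian load of 8 bytes into a uint64.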
+
+// ----------------------------------------------------------------------------
+
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+// - R3 len(lit)
+// - R4 n
+// - R6 return value
+// - R8 &dst[i]
+// - R10 &lit[0]
+//
+// The 32 bytes of stack space is to call runtime·memmove.
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
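+//
+// For orientation, a rough Go sketch of the encoding below (this mirrors the
+// shape of emitLiteral in encode_other.go rather than quoting it; the literal
+// tag 0x00 is folded into the constants):
+//
+//    func emitLiteral(dst, lit []byte) int {
+//        i, n := 0, uint(len(lit)-1)
+//        switch {
+//        case n < 60:
+//            dst[0] = uint8(n) << 2
+//            i = 1
+//        case n < 1<<8:
+//            dst[0] = 60 << 2 // 0xf0
+//            dst[1] = uint8(n)
+//            i = 2
+//        default:
+//            dst[0] = 61 << 2 // 0xf4
+//            dst[1] = uint8(n)
+//            dst[2] = uint8(n >> 8)
+//            i = 3
+//        }
+//        return i + copy(dst[i:], lit)
+//    }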
+TEXT ·emitLiteral(SB), NOSPLIT, $32-56
+ MOVD dst_base+0(FP), R8
+ MOVD lit_base+24(FP), R10
+ MOVD lit_len+32(FP), R3
+ MOVD R3, R6
+ MOVW R3, R4
+ SUBW $1, R4, R4
+
+ CMPW $60, R4
+ BLT oneByte
+ CMPW $256, R4
+ BLT twoBytes
+
+threeBytes:
+ MOVD $0xf4, R2
+ MOVB R2, 0(R8)
+ MOVW R4, 1(R8)
+ ADD $3, R8, R8
+ ADD $3, R6, R6
+ B memmove
+
+twoBytes:
+ MOVD $0xf0, R2
+ MOVB R2, 0(R8)
+ MOVB R4, 1(R8)
+ ADD $2, R8, R8
+ ADD $2, R6, R6
+ B memmove
+
+oneByte:
+ LSLW $2, R4, R4
+ MOVB R4, 0(R8)
+ ADD $1, R8, R8
+ ADD $1, R6, R6
+
+memmove:
+ MOVD R6, ret+48(FP)
+
+ // copy(dst[i:], lit)
+ //
+ // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+ // R8, R10 and R3 as arguments.
+ MOVD R8, 8(RSP)
+ MOVD R10, 16(RSP)
+ MOVD R3, 24(RSP)
+ CALL runtime·memmove(SB)
+ RET
+
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// All local variables fit into registers. The register allocation:
+// - R3 length
+// - R7 &dst[0]
+// - R8 &dst[i]
+// - R11 offset
+//
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
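+//
+// For orientation, a rough Go sketch of the encoding below (this mirrors the
+// shape of emitCopy in encode_other.go rather than quoting it; the copy-1 tag
+// 0x01 and copy-2 tag 0x02 are folded into the constants):
+//
+//    func emitCopy(dst []byte, offset, length int) int {
+//        i := 0
+//        for length >= 68 {
+//            // Emit a length 64 copy, encoded as 3 bytes.
+//            dst[i+0] = 63<<2 | 2 // 0xfe
+//            dst[i+1] = uint8(offset)
+//            dst[i+2] = uint8(offset >> 8)
+//            i, length = i+3, length-64
+//        }
+//        if length > 64 {
+//            // Emit a length 60 copy, encoded as 3 bytes.
+//            dst[i+0] = 59<<2 | 2 // 0xee
+//            dst[i+1] = uint8(offset)
+//            dst[i+2] = uint8(offset >> 8)
+//            i, length = i+3, length-60
+//        }
+//        if length >= 12 || offset >= 2048 {
+//            // Emit the remaining copy, encoded as 3 bytes.
+//            dst[i+0] = uint8(length-1)<<2 | 2
+//            dst[i+1] = uint8(offset)
+//            dst[i+2] = uint8(offset >> 8)
+//            return i + 3
+//        }
+//        // Emit the remaining copy, encoded as 2 bytes.
+//        dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | 1
+//        dst[i+1] = uint8(offset)
+//        return i + 2
+//    }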
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+ MOVD dst_base+0(FP), R8
+ MOVD R8, R7
+ MOVD offset+24(FP), R11
+ MOVD length+32(FP), R3
+
+loop0:
+ // for length >= 68 { etc }
+ CMPW $68, R3
+ BLT step1
+
+ // Emit a length 64 copy, encoded as 3 bytes.
+ MOVD $0xfe, R2
+ MOVB R2, 0(R8)
+ MOVW R11, 1(R8)
+ ADD $3, R8, R8
+ SUB $64, R3, R3
+ B loop0
+
+step1:
+ // if length > 64 { etc }
+ CMP $64, R3
+ BLE step2
+
+ // Emit a length 60 copy, encoded as 3 bytes.
+ MOVD $0xee, R2
+ MOVB R2, 0(R8)
+ MOVW R11, 1(R8)
+ ADD $3, R8, R8
+ SUB $60, R3, R3
+
+step2:
+ // if length >= 12 || offset >= 2048 { goto step3 }
+ CMP $12, R3
+ BGE step3
+ CMPW $2048, R11
+ BGE step3
+
+ // Emit the remaining copy, encoded as 2 bytes.
+ MOVB R11, 1(R8)
+ LSRW $3, R11, R11
+ AND $0xe0, R11, R11
+ SUB $4, R3, R3
+ LSLW $2, R3
+ AND $0xff, R3, R3
+ ORRW R3, R11, R11
+ ORRW $1, R11, R11
+ MOVB R11, 0(R8)
+ ADD $2, R8, R8
+
+ // Return the number of bytes written.
+ SUB R7, R8, R8
+ MOVD R8, ret+40(FP)
+ RET
+
+step3:
+ // Emit the remaining copy, encoded as 3 bytes.
+ SUB $1, R3, R3
+ AND $0xff, R3, R3
+ LSLW $2, R3, R3
+ ORRW $2, R3, R3
+ MOVB R3, 0(R8)
+ MOVW R11, 1(R8)
+ ADD $3, R8, R8
+
+ // Return the number of bytes written.
+ SUB R7, R8, R8
+ MOVD R8, ret+40(FP)
+ RET
+
+// ----------------------------------------------------------------------------
+
+// func extendMatch(src []byte, i, j int) int
+//
+// All local variables fit into registers. The register allocation:
+// - R6 &src[0]
+// - R7 &src[j]
+// - R13 &src[len(src) - 8]
+// - R14 &src[len(src)]
+// - R15 &src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
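+//
+// The pure Go equivalent is essentially (a sketch):
+//
+//    // extendMatch returns the largest k such that k <= len(src) and that
+//    // src[i:i+k-j] and src[j:k] have the same contents.
+//    func extendMatch(src []byte, i, j int) int {
+//        for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
+//        }
+//        return j
+//    }
+//
+// The asm below does the same walk 8 bytes at a time while at least 8 bytes
+// of src remain, falling back to byte-at-a-time comparisons near the end.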
+TEXT ·extendMatch(SB), NOSPLIT, $0-48
+ MOVD src_base+0(FP), R6
+ MOVD src_len+8(FP), R14
+ MOVD i+24(FP), R15
+ MOVD j+32(FP), R7
+ ADD R6, R14, R14
+ ADD R6, R15, R15
+ ADD R6, R7, R7
+ MOVD R14, R13
+ SUB $8, R13, R13
+
+cmp8:
+ // As long as we are 8 or more bytes before the end of src, we can load and
+ // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+ CMP R13, R7
+ BHI cmp1
+ MOVD (R15), R3
+ MOVD (R7), R4
+ CMP R4, R3
+ BNE bsf
+ ADD $8, R15, R15
+ ADD $8, R7, R7
+ B cmp8
+
+bsf:
+ // If those 8 bytes were not equal, XOR the two 8 byte values, and return
+ // the index of the first byte that differs.
+ // RBIT reverses the bit order and CLZ then counts the leading zeros; the
+ // combination finds the least significant set bit.
+ // The arm64 architecture is little-endian, and the shift by 3 converts
+ // a bit index to a byte index.
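+ // In Go terms (a sketch), the byte index of the first difference between the
+ // two loaded words a and b is bits.TrailingZeros64(a^b) >> 3, which is
+ // exactly what the RBIT + CLZ pair computes.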
+ EOR R3, R4, R4
+ RBIT R4, R4
+ CLZ R4, R4
+ ADD R4>>3, R7, R7
+
+ // Convert from &src[ret] to ret.
+ SUB R6, R7, R7
+ MOVD R7, ret+40(FP)
+ RET
+
+cmp1:
+ // In src's tail, compare 1 byte at a time.
+ CMP R7, R14
+ BLS extendMatchEnd
+ MOVB (R15), R3
+ MOVB (R7), R4
+ CMP R4, R3
+ BNE extendMatchEnd
+ ADD $1, R15, R15
+ ADD $1, R7, R7
+ B cmp1
+
+extendMatchEnd:
+ // Convert from &src[ret] to ret.
+ SUB R6, R7, R7
+ MOVD R7, ret+40(FP)
+ RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+// - R3   .    .
+// - R4   .    .
+// - R5   64   shift
+// - R6   72   &src[0], tableSize
+// - R7   80   &src[s]
+// - R8   88   &dst[d]
+// - R9   96   sLimit
+// - R10  .    &src[nextEmit]
+// - R11  104  prevHash, currHash, nextHash, offset
+// - R12  112  &src[base], skip
+// - R13  .    &src[nextS], &src[len(src) - 8]
+// - R14  .    len(src), bytesBetweenHashLookups, &src[len(src)], x
+// - R15  120  candidate
+// - R16  .    hash constant, 0x1e35a7bd
+// - R17  .    &table
+// - .    128  table
+//
+// The second column (64, 72, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls, gives 32768 + 64 + 64 = 32896.
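+//
+// In other words, the 32896-byte frame is laid out roughly as (a sketch,
+// offsets in bytes from RSP):
+// - [0, 64)       outgoing arguments for runtime·memmove and ·emitLiteral
+// - [64, 128)     spill slots, at the offsets listed in the table above
+// - [128, 32896)  table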
+TEXT ·encodeBlock(SB), 0, $32896-56
+ MOVD dst_base+0(FP), R8
+ MOVD src_base+24(FP), R7
+ MOVD src_len+32(FP), R14
+
+ // shift, tableSize := uint32(32-8), 1<<8
+ MOVD $24, R5
+ MOVD $256, R6
+ MOVW $0xa7bd, R16
+ MOVKW $(0x1e35<<16), R16
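+
+ // R16 now holds the hash multiplier 0x1e35a7bd. The hash function used
+ // throughout is, in Go terms (a sketch):
+ //
+ //    func hash(u, shift uint32) uint32 {
+ //        return (u * 0x1e35a7bd) >> shift
+ //    }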
+
+calcShift:
+ // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+ // shift--
+ // }
+ MOVD $16384, R2
+ CMP R2, R6
+ BGE varTable
+ CMP R14, R6
+ BGE varTable
+ SUB $1, R5, R5
+ LSL $1, R6, R6
+ B calcShift
+
+varTable:
+ // var table [maxTableSize]uint16
+ //
+ // In the asm code, unlike the Go code, we can zero-initialize only the
+ // first tableSize elements. Each uint16 element is 2 bytes and each
+ // iteration of the loop below writes 64 bytes, so we can do only
+ // tableSize/32 iterations instead of the 512 that would zero-initialize
+ // all of table's 32768 bytes. This clear could overrun the first tableSize
+ // elements, but it won't overrun the allocated stack size.
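+ //
+ // For example, the minimum tableSize of 256 needs only 256/32 = 8
+ // iterations, while the maximum tableSize of 16384 needs the full
+ // 16384/32 = 512.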
+ ADD $128, RSP, R17
+ MOVD R17, R4
+
+ // !!! R6 = &table[tableSize]
+ ADD R6<<1, R17, R6
+
+memclr:
+ STP.P (ZR, ZR), 64(R4)
+ STP (ZR, ZR), -48(R4)
+ STP (ZR, ZR), -32(R4)
+ STP (ZR, ZR), -16(R4)
+ CMP R4, R6
+ BHI memclr
+
+ // !!! R6 = &src[0]
+ MOVD R7, R6
+
+ // sLimit := len(src) - inputMargin
+ MOVD R14, R9
+ SUB $15, R9, R9
+
+ // !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
+ // change for the rest of the function.
+ MOVD R5, 64(RSP)
+ MOVD R6, 72(RSP)
+ MOVD R9, 96(RSP)
+
+ // nextEmit := 0
+ MOVD R6, R10
+
+ // s := 1
+ ADD $1, R7, R7
+
+ // nextHash := hash(load32(src, s), shift)
+ MOVW 0(R7), R11
+ MULW R16, R11, R11
+ LSRW R5, R11, R11
+
+outer:
+ // for { etc }
+
+ // skip := 32
+ MOVD $32, R12
+
+ // nextS := s
+ MOVD R7, R13
+
+ // candidate := 0
+ MOVD $0, R15
+
+inner0:
+ // for { etc }
+
+ // s := nextS
+ MOVD R13, R7
+
+ // bytesBetweenHashLookups := skip >> 5
+ MOVD R12, R14
+ LSR $5, R14, R14
+
+ // nextS = s + bytesBetweenHashLookups
+ ADD R14, R13, R13
+
+ // skip += bytesBetweenHashLookups
+ ADD R14, R12, R12
+
+ // if nextS > sLimit { goto emitRemainder }
+ MOVD R13, R3
+ SUB R6, R3, R3
+ CMP R9, R3
+ BHI emitRemainder
+
+ // candidate = int(table[nextHash])
+ MOVHU 0(R17)(R11<<1), R15
+
+ // table[nextHash] = uint16(s)
+ MOVD R7, R3
+ SUB R6, R3, R3
+
+ MOVH R3, 0(R17)(R11<<1)
+
+ // nextHash = hash(load32(src, nextS), shift)
+ MOVW 0(R13), R11
+ MULW R16, R11
+ LSRW R5, R11, R11
+
+ // if load32(src, s) != load32(src, candidate) { continue } break
+ MOVW 0(R7), R3
+ MOVW (R6)(R15*1), R4
+ CMPW R4, R3
+ BNE inner0
+
+fourByteMatch:
+ // As per the encode_other.go code:
+ //
+ // A 4-byte match has been found. We'll later see etc.
+
+ // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+ // on inputMargin in encode.go.
+ MOVD R7, R3
+ SUB R10, R3, R3
+ CMP $16, R3
+ BLE emitLiteralFastPath
+
+ // ----------------------------------------
+ // Begin inline of the emitLiteral call.
+ //
+ // d += emitLiteral(dst[d:], src[nextEmit:s])
+
+ MOVW R3, R4
+ SUBW $1, R4, R4
+
+ MOVW $60, R2
+ CMPW R2, R4
+ BLT inlineEmitLiteralOneByte
+ MOVW $256, R2
+ CMPW R2, R4
+ BLT inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+ MOVD $0xf4, R1
+ MOVB R1, 0(R8)
+ MOVW R4, 1(R8)
+ ADD $3, R8, R8
+ B inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+ MOVD $0xf0, R1
+ MOVB R1, 0(R8)
+ MOVB R4, 1(R8)
+ ADD $2, R8, R8
+ B inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+ LSLW $2, R4, R4
+ MOVB R4, 0(R8)
+ ADD $1, R8, R8
+
+inlineEmitLiteralMemmove:
+ // Spill local variables (registers) onto the stack; call; unspill.
+ //
+ // copy(dst[i:], lit)
+ //
+ // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+ // R8, R10 and R3 as arguments.
+ MOVD R8, 8(RSP)
+ MOVD R10, 16(RSP)
+ MOVD R3, 24(RSP)
+
+ // Finish the "d +=" part of "d += emitLiteral(etc)".
+ ADD R3, R8, R8
+ MOVD R7, 80(RSP)
+ MOVD R8, 88(RSP)
+ MOVD R15, 120(RSP)
+ CALL runtime·memmove(SB)
+ MOVD 64(RSP), R5
+ MOVD 72(RSP), R6
+ MOVD 80(RSP), R7
+ MOVD 88(RSP), R8
+ MOVD 96(RSP), R9
+ MOVD 120(RSP), R15
+ ADD $128, RSP, R17
+ MOVW $0xa7bd, R16
+ MOVKW $(0x1e35<<16), R16
+ B inner1
+
+inlineEmitLiteralEnd:
+ // End inline of the emitLiteral call.
+ // ----------------------------------------
+
+emitLiteralFastPath:
+ // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+ MOVB R3, R4
+ SUBW $1, R4, R4
+ AND $0xff, R4, R4
+ LSLW $2, R4, R4
+ MOVB R4, (R8)
+ ADD $1, R8, R8
+
+ // !!! Implement the copy from lit to dst as a 16-byte load and store.
+ // (Encode's documentation says that dst and src must not overlap.)
+ //
+ // This always copies 16 bytes, instead of only len(lit) bytes, but that's
+ // OK. Subsequent iterations will fix up the overrun.
+ //
+ // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
+ // 16-byte loads and stores. This technique probably wouldn't be as
+ // effective on architectures that are fussier about alignment.
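+ //
+ // In Go terms (a sketch), this step is roughly
+ //    copy(dst[d:d+16], src[nextEmit:nextEmit+16])
+ // even though only the first s-nextEmit <= 16 bytes are actually needed.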
+ LDP 0(R10), (R0, R1)
+ STP (R0, R1), 0(R8)
+ ADD R3, R8, R8
+
+inner1:
+ // for { etc }
+
+ // base := s
+ MOVD R7, R12
+
+ // !!! offset := base - candidate
+ MOVD R12, R11
+ SUB R15, R11, R11
+ SUB R6, R11, R11
+
+ // ----------------------------------------
+ // Begin inline of the extendMatch call.
+ //
+ // s = extendMatch(src, candidate+4, s+4)
+
+ // !!! R14 = &src[len(src)]
+ MOVD src_len+32(FP), R14
+ ADD R6, R14, R14
+
+ // !!! R13 = &src[len(src) - 8]
+ MOVD R14, R13
+ SUB $8, R13, R13
+
+ // !!! R15 = &src[candidate + 4]
+ ADD $4, R15, R15
+ ADD R6, R15, R15
+
+ // !!! s += 4
+ ADD $4, R7, R7
+
+inlineExtendMatchCmp8:
+ // As long as we are 8 or more bytes before the end of src, we can load and
+ // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+ CMP R13, R7
+ BHI inlineExtendMatchCmp1
+ MOVD (R15), R3
+ MOVD (R7), R4
+ CMP R4, R3
+ BNE inlineExtendMatchBSF
+ ADD $8, R15, R15
+ ADD $8, R7, R7
+ B inlineExtendMatchCmp8
+
+inlineExtendMatchBSF:
+ // If those 8 bytes were not equal, XOR the two 8 byte values, and return
+ // the index of the first byte that differs.
+ // RBIT reverses the bit order and CLZ then counts the leading zeros; the
+ // combination finds the least significant set bit.
+ // The arm64 architecture is little-endian, and the shift by 3 converts
+ // a bit index to a byte index.
+ EOR R3, R4, R4
+ RBIT R4, R4
+ CLZ R4, R4
+ ADD R4>>3, R7, R7
+ B inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+ // In src's tail, compare 1 byte at a time.
+ CMP R7, R14
+ BLS inlineExtendMatchEnd
+ MOVB (R15), R3
+ MOVB (R7), R4
+ CMP R4, R3
+ BNE inlineExtendMatchEnd
+ ADD $1, R15, R15
+ ADD $1, R7, R7
+ B inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+ // End inline of the extendMatch call.
+ // ----------------------------------------
+
+ // ----------------------------------------
+ // Begin inline of the emitCopy call.
+ //
+ // d += emitCopy(dst[d:], base-candidate, s-base)
+
+ // !!! length := s - base
+ MOVD R7, R3
+ SUB R12, R3, R3
+
+inlineEmitCopyLoop0:
+ // for length >= 68 { etc }
+ MOVW $68, R2
+ CMPW R2, R3
+ BLT inlineEmitCopyStep1
+
+ // Emit a length 64 copy, encoded as 3 bytes.
+ MOVD $0xfe, R1
+ MOVB R1, 0(R8)
+ MOVW R11, 1(R8)
+ ADD $3, R8, R8
+ SUBW $64, R3, R3
+ B inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+ // if length > 64 { etc }
+ MOVW $64, R2
+ CMPW R2, R3
+ BLE inlineEmitCopyStep2
+
+ // Emit a length 60 copy, encoded as 3 bytes.
+ MOVD $0xee, R1
+ MOVB R1, 0(R8)
+ MOVW R11, 1(R8)
+ ADD $3, R8, R8
+ SUBW $60, R3, R3
+
+inlineEmitCopyStep2:
+ // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+ MOVW $12, R2
+ CMPW R2, R3
+ BGE inlineEmitCopyStep3
+ MOVW $2048, R2
+ CMPW R2, R11
+ BGE inlineEmitCopyStep3
+
+ // Emit the remaining copy, encoded as 2 bytes.
+ MOVB R11, 1(R8)
+ LSRW $8, R11, R11
+ LSLW $5, R11, R11
+ SUBW $4, R3, R3
+ AND $0xff, R3, R3
+ LSLW $2, R3, R3
+ ORRW R3, R11, R11
+ ORRW $1, R11, R11
+ MOVB R11, 0(R8)
+ ADD $2, R8, R8
+ B inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+ // Emit the remaining copy, encoded as 3 bytes.
+ SUBW $1, R3, R3
+ LSLW $2, R3, R3
+ ORRW $2, R3, R3
+ MOVB R3, 0(R8)
+ MOVW R11, 1(R8)
+ ADD $3, R8, R8
+
+inlineEmitCopyEnd:
+ // End inline of the emitCopy call.
+ // ----------------------------------------
+
+ // nextEmit = s
+ MOVD R7, R10
+
+ // if s >= sLimit { goto emitRemainder }
+ MOVD R7, R3
+ SUB R6, R3, R3
+ CMP R3, R9
+ BLS emitRemainder
+
+ // As per the encode_other.go code:
+ //
+ // We could immediately etc.
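+ //
+ // Roughly, the corresponding pure Go is (a sketch; see encode_other.go for
+ // the full comment and code):
+ //
+ //    x := load64(src, s-1)
+ //    prevHash := hash(uint32(x>>0), shift)
+ //    table[prevHash] = uint16(s - 1)
+ //    currHash := hash(uint32(x>>8), shift)
+ //    candidate = int(table[currHash])
+ //    table[currHash] = uint16(s)
+ //    if uint32(x>>8) != load32(src, candidate) {
+ //        nextHash = hash(uint32(x>>16), shift)
+ //        s++
+ //        break
+ //    }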
+
+ // x := load64(src, s-1)
+ MOVD -1(R7), R14
+
+ // prevHash := hash(uint32(x>>0), shift)
+ MOVW R14, R11
+ MULW R16, R11, R11
+ LSRW R5, R11, R11
+
+ // table[prevHash] = uint16(s-1)
+ MOVD R7, R3
+ SUB R6, R3, R3
+ SUB $1, R3, R3
+
+ MOVHU R3, 0(R17)(R11<<1)
+
+ // currHash := hash(uint32(x>>8), shift)
+ LSR $8, R14, R14
+ MOVW R14, R11
+ MULW R16, R11, R11
+ LSRW R5, R11, R11
+
+ // candidate = int(table[currHash])
+ MOVHU 0(R17)(R11<<1), R15
+
+ // table[currHash] = uint16(s)
+ ADD $1, R3, R3
+ MOVHU R3, 0(R17)(R11<<1)
+
+ // if uint32(x>>8) == load32(src, candidate) { continue }
+ MOVW (R6)(R15*1), R4
+ CMPW R4, R14
+ BEQ inner1
+
+ // nextHash = hash(uint32(x>>16), shift)
+ LSR $8, R14, R14
+ MOVW R14, R11
+ MULW R16, R11, R11
+ LSRW R5, R11, R11
+
+ // s++
+ ADD $1, R7, R7
+
+ // break out of the inner1 for loop, i.e. continue the outer loop.
+ B outer
+
+emitRemainder:
+ // if nextEmit < len(src) { etc }
+ MOVD src_len+32(FP), R3
+ ADD R6, R3, R3
+ CMP R3, R10
+ BEQ encodeBlockEnd
+
+ // d += emitLiteral(dst[d:], src[nextEmit:])
+ //
+ // Push args.
+ MOVD R8, 8(RSP)
+ MOVD $0, 16(RSP) // Unnecessary, as the callee ignores it, but conservative.
+ MOVD $0, 24(RSP) // Unnecessary, as the callee ignores it, but conservative.
+ MOVD R10, 32(RSP)
+ SUB R10, R3, R3
+ MOVD R3, 40(RSP)
+ MOVD R3, 48(RSP) // Unnecessary, as the callee ignores it, but conservative.
+
+ // Spill local variables (registers) onto the stack; call; unspill.
+ MOVD R8, 88(RSP)
+ CALL ·emitLiteral(SB)
+ MOVD 88(RSP), R8
+
+ // Finish the "d +=" part of "d += emitLiteral(etc)".
+ MOVD 56(RSP), R1
+ ADD R1, R8, R8
+
+encodeBlockEnd:
+ MOVD dst_base+0(FP), R3
+ SUB R3, R8, R8
+ MOVD R8, d+48(FP)
+ RET