|
@@ -0,0 +1,4100 @@
|
|
|
|
+// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
|
|
|
|
+
|
|
|
|
+//go:build !appengine && !noasm && gc && !noasm
|
|
|
|
+// +build !appengine,!noasm,gc,!noasm
|
|
|
|
+
|
|
|
|
+// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
|
|
|
|
+// Requires: CMOV
|
|
|
|
+TEXT ·sequenceDecs_decode_amd64(SB), $8-32
|
|
|
|
+ MOVQ br+8(FP), AX
|
|
|
|
+ MOVQ 32(AX), DX
|
|
|
|
+ MOVBQZX 40(AX), BX
|
|
|
|
+ MOVQ 24(AX), SI
|
|
|
|
+ MOVQ (AX), AX
|
|
|
|
+ ADDQ SI, AX
|
|
|
|
+ MOVQ AX, (SP)
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 72(AX), DI
|
|
|
|
+ MOVQ 80(AX), R8
|
|
|
|
+ MOVQ 88(AX), R9
|
|
|
|
+ MOVQ 104(AX), R10
|
|
|
|
+ MOVQ s+0(FP), AX
|
|
|
|
+ MOVQ 144(AX), R11
|
|
|
|
+ MOVQ 152(AX), R12
|
|
|
|
+ MOVQ 160(AX), R13
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_main_loop:
|
|
|
|
+ MOVQ (SP), R14
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the offset and match length.
|
|
|
|
+ CMPQ SI, $0x08
|
|
|
|
+ JL sequenceDecs_decode_amd64_fill_byte_by_byte
|
|
|
|
+ MOVQ BX, AX
|
|
|
|
+ SHRQ $0x03, AX
|
|
|
|
+ SUBQ AX, R14
|
|
|
|
+ MOVQ (R14), DX
|
|
|
|
+ SUBQ AX, SI
|
|
|
|
+ ANDQ $0x07, BX
|
|
|
|
+ JMP sequenceDecs_decode_amd64_fill_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_fill_byte_by_byte:
|
|
|
|
+ CMPQ SI, $0x00
|
|
|
|
+ JLE sequenceDecs_decode_amd64_fill_end
|
|
|
|
+ CMPQ BX, $0x07
|
|
|
|
+ JLE sequenceDecs_decode_amd64_fill_end
|
|
|
|
+ SHLQ $0x08, DX
|
|
|
|
+ SUBQ $0x01, R14
|
|
|
|
+ SUBQ $0x01, SI
|
|
|
|
+ SUBQ $0x08, BX
|
|
|
|
+ MOVBQZX (R14), AX
|
|
|
|
+ ORQ AX, DX
|
|
|
|
+ JMP sequenceDecs_decode_amd64_fill_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_fill_end:
|
|
|
|
+ // Update offset
|
|
|
|
+ MOVQ R9, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ SHLQ CL, R15
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decode_amd64_of_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decode_amd64_of_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decode_amd64_of_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R15
|
|
|
|
+ ADDQ R15, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_of_update_zero:
|
|
|
|
+ MOVQ AX, 16(R10)
|
|
|
|
+
|
|
|
|
+ // Update match length
|
|
|
|
+ MOVQ R8, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ SHLQ CL, R15
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decode_amd64_ml_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decode_amd64_ml_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decode_amd64_ml_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R15
|
|
|
|
+ ADDQ R15, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_ml_update_zero:
|
|
|
|
+ MOVQ AX, 8(R10)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the remaining
|
|
|
|
+ CMPQ SI, $0x08
|
|
|
|
+ JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
|
|
|
|
+ MOVQ BX, AX
|
|
|
|
+ SHRQ $0x03, AX
|
|
|
|
+ SUBQ AX, R14
|
|
|
|
+ MOVQ (R14), DX
|
|
|
|
+ SUBQ AX, SI
|
|
|
|
+ ANDQ $0x07, BX
|
|
|
|
+ JMP sequenceDecs_decode_amd64_fill_2_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_fill_2_byte_by_byte:
|
|
|
|
+ CMPQ SI, $0x00
|
|
|
|
+ JLE sequenceDecs_decode_amd64_fill_2_end
|
|
|
|
+ CMPQ BX, $0x07
|
|
|
|
+ JLE sequenceDecs_decode_amd64_fill_2_end
|
|
|
|
+ SHLQ $0x08, DX
|
|
|
|
+ SUBQ $0x01, R14
|
|
|
|
+ SUBQ $0x01, SI
|
|
|
|
+ SUBQ $0x08, BX
|
|
|
|
+ MOVBQZX (R14), AX
|
|
|
|
+ ORQ AX, DX
|
|
|
|
+ JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_fill_2_end:
|
|
|
|
+ // Update literal length
|
|
|
|
+ MOVQ DI, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ SHLQ CL, R15
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decode_amd64_ll_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decode_amd64_ll_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decode_amd64_ll_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R15
|
|
|
|
+ ADDQ R15, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_ll_update_zero:
|
|
|
|
+ MOVQ AX, (R10)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader for state updates
|
|
|
|
+ MOVQ R14, (SP)
|
|
|
|
+ MOVQ R9, AX
|
|
|
|
+ SHRQ $0x08, AX
|
|
|
|
+ MOVBQZX AL, AX
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ CMPQ 96(CX), $0x00
|
|
|
|
+ JZ sequenceDecs_decode_amd64_skip_update
|
|
|
|
+
|
|
|
|
+ // Update Literal Length State
|
|
|
|
+ MOVBQZX DI, R14
|
|
|
|
+ SHRQ $0x10, DI
|
|
|
|
+ MOVWQZX DI, DI
|
|
|
|
+ LEAQ (BX)(R14*1), CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ MOVL $0x00000001, BP
|
|
|
|
+ MOVB R14, CL
|
|
|
|
+ SHLL CL, BP
|
|
|
|
+ DECL BP
|
|
|
|
+ ANDQ BP, R15
|
|
|
|
+ ADDQ R15, DI
|
|
|
|
+
|
|
|
|
+ // Load ctx.llTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ MOVQ (CX)(DI*8), DI
|
|
|
|
+
|
|
|
|
+ // Update Match Length State
|
|
|
|
+ MOVBQZX R8, R14
|
|
|
|
+ SHRQ $0x10, R8
|
|
|
|
+ MOVWQZX R8, R8
|
|
|
|
+ LEAQ (BX)(R14*1), CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ MOVL $0x00000001, BP
|
|
|
|
+ MOVB R14, CL
|
|
|
|
+ SHLL CL, BP
|
|
|
|
+ DECL BP
|
|
|
|
+ ANDQ BP, R15
|
|
|
|
+ ADDQ R15, R8
|
|
|
|
+
|
|
|
|
+ // Load ctx.mlTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 24(CX), CX
|
|
|
|
+ MOVQ (CX)(R8*8), R8
|
|
|
|
+
|
|
|
|
+ // Update Offset State
|
|
|
|
+ MOVBQZX R9, R14
|
|
|
|
+ SHRQ $0x10, R9
|
|
|
|
+ MOVWQZX R9, R9
|
|
|
|
+ LEAQ (BX)(R14*1), CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ MOVL $0x00000001, BP
|
|
|
|
+ MOVB R14, CL
|
|
|
|
+ SHLL CL, BP
|
|
|
|
+ DECL BP
|
|
|
|
+ ANDQ BP, R15
|
|
|
|
+ ADDQ R15, R9
|
|
|
|
+
|
|
|
|
+ // Load ctx.ofTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 48(CX), CX
|
|
|
|
+ MOVQ (CX)(R9*8), R9
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_skip_update:
|
|
|
|
+ // Adjust offset
|
|
|
|
+ MOVQ 16(R10), CX
|
|
|
|
+ CMPQ AX, $0x01
|
|
|
|
+ JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
|
|
|
|
+ MOVQ R12, R13
|
|
|
|
+ MOVQ R11, R12
|
|
|
|
+ MOVQ CX, R11
|
|
|
|
+ JMP sequenceDecs_decode_amd64_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
|
|
|
|
+ CMPQ (R10), $0x00000000
|
|
|
|
+ JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
|
|
|
|
+ INCQ CX
|
|
|
|
+ JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_adjust_offset_maybezero:
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
|
|
|
|
+ MOVQ R11, CX
|
|
|
|
+ JMP sequenceDecs_decode_amd64_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_adjust_offset_nonzero:
|
|
|
|
+ CMPQ CX, $0x01
|
|
|
|
+ JB sequenceDecs_decode_amd64_adjust_zero
|
|
|
|
+ JEQ sequenceDecs_decode_amd64_adjust_one
|
|
|
|
+ CMPQ CX, $0x02
|
|
|
|
+ JA sequenceDecs_decode_amd64_adjust_three
|
|
|
|
+ JMP sequenceDecs_decode_amd64_adjust_two
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_adjust_zero:
|
|
|
|
+ MOVQ R11, AX
|
|
|
|
+ JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_adjust_one:
|
|
|
|
+ MOVQ R12, AX
|
|
|
|
+ JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_adjust_two:
|
|
|
|
+ MOVQ R13, AX
|
|
|
|
+ JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_adjust_three:
|
|
|
|
+ LEAQ -1(R11), AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_adjust_test_temp_valid:
|
|
|
|
+ TESTQ AX, AX
|
|
|
|
+ JNZ sequenceDecs_decode_amd64_adjust_temp_valid
|
|
|
|
+ MOVQ $0x00000001, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_adjust_temp_valid:
|
|
|
|
+ CMPQ CX, $0x01
|
|
|
|
+ CMOVQNE R12, R13
|
|
|
|
+ MOVQ R11, R12
|
|
|
|
+ MOVQ AX, R11
|
|
|
|
+ MOVQ AX, CX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_after_adjust:
|
|
|
|
+ MOVQ CX, 16(R10)
|
|
|
|
+
|
|
|
|
+ // Check values
|
|
|
|
+ MOVQ 8(R10), AX
|
|
|
|
+ MOVQ (R10), R14
|
|
|
|
+ LEAQ (AX)(R14*1), R15
|
|
|
|
+ MOVQ s+0(FP), BP
|
|
|
|
+ ADDQ R15, 256(BP)
|
|
|
|
+ MOVQ ctx+16(FP), R15
|
|
|
|
+ SUBQ R14, 128(R15)
|
|
|
|
+ JS error_not_enough_literals
|
|
|
|
+ CMPQ AX, $0x00020002
|
|
|
|
+ JA sequenceDecs_decode_amd64_error_match_len_too_big
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
|
|
|
|
+ TESTQ AX, AX
|
|
|
|
+ JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_amd64_match_len_ofs_ok:
|
|
|
|
+ ADDQ $0x18, R10
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ DECQ 96(AX)
|
|
|
|
+ JNS sequenceDecs_decode_amd64_main_loop
|
|
|
|
+ MOVQ s+0(FP), AX
|
|
|
|
+ MOVQ R11, 144(AX)
|
|
|
|
+ MOVQ R12, 152(AX)
|
|
|
|
+ MOVQ R13, 160(AX)
|
|
|
|
+ MOVQ br+8(FP), AX
|
|
|
|
+ MOVQ DX, 32(AX)
|
|
|
|
+ MOVB BL, 40(AX)
|
|
|
|
+ MOVQ SI, 24(AX)
|
|
|
|
+
|
|
|
|
+ // Return success
|
|
|
|
+ MOVQ $0x00000000, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match length error
|
|
|
|
+sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
|
|
|
|
+ MOVQ $0x00000001, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match too long error
|
|
|
|
+sequenceDecs_decode_amd64_error_match_len_too_big:
|
|
|
|
+ MOVQ $0x00000002, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match offset too long error
|
|
|
|
+ MOVQ $0x00000003, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough literals error
|
|
|
|
+error_not_enough_literals:
|
|
|
|
+ MOVQ $0x00000004, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough output space error
|
|
|
|
+ MOVQ $0x00000005, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
|
|
|
|
+// Requires: CMOV
|
|
|
|
+TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
|
|
|
|
+ MOVQ br+8(FP), AX
|
|
|
|
+ MOVQ 32(AX), DX
|
|
|
|
+ MOVBQZX 40(AX), BX
|
|
|
|
+ MOVQ 24(AX), SI
|
|
|
|
+ MOVQ (AX), AX
|
|
|
|
+ ADDQ SI, AX
|
|
|
|
+ MOVQ AX, (SP)
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 72(AX), DI
|
|
|
|
+ MOVQ 80(AX), R8
|
|
|
|
+ MOVQ 88(AX), R9
|
|
|
|
+ MOVQ 104(AX), R10
|
|
|
|
+ MOVQ s+0(FP), AX
|
|
|
|
+ MOVQ 144(AX), R11
|
|
|
|
+ MOVQ 152(AX), R12
|
|
|
|
+ MOVQ 160(AX), R13
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_main_loop:
|
|
|
|
+ MOVQ (SP), R14
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the offset and match length.
|
|
|
|
+ CMPQ SI, $0x08
|
|
|
|
+ JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
|
|
|
|
+ MOVQ BX, AX
|
|
|
|
+ SHRQ $0x03, AX
|
|
|
|
+ SUBQ AX, R14
|
|
|
|
+ MOVQ (R14), DX
|
|
|
|
+ SUBQ AX, SI
|
|
|
|
+ ANDQ $0x07, BX
|
|
|
|
+ JMP sequenceDecs_decode_56_amd64_fill_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_fill_byte_by_byte:
|
|
|
|
+ CMPQ SI, $0x00
|
|
|
|
+ JLE sequenceDecs_decode_56_amd64_fill_end
|
|
|
|
+ CMPQ BX, $0x07
|
|
|
|
+ JLE sequenceDecs_decode_56_amd64_fill_end
|
|
|
|
+ SHLQ $0x08, DX
|
|
|
|
+ SUBQ $0x01, R14
|
|
|
|
+ SUBQ $0x01, SI
|
|
|
|
+ SUBQ $0x08, BX
|
|
|
|
+ MOVBQZX (R14), AX
|
|
|
|
+ ORQ AX, DX
|
|
|
|
+ JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_fill_end:
|
|
|
|
+ // Update offset
|
|
|
|
+ MOVQ R9, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ SHLQ CL, R15
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decode_56_amd64_of_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decode_56_amd64_of_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decode_56_amd64_of_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R15
|
|
|
|
+ ADDQ R15, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_of_update_zero:
|
|
|
|
+ MOVQ AX, 16(R10)
|
|
|
|
+
|
|
|
|
+ // Update match length
|
|
|
|
+ MOVQ R8, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ SHLQ CL, R15
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decode_56_amd64_ml_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decode_56_amd64_ml_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decode_56_amd64_ml_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R15
|
|
|
|
+ ADDQ R15, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_ml_update_zero:
|
|
|
|
+ MOVQ AX, 8(R10)
|
|
|
|
+
|
|
|
|
+ // Update literal length
|
|
|
|
+ MOVQ DI, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ SHLQ CL, R15
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decode_56_amd64_ll_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decode_56_amd64_ll_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decode_56_amd64_ll_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R15
|
|
|
|
+ ADDQ R15, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_ll_update_zero:
|
|
|
|
+ MOVQ AX, (R10)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader for state updates
|
|
|
|
+ MOVQ R14, (SP)
|
|
|
|
+ MOVQ R9, AX
|
|
|
|
+ SHRQ $0x08, AX
|
|
|
|
+ MOVBQZX AL, AX
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ CMPQ 96(CX), $0x00
|
|
|
|
+ JZ sequenceDecs_decode_56_amd64_skip_update
|
|
|
|
+
|
|
|
|
+ // Update Literal Length State
|
|
|
|
+ MOVBQZX DI, R14
|
|
|
|
+ SHRQ $0x10, DI
|
|
|
|
+ MOVWQZX DI, DI
|
|
|
|
+ LEAQ (BX)(R14*1), CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ MOVL $0x00000001, BP
|
|
|
|
+ MOVB R14, CL
|
|
|
|
+ SHLL CL, BP
|
|
|
|
+ DECL BP
|
|
|
|
+ ANDQ BP, R15
|
|
|
|
+ ADDQ R15, DI
|
|
|
|
+
|
|
|
|
+ // Load ctx.llTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ MOVQ (CX)(DI*8), DI
|
|
|
|
+
|
|
|
|
+ // Update Match Length State
|
|
|
|
+ MOVBQZX R8, R14
|
|
|
|
+ SHRQ $0x10, R8
|
|
|
|
+ MOVWQZX R8, R8
|
|
|
|
+ LEAQ (BX)(R14*1), CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ MOVL $0x00000001, BP
|
|
|
|
+ MOVB R14, CL
|
|
|
|
+ SHLL CL, BP
|
|
|
|
+ DECL BP
|
|
|
|
+ ANDQ BP, R15
|
|
|
|
+ ADDQ R15, R8
|
|
|
|
+
|
|
|
|
+ // Load ctx.mlTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 24(CX), CX
|
|
|
|
+ MOVQ (CX)(R8*8), R8
|
|
|
|
+
|
|
|
|
+ // Update Offset State
|
|
|
|
+ MOVBQZX R9, R14
|
|
|
|
+ SHRQ $0x10, R9
|
|
|
|
+ MOVWQZX R9, R9
|
|
|
|
+ LEAQ (BX)(R14*1), CX
|
|
|
|
+ MOVQ DX, R15
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ MOVL $0x00000001, BP
|
|
|
|
+ MOVB R14, CL
|
|
|
|
+ SHLL CL, BP
|
|
|
|
+ DECL BP
|
|
|
|
+ ANDQ BP, R15
|
|
|
|
+ ADDQ R15, R9
|
|
|
|
+
|
|
|
|
+ // Load ctx.ofTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 48(CX), CX
|
|
|
|
+ MOVQ (CX)(R9*8), R9
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_skip_update:
|
|
|
|
+ // Adjust offset
|
|
|
|
+ MOVQ 16(R10), CX
|
|
|
|
+ CMPQ AX, $0x01
|
|
|
|
+ JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
|
|
|
|
+ MOVQ R12, R13
|
|
|
|
+ MOVQ R11, R12
|
|
|
|
+ MOVQ CX, R11
|
|
|
|
+ JMP sequenceDecs_decode_56_amd64_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
|
|
|
|
+ CMPQ (R10), $0x00000000
|
|
|
|
+ JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
|
|
|
|
+ INCQ CX
|
|
|
|
+ JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
|
|
|
|
+ MOVQ R11, CX
|
|
|
|
+ JMP sequenceDecs_decode_56_amd64_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
|
|
|
|
+ CMPQ CX, $0x01
|
|
|
|
+ JB sequenceDecs_decode_56_amd64_adjust_zero
|
|
|
|
+ JEQ sequenceDecs_decode_56_amd64_adjust_one
|
|
|
|
+ CMPQ CX, $0x02
|
|
|
|
+ JA sequenceDecs_decode_56_amd64_adjust_three
|
|
|
|
+ JMP sequenceDecs_decode_56_amd64_adjust_two
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_adjust_zero:
|
|
|
|
+ MOVQ R11, AX
|
|
|
|
+ JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_adjust_one:
|
|
|
|
+ MOVQ R12, AX
|
|
|
|
+ JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_adjust_two:
|
|
|
|
+ MOVQ R13, AX
|
|
|
|
+ JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_adjust_three:
|
|
|
|
+ LEAQ -1(R11), AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
|
|
|
|
+ TESTQ AX, AX
|
|
|
|
+ JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
|
|
|
|
+ MOVQ $0x00000001, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_adjust_temp_valid:
|
|
|
|
+ CMPQ CX, $0x01
|
|
|
|
+ CMOVQNE R12, R13
|
|
|
|
+ MOVQ R11, R12
|
|
|
|
+ MOVQ AX, R11
|
|
|
|
+ MOVQ AX, CX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_after_adjust:
|
|
|
|
+ MOVQ CX, 16(R10)
|
|
|
|
+
|
|
|
|
+ // Check values
|
|
|
|
+ MOVQ 8(R10), AX
|
|
|
|
+ MOVQ (R10), R14
|
|
|
|
+ LEAQ (AX)(R14*1), R15
|
|
|
|
+ MOVQ s+0(FP), BP
|
|
|
|
+ ADDQ R15, 256(BP)
|
|
|
|
+ MOVQ ctx+16(FP), R15
|
|
|
|
+ SUBQ R14, 128(R15)
|
|
|
|
+ JS error_not_enough_literals
|
|
|
|
+ CMPQ AX, $0x00020002
|
|
|
|
+ JA sequenceDecs_decode_56_amd64_error_match_len_too_big
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
|
|
|
|
+ TESTQ AX, AX
|
|
|
|
+ JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_amd64_match_len_ofs_ok:
|
|
|
|
+ ADDQ $0x18, R10
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ DECQ 96(AX)
|
|
|
|
+ JNS sequenceDecs_decode_56_amd64_main_loop
|
|
|
|
+ MOVQ s+0(FP), AX
|
|
|
|
+ MOVQ R11, 144(AX)
|
|
|
|
+ MOVQ R12, 152(AX)
|
|
|
|
+ MOVQ R13, 160(AX)
|
|
|
|
+ MOVQ br+8(FP), AX
|
|
|
|
+ MOVQ DX, 32(AX)
|
|
|
|
+ MOVB BL, 40(AX)
|
|
|
|
+ MOVQ SI, 24(AX)
|
|
|
|
+
|
|
|
|
+ // Return success
|
|
|
|
+ MOVQ $0x00000000, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match length error
|
|
|
|
+sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
|
|
|
|
+ MOVQ $0x00000001, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match too long error
|
|
|
|
+sequenceDecs_decode_56_amd64_error_match_len_too_big:
|
|
|
|
+ MOVQ $0x00000002, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match offset too long error
|
|
|
|
+ MOVQ $0x00000003, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough literals error
|
|
|
|
+error_not_enough_literals:
|
|
|
|
+ MOVQ $0x00000004, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough output space error
|
|
|
|
+ MOVQ $0x00000005, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
|
|
|
|
+// Requires: BMI, BMI2, CMOV
|
|
|
|
+TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
|
|
|
|
+ MOVQ br+8(FP), CX
|
|
|
|
+ MOVQ 32(CX), AX
|
|
|
|
+ MOVBQZX 40(CX), DX
|
|
|
|
+ MOVQ 24(CX), BX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ ADDQ BX, CX
|
|
|
|
+ MOVQ CX, (SP)
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 72(CX), SI
|
|
|
|
+ MOVQ 80(CX), DI
|
|
|
|
+ MOVQ 88(CX), R8
|
|
|
|
+ MOVQ 104(CX), R9
|
|
|
|
+ MOVQ s+0(FP), CX
|
|
|
|
+ MOVQ 144(CX), R10
|
|
|
|
+ MOVQ 152(CX), R11
|
|
|
|
+ MOVQ 160(CX), R12
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_main_loop:
|
|
|
|
+ MOVQ (SP), R13
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the offset and match length.
|
|
|
|
+ CMPQ BX, $0x08
|
|
|
|
+ JL sequenceDecs_decode_bmi2_fill_byte_by_byte
|
|
|
|
+ MOVQ DX, CX
|
|
|
|
+ SHRQ $0x03, CX
|
|
|
|
+ SUBQ CX, R13
|
|
|
|
+ MOVQ (R13), AX
|
|
|
|
+ SUBQ CX, BX
|
|
|
|
+ ANDQ $0x07, DX
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_fill_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_fill_byte_by_byte:
|
|
|
|
+ CMPQ BX, $0x00
|
|
|
|
+ JLE sequenceDecs_decode_bmi2_fill_end
|
|
|
|
+ CMPQ DX, $0x07
|
|
|
|
+ JLE sequenceDecs_decode_bmi2_fill_end
|
|
|
|
+ SHLQ $0x08, AX
|
|
|
|
+ SUBQ $0x01, R13
|
|
|
|
+ SUBQ $0x01, BX
|
|
|
|
+ SUBQ $0x08, DX
|
|
|
|
+ MOVBQZX (R13), CX
|
|
|
|
+ ORQ CX, AX
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_fill_end:
|
|
|
|
+ // Update offset
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, R8, R14
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ LEAQ (DX)(R14*1), CX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ BZHIQ R14, R15, R15
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ MOVQ R8, CX
|
|
|
|
+ SHRQ $0x20, CX
|
|
|
|
+ ADDQ R15, CX
|
|
|
|
+ MOVQ CX, 16(R9)
|
|
|
|
+
|
|
|
|
+ // Update match length
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, DI, R14
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ LEAQ (DX)(R14*1), CX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ BZHIQ R14, R15, R15
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ MOVQ DI, CX
|
|
|
|
+ SHRQ $0x20, CX
|
|
|
|
+ ADDQ R15, CX
|
|
|
|
+ MOVQ CX, 8(R9)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the remaining
|
|
|
|
+ CMPQ BX, $0x08
|
|
|
|
+ JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
|
|
|
|
+ MOVQ DX, CX
|
|
|
|
+ SHRQ $0x03, CX
|
|
|
|
+ SUBQ CX, R13
|
|
|
|
+ MOVQ (R13), AX
|
|
|
|
+ SUBQ CX, BX
|
|
|
|
+ ANDQ $0x07, DX
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_fill_2_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
|
|
|
|
+ CMPQ BX, $0x00
|
|
|
|
+ JLE sequenceDecs_decode_bmi2_fill_2_end
|
|
|
|
+ CMPQ DX, $0x07
|
|
|
|
+ JLE sequenceDecs_decode_bmi2_fill_2_end
|
|
|
|
+ SHLQ $0x08, AX
|
|
|
|
+ SUBQ $0x01, R13
|
|
|
|
+ SUBQ $0x01, BX
|
|
|
|
+ SUBQ $0x08, DX
|
|
|
|
+ MOVBQZX (R13), CX
|
|
|
|
+ ORQ CX, AX
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_fill_2_end:
|
|
|
|
+ // Update literal length
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, SI, R14
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ LEAQ (DX)(R14*1), CX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ BZHIQ R14, R15, R15
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ MOVQ SI, CX
|
|
|
|
+ SHRQ $0x20, CX
|
|
|
|
+ ADDQ R15, CX
|
|
|
|
+ MOVQ CX, (R9)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader for state updates
|
|
|
|
+ MOVQ R13, (SP)
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, R8, R13
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ CMPQ 96(CX), $0x00
|
|
|
|
+ JZ sequenceDecs_decode_bmi2_skip_update
|
|
|
|
+ LEAQ (SI)(DI*1), R14
|
|
|
|
+ ADDQ R8, R14
|
|
|
|
+ MOVBQZX R14, R14
|
|
|
|
+ LEAQ (DX)(R14*1), CX
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ BZHIQ R14, R15, R15
|
|
|
|
+
|
|
|
|
+ // Update Offset State
|
|
|
|
+ BZHIQ R8, R15, CX
|
|
|
|
+ SHRXQ R8, R15, R15
|
|
|
|
+ MOVQ $0x00001010, R14
|
|
|
|
+ BEXTRQ R14, R8, R8
|
|
|
|
+ ADDQ CX, R8
|
|
|
|
+
|
|
|
|
+ // Load ctx.ofTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 48(CX), CX
|
|
|
|
+ MOVQ (CX)(R8*8), R8
|
|
|
|
+
|
|
|
|
+ // Update Match Length State
|
|
|
|
+ BZHIQ DI, R15, CX
|
|
|
|
+ SHRXQ DI, R15, R15
|
|
|
|
+ MOVQ $0x00001010, R14
|
|
|
|
+ BEXTRQ R14, DI, DI
|
|
|
|
+ ADDQ CX, DI
|
|
|
|
+
|
|
|
|
+ // Load ctx.mlTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 24(CX), CX
|
|
|
|
+ MOVQ (CX)(DI*8), DI
|
|
|
|
+
|
|
|
|
+ // Update Literal Length State
|
|
|
|
+ BZHIQ SI, R15, CX
|
|
|
|
+ MOVQ $0x00001010, R14
|
|
|
|
+ BEXTRQ R14, SI, SI
|
|
|
|
+ ADDQ CX, SI
|
|
|
|
+
|
|
|
|
+ // Load ctx.llTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ MOVQ (CX)(SI*8), SI
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_skip_update:
|
|
|
|
+ // Adjust offset
|
|
|
|
+ MOVQ 16(R9), CX
|
|
|
|
+ CMPQ R13, $0x01
|
|
|
|
+ JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
|
|
|
|
+ MOVQ R11, R12
|
|
|
|
+ MOVQ R10, R11
|
|
|
|
+ MOVQ CX, R10
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
|
|
|
|
+ CMPQ (R9), $0x00000000
|
|
|
|
+ JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
|
|
|
|
+ INCQ CX
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_adjust_offset_maybezero:
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
|
|
|
|
+ MOVQ R10, CX
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_adjust_offset_nonzero:
|
|
|
|
+ CMPQ CX, $0x01
|
|
|
|
+ JB sequenceDecs_decode_bmi2_adjust_zero
|
|
|
|
+ JEQ sequenceDecs_decode_bmi2_adjust_one
|
|
|
|
+ CMPQ CX, $0x02
|
|
|
|
+ JA sequenceDecs_decode_bmi2_adjust_three
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_adjust_two
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_adjust_zero:
|
|
|
|
+ MOVQ R10, R13
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_adjust_one:
|
|
|
|
+ MOVQ R11, R13
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_adjust_two:
|
|
|
|
+ MOVQ R12, R13
|
|
|
|
+ JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_adjust_three:
|
|
|
|
+ LEAQ -1(R10), R13
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_adjust_test_temp_valid:
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
|
|
|
|
+ MOVQ $0x00000001, R13
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_adjust_temp_valid:
|
|
|
|
+ CMPQ CX, $0x01
|
|
|
|
+ CMOVQNE R11, R12
|
|
|
|
+ MOVQ R10, R11
|
|
|
|
+ MOVQ R13, R10
|
|
|
|
+ MOVQ R13, CX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_after_adjust:
|
|
|
|
+ MOVQ CX, 16(R9)
|
|
|
|
+
|
|
|
|
+ // Check values
|
|
|
|
+ MOVQ 8(R9), R13
|
|
|
|
+ MOVQ (R9), R14
|
|
|
|
+ LEAQ (R13)(R14*1), R15
|
|
|
|
+ MOVQ s+0(FP), BP
|
|
|
|
+ ADDQ R15, 256(BP)
|
|
|
|
+ MOVQ ctx+16(FP), R15
|
|
|
|
+ SUBQ R14, 128(R15)
|
|
|
|
+ JS error_not_enough_literals
|
|
|
|
+ CMPQ R13, $0x00020002
|
|
|
|
+ JA sequenceDecs_decode_bmi2_error_match_len_too_big
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_bmi2_match_len_ofs_ok:
|
|
|
|
+ ADDQ $0x18, R9
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ DECQ 96(CX)
|
|
|
|
+ JNS sequenceDecs_decode_bmi2_main_loop
|
|
|
|
+ MOVQ s+0(FP), CX
|
|
|
|
+ MOVQ R10, 144(CX)
|
|
|
|
+ MOVQ R11, 152(CX)
|
|
|
|
+ MOVQ R12, 160(CX)
|
|
|
|
+ MOVQ br+8(FP), CX
|
|
|
|
+ MOVQ AX, 32(CX)
|
|
|
|
+ MOVB DL, 40(CX)
|
|
|
|
+ MOVQ BX, 24(CX)
|
|
|
|
+
|
|
|
|
+ // Return success
|
|
|
|
+ MOVQ $0x00000000, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match length error
|
|
|
|
+sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
|
|
|
|
+ MOVQ $0x00000001, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match too long error
|
|
|
|
+sequenceDecs_decode_bmi2_error_match_len_too_big:
|
|
|
|
+ MOVQ $0x00000002, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match offset too long error
|
|
|
|
+ MOVQ $0x00000003, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough literals error
|
|
|
|
+error_not_enough_literals:
|
|
|
|
+ MOVQ $0x00000004, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough output space error
|
|
|
|
+ MOVQ $0x00000005, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
|
|
|
|
+// Requires: BMI, BMI2, CMOV
|
|
|
|
+TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
|
|
|
|
+ MOVQ br+8(FP), CX
|
|
|
|
+ MOVQ 32(CX), AX
|
|
|
|
+ MOVBQZX 40(CX), DX
|
|
|
|
+ MOVQ 24(CX), BX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ ADDQ BX, CX
|
|
|
|
+ MOVQ CX, (SP)
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 72(CX), SI
|
|
|
|
+ MOVQ 80(CX), DI
|
|
|
|
+ MOVQ 88(CX), R8
|
|
|
|
+ MOVQ 104(CX), R9
|
|
|
|
+ MOVQ s+0(FP), CX
|
|
|
|
+ MOVQ 144(CX), R10
|
|
|
|
+ MOVQ 152(CX), R11
|
|
|
|
+ MOVQ 160(CX), R12
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_main_loop:
|
|
|
|
+ MOVQ (SP), R13
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the offset and match length.
|
|
|
|
+ CMPQ BX, $0x08
|
|
|
|
+ JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
|
|
|
|
+ MOVQ DX, CX
|
|
|
|
+ SHRQ $0x03, CX
|
|
|
|
+ SUBQ CX, R13
|
|
|
|
+ MOVQ (R13), AX
|
|
|
|
+ SUBQ CX, BX
|
|
|
|
+ ANDQ $0x07, DX
|
|
|
|
+ JMP sequenceDecs_decode_56_bmi2_fill_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
|
|
|
|
+ CMPQ BX, $0x00
|
|
|
|
+ JLE sequenceDecs_decode_56_bmi2_fill_end
|
|
|
|
+ CMPQ DX, $0x07
|
|
|
|
+ JLE sequenceDecs_decode_56_bmi2_fill_end
|
|
|
|
+ SHLQ $0x08, AX
|
|
|
|
+ SUBQ $0x01, R13
|
|
|
|
+ SUBQ $0x01, BX
|
|
|
|
+ SUBQ $0x08, DX
|
|
|
|
+ MOVBQZX (R13), CX
|
|
|
|
+ ORQ CX, AX
|
|
|
|
+ JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_fill_end:
|
|
|
|
+ // Update offset
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, R8, R14
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ LEAQ (DX)(R14*1), CX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ BZHIQ R14, R15, R15
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ MOVQ R8, CX
|
|
|
|
+ SHRQ $0x20, CX
|
|
|
|
+ ADDQ R15, CX
|
|
|
|
+ MOVQ CX, 16(R9)
|
|
|
|
+
|
|
|
|
+ // Update match length
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, DI, R14
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ LEAQ (DX)(R14*1), CX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ BZHIQ R14, R15, R15
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ MOVQ DI, CX
|
|
|
|
+ SHRQ $0x20, CX
|
|
|
|
+ ADDQ R15, CX
|
|
|
|
+ MOVQ CX, 8(R9)
|
|
|
|
+
|
|
|
|
+ // Update literal length
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, SI, R14
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ LEAQ (DX)(R14*1), CX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ BZHIQ R14, R15, R15
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ MOVQ SI, CX
|
|
|
|
+ SHRQ $0x20, CX
|
|
|
|
+ ADDQ R15, CX
|
|
|
|
+ MOVQ CX, (R9)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader for state updates
|
|
|
|
+ MOVQ R13, (SP)
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, R8, R13
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ CMPQ 96(CX), $0x00
|
|
|
|
+ JZ sequenceDecs_decode_56_bmi2_skip_update
|
|
|
|
+ LEAQ (SI)(DI*1), R14
|
|
|
|
+ ADDQ R8, R14
|
|
|
|
+ MOVBQZX R14, R14
|
|
|
|
+ LEAQ (DX)(R14*1), CX
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ ROLQ CL, R15
|
|
|
|
+ BZHIQ R14, R15, R15
|
|
|
|
+
|
|
|
|
+ // Update Offset State
|
|
|
|
+ BZHIQ R8, R15, CX
|
|
|
|
+ SHRXQ R8, R15, R15
|
|
|
|
+ MOVQ $0x00001010, R14
|
|
|
|
+ BEXTRQ R14, R8, R8
|
|
|
|
+ ADDQ CX, R8
|
|
|
|
+
|
|
|
|
+ // Load ctx.ofTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 48(CX), CX
|
|
|
|
+ MOVQ (CX)(R8*8), R8
|
|
|
|
+
|
|
|
|
+ // Update Match Length State
|
|
|
|
+ BZHIQ DI, R15, CX
|
|
|
|
+ SHRXQ DI, R15, R15
|
|
|
|
+ MOVQ $0x00001010, R14
|
|
|
|
+ BEXTRQ R14, DI, DI
|
|
|
|
+ ADDQ CX, DI
|
|
|
|
+
|
|
|
|
+ // Load ctx.mlTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 24(CX), CX
|
|
|
|
+ MOVQ (CX)(DI*8), DI
|
|
|
|
+
|
|
|
|
+ // Update Literal Length State
|
|
|
|
+ BZHIQ SI, R15, CX
|
|
|
|
+ MOVQ $0x00001010, R14
|
|
|
|
+ BEXTRQ R14, SI, SI
|
|
|
|
+ ADDQ CX, SI
|
|
|
|
+
|
|
|
|
+ // Load ctx.llTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ MOVQ (CX)(SI*8), SI
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_skip_update:
|
|
|
|
+ // Adjust offset
|
|
|
|
+ MOVQ 16(R9), CX
|
|
|
|
+ CMPQ R13, $0x01
|
|
|
|
+ JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
|
|
|
|
+ MOVQ R11, R12
|
|
|
|
+ MOVQ R10, R11
|
|
|
|
+ MOVQ CX, R10
|
|
|
|
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
|
|
|
|
+ CMPQ (R9), $0x00000000
|
|
|
|
+ JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
|
|
|
|
+ INCQ CX
|
|
|
|
+ JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
|
|
|
|
+ MOVQ R10, CX
|
|
|
|
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
|
|
|
|
+ CMPQ CX, $0x01
|
|
|
|
+ JB sequenceDecs_decode_56_bmi2_adjust_zero
|
|
|
|
+ JEQ sequenceDecs_decode_56_bmi2_adjust_one
|
|
|
|
+ CMPQ CX, $0x02
|
|
|
|
+ JA sequenceDecs_decode_56_bmi2_adjust_three
|
|
|
|
+ JMP sequenceDecs_decode_56_bmi2_adjust_two
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_adjust_zero:
|
|
|
|
+ MOVQ R10, R13
|
|
|
|
+ JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_adjust_one:
|
|
|
|
+ MOVQ R11, R13
|
|
|
|
+ JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_adjust_two:
|
|
|
|
+ MOVQ R12, R13
|
|
|
|
+ JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_adjust_three:
|
|
|
|
+ LEAQ -1(R10), R13
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
|
|
|
|
+ MOVQ $0x00000001, R13
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_adjust_temp_valid:
|
|
|
|
+ CMPQ CX, $0x01
|
|
|
|
+ CMOVQNE R11, R12
|
|
|
|
+ MOVQ R10, R11
|
|
|
|
+ MOVQ R13, R10
|
|
|
|
+ MOVQ R13, CX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_after_adjust:
|
|
|
|
+ MOVQ CX, 16(R9)
|
|
|
|
+
|
|
|
|
+ // Check values
|
|
|
|
+ MOVQ 8(R9), R13
|
|
|
|
+ MOVQ (R9), R14
|
|
|
|
+ LEAQ (R13)(R14*1), R15
|
|
|
|
+ MOVQ s+0(FP), BP
|
|
|
|
+ ADDQ R15, 256(BP)
|
|
|
|
+ MOVQ ctx+16(FP), R15
|
|
|
|
+ SUBQ R14, 128(R15)
|
|
|
|
+ JS error_not_enough_literals
|
|
|
|
+ CMPQ R13, $0x00020002
|
|
|
|
+ JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
|
|
|
|
+
|
|
|
|
+sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
|
|
|
|
+ ADDQ $0x18, R9
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ DECQ 96(CX)
|
|
|
|
+ JNS sequenceDecs_decode_56_bmi2_main_loop
|
|
|
|
+ MOVQ s+0(FP), CX
|
|
|
|
+ MOVQ R10, 144(CX)
|
|
|
|
+ MOVQ R11, 152(CX)
|
|
|
|
+ MOVQ R12, 160(CX)
|
|
|
|
+ MOVQ br+8(FP), CX
|
|
|
|
+ MOVQ AX, 32(CX)
|
|
|
|
+ MOVB DL, 40(CX)
|
|
|
|
+ MOVQ BX, 24(CX)
|
|
|
|
+
|
|
|
|
+ // Return success
|
|
|
|
+ MOVQ $0x00000000, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match length error
|
|
|
|
+sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
|
|
|
|
+ MOVQ $0x00000001, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match too long error
|
|
|
|
+sequenceDecs_decode_56_bmi2_error_match_len_too_big:
|
|
|
|
+ MOVQ $0x00000002, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match offset too long error
|
|
|
|
+ MOVQ $0x00000003, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough literals error
|
|
|
|
+error_not_enough_literals:
|
|
|
|
+ MOVQ $0x00000004, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough output space error
|
|
|
|
+ MOVQ $0x00000005, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
|
|
|
|
+// Requires: SSE
|
|
|
|
+TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
|
|
|
|
+ MOVQ ctx+0(FP), R10
|
|
|
|
+ MOVQ 8(R10), CX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ empty_seqs
|
|
|
|
+ MOVQ (R10), AX
|
|
|
|
+ MOVQ 24(R10), DX
|
|
|
|
+ MOVQ 32(R10), BX
|
|
|
|
+ MOVQ 80(R10), SI
|
|
|
|
+ MOVQ 104(R10), DI
|
|
|
|
+ MOVQ 120(R10), R8
|
|
|
|
+ MOVQ 56(R10), R9
|
|
|
|
+ MOVQ 64(R10), R10
|
|
|
|
+ ADDQ R10, R9
|
|
|
|
+
|
|
|
|
+ // seqsBase += 24 * seqIndex
|
|
|
|
+ LEAQ (DX)(DX*2), R11
|
|
|
|
+ SHLQ $0x03, R11
|
|
|
|
+ ADDQ R11, AX
|
|
|
|
+
|
|
|
|
+ // outBase += outPosition
|
|
|
|
+ ADDQ DI, BX
|
|
|
|
+
|
|
|
|
+main_loop:
|
|
|
|
+ MOVQ (AX), R11
|
|
|
|
+ MOVQ 16(AX), R12
|
|
|
|
+ MOVQ 8(AX), R13
|
|
|
|
+
|
|
|
|
+ // Copy literals
|
|
|
|
+ TESTQ R11, R11
|
|
|
|
+ JZ check_offset
|
|
|
|
+ XORQ R14, R14
|
|
|
|
+
|
|
|
|
+copy_1:
|
|
|
|
+ MOVUPS (SI)(R14*1), X0
|
|
|
|
+ MOVUPS X0, (BX)(R14*1)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ CMPQ R14, R11
|
|
|
|
+ JB copy_1
|
|
|
|
+ ADDQ R11, SI
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ ADDQ R11, DI
|
|
|
|
+
|
|
|
|
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
|
|
|
|
+check_offset:
|
|
|
|
+ LEAQ (DI)(R10*1), R11
|
|
|
|
+ CMPQ R12, R11
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+ CMPQ R12, R8
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+
|
|
|
|
+ // Copy match from history
|
|
|
|
+ MOVQ R12, R11
|
|
|
|
+ SUBQ DI, R11
|
|
|
|
+ JLS copy_match
|
|
|
|
+ MOVQ R9, R14
|
|
|
|
+ SUBQ R11, R14
|
|
|
|
+ CMPQ R13, R11
|
|
|
|
+ JG copy_all_from_history
|
|
|
|
+ MOVQ R13, R11
|
|
|
|
+ SUBQ $0x10, R11
|
|
|
|
+ JB copy_4_small
|
|
|
|
+
|
|
|
|
+copy_4_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (BX)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, BX
|
|
|
|
+ SUBQ $0x10, R11
|
|
|
|
+ JAE copy_4_loop
|
|
|
|
+ LEAQ 16(R14)(R11*1), R14
|
|
|
|
+ LEAQ 16(BX)(R11*1), BX
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(BX)
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_small:
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ JE copy_4_move_3
|
|
|
|
+ CMPQ R13, $0x08
|
|
|
|
+ JB copy_4_move_4through7
|
|
|
|
+ JMP copy_4_move_8through16
|
|
|
|
+
|
|
|
|
+copy_4_move_3:
|
|
|
|
+ MOVW (R14), R11
|
|
|
|
+ MOVB 2(R14), R12
|
|
|
|
+ MOVW R11, (BX)
|
|
|
|
+ MOVB R12, 2(BX)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_4through7:
|
|
|
|
+ MOVL (R14), R11
|
|
|
|
+ MOVL -4(R14)(R13*1), R12
|
|
|
|
+ MOVL R11, (BX)
|
|
|
|
+ MOVL R12, -4(BX)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_8through16:
|
|
|
|
+ MOVQ (R14), R11
|
|
|
|
+ MOVQ -8(R14)(R13*1), R12
|
|
|
|
+ MOVQ R11, (BX)
|
|
|
|
+ MOVQ R12, -8(BX)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+
|
|
|
|
+copy_4_end:
|
|
|
|
+ ADDQ R13, DI
|
|
|
|
+ ADDQ $0x18, AX
|
|
|
|
+ INCQ DX
|
|
|
|
+ CMPQ DX, CX
|
|
|
|
+ JB main_loop
|
|
|
|
+ JMP loop_finished
|
|
|
|
+
|
|
|
|
+copy_all_from_history:
|
|
|
|
+ MOVQ R11, R15
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JB copy_5_small
|
|
|
|
+
|
|
|
|
+copy_5_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (BX)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, BX
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JAE copy_5_loop
|
|
|
|
+ LEAQ 16(R14)(R15*1), R14
|
|
|
|
+ LEAQ 16(BX)(R15*1), BX
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(BX)
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_small:
|
|
|
|
+ CMPQ R11, $0x03
|
|
|
|
+ JE copy_5_move_3
|
|
|
|
+ JB copy_5_move_1or2
|
|
|
|
+ CMPQ R11, $0x08
|
|
|
|
+ JB copy_5_move_4through7
|
|
|
|
+ JMP copy_5_move_8through16
|
|
|
|
+
|
|
|
|
+copy_5_move_1or2:
|
|
|
|
+ MOVB (R14), R15
|
|
|
|
+ MOVB -1(R14)(R11*1), BP
|
|
|
|
+ MOVB R15, (BX)
|
|
|
|
+ MOVB BP, -1(BX)(R11*1)
|
|
|
|
+ ADDQ R11, R14
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_3:
|
|
|
|
+ MOVW (R14), R15
|
|
|
|
+ MOVB 2(R14), BP
|
|
|
|
+ MOVW R15, (BX)
|
|
|
|
+ MOVB BP, 2(BX)
|
|
|
|
+ ADDQ R11, R14
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_4through7:
|
|
|
|
+ MOVL (R14), R15
|
|
|
|
+ MOVL -4(R14)(R11*1), BP
|
|
|
|
+ MOVL R15, (BX)
|
|
|
|
+ MOVL BP, -4(BX)(R11*1)
|
|
|
|
+ ADDQ R11, R14
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_8through16:
|
|
|
|
+ MOVQ (R14), R15
|
|
|
|
+ MOVQ -8(R14)(R11*1), BP
|
|
|
|
+ MOVQ R15, (BX)
|
|
|
|
+ MOVQ BP, -8(BX)(R11*1)
|
|
|
|
+ ADDQ R11, R14
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+
|
|
|
|
+copy_5_end:
|
|
|
|
+ ADDQ R11, DI
|
|
|
|
+ SUBQ R11, R13
|
|
|
|
+
|
|
|
|
+ // Copy match from the current buffer
|
|
|
|
+copy_match:
|
|
|
|
+ MOVQ BX, R11
|
|
|
|
+ SUBQ R12, R11
|
|
|
|
+
|
|
|
|
+ // ml <= mo
|
|
|
|
+ CMPQ R13, R12
|
|
|
|
+ JA copy_overlapping_match
|
|
|
|
+
|
|
|
|
+ // Copy non-overlapping match
|
|
|
|
+ ADDQ R13, DI
|
|
|
|
+ MOVQ BX, R12
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+
|
|
|
|
+copy_2:
|
|
|
|
+ MOVUPS (R11), X0
|
|
|
|
+ MOVUPS X0, (R12)
|
|
|
|
+ ADDQ $0x10, R11
|
|
|
|
+ ADDQ $0x10, R12
|
|
|
|
+ SUBQ $0x10, R13
|
|
|
|
+ JHI copy_2
|
|
|
|
+ JMP handle_loop
|
|
|
|
+
|
|
|
|
+ // Copy overlapping match
|
|
|
|
+copy_overlapping_match:
|
|
|
|
+ ADDQ R13, DI
|
|
|
|
+
|
|
|
|
+copy_slow_3:
|
|
|
|
+ MOVB (R11), R12
|
|
|
|
+ MOVB R12, (BX)
|
|
|
|
+ INCQ R11
|
|
|
|
+ INCQ BX
|
|
|
|
+ DECQ R13
|
|
|
|
+ JNZ copy_slow_3
|
|
|
|
+
|
|
|
|
+handle_loop:
|
|
|
|
+ ADDQ $0x18, AX
|
|
|
|
+ INCQ DX
|
|
|
|
+ CMPQ DX, CX
|
|
|
|
+ JB main_loop
|
|
|
|
+
|
|
|
|
+loop_finished:
|
|
|
|
+ // Return value
|
|
|
|
+ MOVB $0x01, ret+8(FP)
|
|
|
|
+
|
|
|
|
+ // Update the context
|
|
|
|
+ MOVQ ctx+0(FP), AX
|
|
|
|
+ MOVQ DX, 24(AX)
|
|
|
|
+ MOVQ DI, 104(AX)
|
|
|
|
+ MOVQ 80(AX), CX
|
|
|
|
+ SUBQ CX, SI
|
|
|
|
+ MOVQ SI, 112(AX)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+error_match_off_too_big:
|
|
|
|
+ // Return value
|
|
|
|
+ MOVB $0x00, ret+8(FP)
|
|
|
|
+
|
|
|
|
+ // Update the context
|
|
|
|
+ MOVQ ctx+0(FP), AX
|
|
|
|
+ MOVQ DX, 24(AX)
|
|
|
|
+ MOVQ DI, 104(AX)
|
|
|
|
+ MOVQ 80(AX), CX
|
|
|
|
+ SUBQ CX, SI
|
|
|
|
+ MOVQ SI, 112(AX)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+empty_seqs:
|
|
|
|
+ // Return value
|
|
|
|
+ MOVB $0x01, ret+8(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
|
|
|
|
+// Requires: SSE
|
|
|
|
+TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
|
|
|
|
+ MOVQ ctx+0(FP), R10
|
|
|
|
+ MOVQ 8(R10), CX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ empty_seqs
|
|
|
|
+ MOVQ (R10), AX
|
|
|
|
+ MOVQ 24(R10), DX
|
|
|
|
+ MOVQ 32(R10), BX
|
|
|
|
+ MOVQ 80(R10), SI
|
|
|
|
+ MOVQ 104(R10), DI
|
|
|
|
+ MOVQ 120(R10), R8
|
|
|
|
+ MOVQ 56(R10), R9
|
|
|
|
+ MOVQ 64(R10), R10
|
|
|
|
+ ADDQ R10, R9
|
|
|
|
+
|
|
|
|
+ // seqsBase += 24 * seqIndex
|
|
|
|
+ LEAQ (DX)(DX*2), R11
|
|
|
|
+ SHLQ $0x03, R11
|
|
|
|
+ ADDQ R11, AX
|
|
|
|
+
|
|
|
|
+ // outBase += outPosition
|
|
|
|
+ ADDQ DI, BX
|
|
|
|
+
|
|
|
|
+main_loop:
|
|
|
|
+ MOVQ (AX), R11
|
|
|
|
+ MOVQ 16(AX), R12
|
|
|
|
+ MOVQ 8(AX), R13
|
|
|
|
+
|
|
|
|
+ // Copy literals
|
|
|
|
+ TESTQ R11, R11
|
|
|
|
+ JZ check_offset
|
|
|
|
+ MOVQ R11, R14
|
|
|
|
+ SUBQ $0x10, R14
|
|
|
|
+ JB copy_1_small
|
|
|
|
+
|
|
|
|
+copy_1_loop:
|
|
|
|
+ MOVUPS (SI), X0
|
|
|
|
+ MOVUPS X0, (BX)
|
|
|
|
+ ADDQ $0x10, SI
|
|
|
|
+ ADDQ $0x10, BX
|
|
|
|
+ SUBQ $0x10, R14
|
|
|
|
+ JAE copy_1_loop
|
|
|
|
+ LEAQ 16(SI)(R14*1), SI
|
|
|
|
+ LEAQ 16(BX)(R14*1), BX
|
|
|
|
+ MOVUPS -16(SI), X0
|
|
|
|
+ MOVUPS X0, -16(BX)
|
|
|
|
+ JMP copy_1_end
|
|
|
|
+
|
|
|
|
+copy_1_small:
|
|
|
|
+ CMPQ R11, $0x03
|
|
|
|
+ JE copy_1_move_3
|
|
|
|
+ JB copy_1_move_1or2
|
|
|
|
+ CMPQ R11, $0x08
|
|
|
|
+ JB copy_1_move_4through7
|
|
|
|
+ JMP copy_1_move_8through16
|
|
|
|
+
|
|
|
|
+copy_1_move_1or2:
|
|
|
|
+ MOVB (SI), R14
|
|
|
|
+ MOVB -1(SI)(R11*1), R15
|
|
|
|
+ MOVB R14, (BX)
|
|
|
|
+ MOVB R15, -1(BX)(R11*1)
|
|
|
|
+ ADDQ R11, SI
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ JMP copy_1_end
|
|
|
|
+
|
|
|
|
+copy_1_move_3:
|
|
|
|
+ MOVW (SI), R14
|
|
|
|
+ MOVB 2(SI), R15
|
|
|
|
+ MOVW R14, (BX)
|
|
|
|
+ MOVB R15, 2(BX)
|
|
|
|
+ ADDQ R11, SI
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ JMP copy_1_end
|
|
|
|
+
|
|
|
|
+copy_1_move_4through7:
|
|
|
|
+ MOVL (SI), R14
|
|
|
|
+ MOVL -4(SI)(R11*1), R15
|
|
|
|
+ MOVL R14, (BX)
|
|
|
|
+ MOVL R15, -4(BX)(R11*1)
|
|
|
|
+ ADDQ R11, SI
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ JMP copy_1_end
|
|
|
|
+
|
|
|
|
+copy_1_move_8through16:
|
|
|
|
+ MOVQ (SI), R14
|
|
|
|
+ MOVQ -8(SI)(R11*1), R15
|
|
|
|
+ MOVQ R14, (BX)
|
|
|
|
+ MOVQ R15, -8(BX)(R11*1)
|
|
|
|
+ ADDQ R11, SI
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+
|
|
|
|
+copy_1_end:
|
|
|
|
+ ADDQ R11, DI
|
|
|
|
+
|
|
|
|
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
|
|
|
|
+check_offset:
|
|
|
|
+ LEAQ (DI)(R10*1), R11
|
|
|
|
+ CMPQ R12, R11
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+ CMPQ R12, R8
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+
|
|
|
|
+ // Copy match from history
|
|
|
|
+ MOVQ R12, R11
|
|
|
|
+ SUBQ DI, R11
|
|
|
|
+ JLS copy_match
|
|
|
|
+ MOVQ R9, R14
|
|
|
|
+ SUBQ R11, R14
|
|
|
|
+ CMPQ R13, R11
|
|
|
|
+ JG copy_all_from_history
|
|
|
|
+ MOVQ R13, R11
|
|
|
|
+ SUBQ $0x10, R11
|
|
|
|
+ JB copy_4_small
|
|
|
|
+
|
|
|
|
+copy_4_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (BX)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, BX
|
|
|
|
+ SUBQ $0x10, R11
|
|
|
|
+ JAE copy_4_loop
|
|
|
|
+ LEAQ 16(R14)(R11*1), R14
|
|
|
|
+ LEAQ 16(BX)(R11*1), BX
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(BX)
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_small:
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ JE copy_4_move_3
|
|
|
|
+ CMPQ R13, $0x08
|
|
|
|
+ JB copy_4_move_4through7
|
|
|
|
+ JMP copy_4_move_8through16
|
|
|
|
+
|
|
|
|
+copy_4_move_3:
|
|
|
|
+ MOVW (R14), R11
|
|
|
|
+ MOVB 2(R14), R12
|
|
|
|
+ MOVW R11, (BX)
|
|
|
|
+ MOVB R12, 2(BX)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_4through7:
|
|
|
|
+ MOVL (R14), R11
|
|
|
|
+ MOVL -4(R14)(R13*1), R12
|
|
|
|
+ MOVL R11, (BX)
|
|
|
|
+ MOVL R12, -4(BX)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_8through16:
|
|
|
|
+ MOVQ (R14), R11
|
|
|
|
+ MOVQ -8(R14)(R13*1), R12
|
|
|
|
+ MOVQ R11, (BX)
|
|
|
|
+ MOVQ R12, -8(BX)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+
|
|
|
|
+copy_4_end:
|
|
|
|
+ ADDQ R13, DI
|
|
|
|
+ ADDQ $0x18, AX
|
|
|
|
+ INCQ DX
|
|
|
|
+ CMPQ DX, CX
|
|
|
|
+ JB main_loop
|
|
|
|
+ JMP loop_finished
|
|
|
|
+
|
|
|
|
+copy_all_from_history:
|
|
|
|
+ MOVQ R11, R15
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JB copy_5_small
|
|
|
|
+
|
|
|
|
+copy_5_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (BX)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, BX
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JAE copy_5_loop
|
|
|
|
+ LEAQ 16(R14)(R15*1), R14
|
|
|
|
+ LEAQ 16(BX)(R15*1), BX
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(BX)
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_small:
|
|
|
|
+ CMPQ R11, $0x03
|
|
|
|
+ JE copy_5_move_3
|
|
|
|
+ JB copy_5_move_1or2
|
|
|
|
+ CMPQ R11, $0x08
|
|
|
|
+ JB copy_5_move_4through7
|
|
|
|
+ JMP copy_5_move_8through16
|
|
|
|
+
|
|
|
|
+copy_5_move_1or2:
|
|
|
|
+ MOVB (R14), R15
|
|
|
|
+ MOVB -1(R14)(R11*1), BP
|
|
|
|
+ MOVB R15, (BX)
|
|
|
|
+ MOVB BP, -1(BX)(R11*1)
|
|
|
|
+ ADDQ R11, R14
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_3:
|
|
|
|
+ MOVW (R14), R15
|
|
|
|
+ MOVB 2(R14), BP
|
|
|
|
+ MOVW R15, (BX)
|
|
|
|
+ MOVB BP, 2(BX)
|
|
|
|
+ ADDQ R11, R14
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_4through7:
|
|
|
|
+ MOVL (R14), R15
|
|
|
|
+ MOVL -4(R14)(R11*1), BP
|
|
|
|
+ MOVL R15, (BX)
|
|
|
|
+ MOVL BP, -4(BX)(R11*1)
|
|
|
|
+ ADDQ R11, R14
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_8through16:
|
|
|
|
+ MOVQ (R14), R15
|
|
|
|
+ MOVQ -8(R14)(R11*1), BP
|
|
|
|
+ MOVQ R15, (BX)
|
|
|
|
+ MOVQ BP, -8(BX)(R11*1)
|
|
|
|
+ ADDQ R11, R14
|
|
|
|
+ ADDQ R11, BX
|
|
|
|
+
|
|
|
|
+copy_5_end:
|
|
|
|
+ ADDQ R11, DI
|
|
|
|
+ SUBQ R11, R13
|
|
|
|
+
|
|
|
|
+ // Copy match from the current buffer
|
|
|
|
+copy_match:
|
|
|
|
+ MOVQ BX, R11
|
|
|
|
+ SUBQ R12, R11
|
|
|
|
+
|
|
|
|
+ // ml <= mo
|
|
|
|
+ CMPQ R13, R12
|
|
|
|
+ JA copy_overlapping_match
|
|
|
|
+
|
|
|
|
+ // Copy non-overlapping match
|
|
|
|
+ ADDQ R13, DI
|
|
|
|
+ MOVQ R13, R12
|
|
|
|
+ SUBQ $0x10, R12
|
|
|
|
+ JB copy_2_small
|
|
|
|
+
|
|
|
|
+copy_2_loop:
|
|
|
|
+ MOVUPS (R11), X0
|
|
|
|
+ MOVUPS X0, (BX)
|
|
|
|
+ ADDQ $0x10, R11
|
|
|
|
+ ADDQ $0x10, BX
|
|
|
|
+ SUBQ $0x10, R12
|
|
|
|
+ JAE copy_2_loop
|
|
|
|
+ LEAQ 16(R11)(R12*1), R11
|
|
|
|
+ LEAQ 16(BX)(R12*1), BX
|
|
|
|
+ MOVUPS -16(R11), X0
|
|
|
|
+ MOVUPS X0, -16(BX)
|
|
|
|
+ JMP copy_2_end
|
|
|
|
+
|
|
|
|
+copy_2_small:
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ JE copy_2_move_3
|
|
|
|
+ JB copy_2_move_1or2
|
|
|
|
+ CMPQ R13, $0x08
|
|
|
|
+ JB copy_2_move_4through7
|
|
|
|
+ JMP copy_2_move_8through16
|
|
|
|
+
|
|
|
|
+copy_2_move_1or2:
|
|
|
|
+ MOVB (R11), R12
|
|
|
|
+ MOVB -1(R11)(R13*1), R14
|
|
|
|
+ MOVB R12, (BX)
|
|
|
|
+ MOVB R14, -1(BX)(R13*1)
|
|
|
|
+ ADDQ R13, R11
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+ JMP copy_2_end
|
|
|
|
+
|
|
|
|
+copy_2_move_3:
|
|
|
|
+ MOVW (R11), R12
|
|
|
|
+ MOVB 2(R11), R14
|
|
|
|
+ MOVW R12, (BX)
|
|
|
|
+ MOVB R14, 2(BX)
|
|
|
|
+ ADDQ R13, R11
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+ JMP copy_2_end
|
|
|
|
+
|
|
|
|
+copy_2_move_4through7:
|
|
|
|
+ MOVL (R11), R12
|
|
|
|
+ MOVL -4(R11)(R13*1), R14
|
|
|
|
+ MOVL R12, (BX)
|
|
|
|
+ MOVL R14, -4(BX)(R13*1)
|
|
|
|
+ ADDQ R13, R11
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+ JMP copy_2_end
|
|
|
|
+
|
|
|
|
+copy_2_move_8through16:
|
|
|
|
+ MOVQ (R11), R12
|
|
|
|
+ MOVQ -8(R11)(R13*1), R14
|
|
|
|
+ MOVQ R12, (BX)
|
|
|
|
+ MOVQ R14, -8(BX)(R13*1)
|
|
|
|
+ ADDQ R13, R11
|
|
|
|
+ ADDQ R13, BX
|
|
|
|
+
|
|
|
|
+copy_2_end:
|
|
|
|
+ JMP handle_loop
|
|
|
|
+
|
|
|
|
+ // Copy overlapping match
|
|
|
|
+copy_overlapping_match:
|
|
|
|
+ ADDQ R13, DI
|
|
|
|
+
|
|
|
|
+copy_slow_3:
|
|
|
|
+ MOVB (R11), R12
|
|
|
|
+ MOVB R12, (BX)
|
|
|
|
+ INCQ R11
|
|
|
|
+ INCQ BX
|
|
|
|
+ DECQ R13
|
|
|
|
+ JNZ copy_slow_3
|
|
|
|
+
|
|
|
|
+handle_loop:
|
|
|
|
+ ADDQ $0x18, AX
|
|
|
|
+ INCQ DX
|
|
|
|
+ CMPQ DX, CX
|
|
|
|
+ JB main_loop
|
|
|
|
+
|
|
|
|
+loop_finished:
|
|
|
|
+ // Return value
|
|
|
|
+ MOVB $0x01, ret+8(FP)
|
|
|
|
+
|
|
|
|
+ // Update the context
|
|
|
|
+ MOVQ ctx+0(FP), AX
|
|
|
|
+ MOVQ DX, 24(AX)
|
|
|
|
+ MOVQ DI, 104(AX)
|
|
|
|
+ MOVQ 80(AX), CX
|
|
|
|
+ SUBQ CX, SI
|
|
|
|
+ MOVQ SI, 112(AX)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+error_match_off_too_big:
|
|
|
|
+ // Return value
|
|
|
|
+ MOVB $0x00, ret+8(FP)
|
|
|
|
+
|
|
|
|
+ // Update the context
|
|
|
|
+ MOVQ ctx+0(FP), AX
|
|
|
|
+ MOVQ DX, 24(AX)
|
|
|
|
+ MOVQ DI, 104(AX)
|
|
|
|
+ MOVQ 80(AX), CX
|
|
|
|
+ SUBQ CX, SI
|
|
|
|
+ MOVQ SI, 112(AX)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+empty_seqs:
|
|
|
|
+ // Return value
|
|
|
|
+ MOVB $0x01, ret+8(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
|
|
|
|
+// Requires: CMOV, SSE
|
|
|
|
+TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
|
|
|
|
+ MOVQ br+8(FP), AX
|
|
|
|
+ MOVQ 32(AX), DX
|
|
|
|
+ MOVBQZX 40(AX), BX
|
|
|
|
+ MOVQ 24(AX), SI
|
|
|
|
+ MOVQ (AX), AX
|
|
|
|
+ ADDQ SI, AX
|
|
|
|
+ MOVQ AX, (SP)
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 72(AX), DI
|
|
|
|
+ MOVQ 80(AX), R8
|
|
|
|
+ MOVQ 88(AX), R9
|
|
|
|
+ XORQ CX, CX
|
|
|
|
+ MOVQ CX, 8(SP)
|
|
|
|
+ MOVQ CX, 16(SP)
|
|
|
|
+ MOVQ CX, 24(SP)
|
|
|
|
+ MOVQ 112(AX), R10
|
|
|
|
+ MOVQ 128(AX), CX
|
|
|
|
+ MOVQ CX, 32(SP)
|
|
|
|
+ MOVQ 144(AX), R11
|
|
|
|
+ MOVQ 136(AX), R12
|
|
|
|
+ MOVQ 200(AX), CX
|
|
|
|
+ MOVQ CX, 56(SP)
|
|
|
|
+ MOVQ 176(AX), CX
|
|
|
|
+ MOVQ CX, 48(SP)
|
|
|
|
+ MOVQ 184(AX), AX
|
|
|
|
+ MOVQ AX, 40(SP)
|
|
|
|
+ MOVQ 40(SP), AX
|
|
|
|
+ ADDQ AX, 48(SP)
|
|
|
|
+
|
|
|
|
+ // Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
|
|
|
|
+ ADDQ R10, 32(SP)
|
|
|
|
+
|
|
|
|
+ // outBase += outPosition
|
|
|
|
+ ADDQ R12, R10
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_main_loop:
|
|
|
|
+ MOVQ (SP), R13
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the offset and match length.
|
|
|
|
+ CMPQ SI, $0x08
|
|
|
|
+ JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
|
|
|
|
+ MOVQ BX, AX
|
|
|
|
+ SHRQ $0x03, AX
|
|
|
|
+ SUBQ AX, R13
|
|
|
|
+ MOVQ (R13), DX
|
|
|
|
+ SUBQ AX, SI
|
|
|
|
+ ANDQ $0x07, BX
|
|
|
|
+ JMP sequenceDecs_decodeSync_amd64_fill_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
|
|
|
|
+ CMPQ SI, $0x00
|
|
|
|
+ JLE sequenceDecs_decodeSync_amd64_fill_end
|
|
|
|
+ CMPQ BX, $0x07
|
|
|
|
+ JLE sequenceDecs_decodeSync_amd64_fill_end
|
|
|
|
+ SHLQ $0x08, DX
|
|
|
|
+ SUBQ $0x01, R13
|
|
|
|
+ SUBQ $0x01, SI
|
|
|
|
+ SUBQ $0x08, BX
|
|
|
|
+ MOVBQZX (R13), AX
|
|
|
|
+ ORQ AX, DX
|
|
|
|
+ JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_fill_end:
|
|
|
|
+ // Update offset
|
|
|
|
+ MOVQ R9, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ SHLQ CL, R14
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decodeSync_amd64_of_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decodeSync_amd64_of_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decodeSync_amd64_of_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R14
|
|
|
|
+ ADDQ R14, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_of_update_zero:
|
|
|
|
+ MOVQ AX, 8(SP)
|
|
|
|
+
|
|
|
|
+ // Update match length
|
|
|
|
+ MOVQ R8, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ SHLQ CL, R14
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decodeSync_amd64_ml_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decodeSync_amd64_ml_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decodeSync_amd64_ml_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R14
|
|
|
|
+ ADDQ R14, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_ml_update_zero:
|
|
|
|
+ MOVQ AX, 16(SP)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the remaining
|
|
|
|
+ CMPQ SI, $0x08
|
|
|
|
+ JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
|
|
|
|
+ MOVQ BX, AX
|
|
|
|
+ SHRQ $0x03, AX
|
|
|
|
+ SUBQ AX, R13
|
|
|
|
+ MOVQ (R13), DX
|
|
|
|
+ SUBQ AX, SI
|
|
|
|
+ ANDQ $0x07, BX
|
|
|
|
+ JMP sequenceDecs_decodeSync_amd64_fill_2_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
|
|
|
|
+ CMPQ SI, $0x00
|
|
|
|
+ JLE sequenceDecs_decodeSync_amd64_fill_2_end
|
|
|
|
+ CMPQ BX, $0x07
|
|
|
|
+ JLE sequenceDecs_decodeSync_amd64_fill_2_end
|
|
|
|
+ SHLQ $0x08, DX
|
|
|
|
+ SUBQ $0x01, R13
|
|
|
|
+ SUBQ $0x01, SI
|
|
|
|
+ SUBQ $0x08, BX
|
|
|
|
+ MOVBQZX (R13), AX
|
|
|
|
+ ORQ AX, DX
|
|
|
|
+ JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_fill_2_end:
|
|
|
|
+ // Update literal length
|
|
|
|
+ MOVQ DI, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ SHLQ CL, R14
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decodeSync_amd64_ll_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decodeSync_amd64_ll_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decodeSync_amd64_ll_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R14
|
|
|
|
+ ADDQ R14, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_ll_update_zero:
|
|
|
|
+ MOVQ AX, 24(SP)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader for state updates
|
|
|
|
+ MOVQ R13, (SP)
|
|
|
|
+ MOVQ R9, AX
|
|
|
|
+ SHRQ $0x08, AX
|
|
|
|
+ MOVBQZX AL, AX
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ CMPQ 96(CX), $0x00
|
|
|
|
+ JZ sequenceDecs_decodeSync_amd64_skip_update
|
|
|
|
+
|
|
|
|
+ // Update Literal Length State
|
|
|
|
+ MOVBQZX DI, R13
|
|
|
|
+ SHRQ $0x10, DI
|
|
|
|
+ MOVWQZX DI, DI
|
|
|
|
+ LEAQ (BX)(R13*1), CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ MOVL $0x00000001, R15
|
|
|
|
+ MOVB R13, CL
|
|
|
|
+ SHLL CL, R15
|
|
|
|
+ DECL R15
|
|
|
|
+ ANDQ R15, R14
|
|
|
|
+ ADDQ R14, DI
|
|
|
|
+
|
|
|
|
+ // Load ctx.llTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ MOVQ (CX)(DI*8), DI
|
|
|
|
+
|
|
|
|
+ // Update Match Length State
|
|
|
|
+ MOVBQZX R8, R13
|
|
|
|
+ SHRQ $0x10, R8
|
|
|
|
+ MOVWQZX R8, R8
|
|
|
|
+ LEAQ (BX)(R13*1), CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ MOVL $0x00000001, R15
|
|
|
|
+ MOVB R13, CL
|
|
|
|
+ SHLL CL, R15
|
|
|
|
+ DECL R15
|
|
|
|
+ ANDQ R15, R14
|
|
|
|
+ ADDQ R14, R8
|
|
|
|
+
|
|
|
|
+ // Load ctx.mlTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 24(CX), CX
|
|
|
|
+ MOVQ (CX)(R8*8), R8
|
|
|
|
+
|
|
|
|
+ // Update Offset State
|
|
|
|
+ MOVBQZX R9, R13
|
|
|
|
+ SHRQ $0x10, R9
|
|
|
|
+ MOVWQZX R9, R9
|
|
|
|
+ LEAQ (BX)(R13*1), CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ MOVL $0x00000001, R15
|
|
|
|
+ MOVB R13, CL
|
|
|
|
+ SHLL CL, R15
|
|
|
|
+ DECL R15
|
|
|
|
+ ANDQ R15, R14
|
|
|
|
+ ADDQ R14, R9
|
|
|
|
+
|
|
|
|
+ // Load ctx.ofTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 48(CX), CX
|
|
|
|
+ MOVQ (CX)(R9*8), R9
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_skip_update:
|
|
|
|
+ // Adjust offset
|
|
|
|
+ MOVQ s+0(FP), CX
|
|
|
|
+ MOVQ 8(SP), R13
|
|
|
|
+ CMPQ AX, $0x01
|
|
|
|
+ JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
|
|
|
|
+ MOVUPS 144(CX), X0
|
|
|
|
+ MOVQ R13, 144(CX)
|
|
|
|
+ MOVUPS X0, 152(CX)
|
|
|
|
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
|
|
|
|
+ CMPQ 24(SP), $0x00000000
|
|
|
|
+ JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
|
|
|
|
+ INCQ R13
|
|
|
|
+ JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
|
|
|
|
+ MOVQ 144(CX), R13
|
|
|
|
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
|
|
|
|
+ MOVQ R13, AX
|
|
|
|
+ XORQ R14, R14
|
|
|
|
+ MOVQ $-1, R15
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ CMOVQEQ R14, AX
|
|
|
|
+ CMOVQEQ R15, R14
|
|
|
|
+ ADDQ 144(CX)(AX*8), R14
|
|
|
|
+ JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
|
|
|
|
+ MOVQ $0x00000001, R14
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_adjust_temp_valid:
|
|
|
|
+ CMPQ R13, $0x01
|
|
|
|
+ JZ sequenceDecs_decodeSync_amd64_adjust_skip
|
|
|
|
+ MOVQ 152(CX), AX
|
|
|
|
+ MOVQ AX, 160(CX)
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_adjust_skip:
|
|
|
|
+ MOVQ 144(CX), AX
|
|
|
|
+ MOVQ AX, 152(CX)
|
|
|
|
+ MOVQ R14, 144(CX)
|
|
|
|
+ MOVQ R14, R13
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_after_adjust:
|
|
|
|
+ MOVQ R13, 8(SP)
|
|
|
|
+
|
|
|
|
+ // Check values
|
|
|
|
+ MOVQ 16(SP), AX
|
|
|
|
+ MOVQ 24(SP), CX
|
|
|
|
+ LEAQ (AX)(CX*1), R14
|
|
|
|
+ MOVQ s+0(FP), R15
|
|
|
|
+ ADDQ R14, 256(R15)
|
|
|
|
+ MOVQ ctx+16(FP), R14
|
|
|
|
+ SUBQ CX, 104(R14)
|
|
|
|
+ JS error_not_enough_literals
|
|
|
|
+ CMPQ AX, $0x00020002
|
|
|
|
+ JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
|
|
|
|
+ TESTQ AX, AX
|
|
|
|
+ JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
|
|
|
|
+ MOVQ 24(SP), AX
|
|
|
|
+ MOVQ 8(SP), CX
|
|
|
|
+ MOVQ 16(SP), R13
|
|
|
|
+
|
|
|
|
+ // Check if we have enough space in s.out
|
|
|
|
+ LEAQ (AX)(R13*1), R14
|
|
|
|
+ ADDQ R10, R14
|
|
|
|
+ CMPQ R14, 32(SP)
|
|
|
|
+ JA error_not_enough_space
|
|
|
|
+
|
|
|
|
+ // Copy literals
|
|
|
|
+ TESTQ AX, AX
|
|
|
|
+ JZ check_offset
|
|
|
|
+ XORQ R14, R14
|
|
|
|
+
|
|
|
|
+copy_1:
|
|
|
|
+ MOVUPS (R11)(R14*1), X0
|
|
|
|
+ MOVUPS X0, (R10)(R14*1)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ CMPQ R14, AX
|
|
|
|
+ JB copy_1
|
|
|
|
+ ADDQ AX, R11
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ ADDQ AX, R12
|
|
|
|
+
|
|
|
|
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
|
|
|
|
+check_offset:
|
|
|
|
+ MOVQ R12, AX
|
|
|
|
+ ADDQ 40(SP), AX
|
|
|
|
+ CMPQ CX, AX
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+ CMPQ CX, 56(SP)
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+
|
|
|
|
+ // Copy match from history
|
|
|
|
+ MOVQ CX, AX
|
|
|
|
+ SUBQ R12, AX
|
|
|
|
+ JLS copy_match
|
|
|
|
+ MOVQ 48(SP), R14
|
|
|
|
+ SUBQ AX, R14
|
|
|
|
+ CMPQ R13, AX
|
|
|
|
+ JG copy_all_from_history
|
|
|
|
+ MOVQ R13, AX
|
|
|
|
+ SUBQ $0x10, AX
|
|
|
|
+ JB copy_4_small
|
|
|
|
+
|
|
|
|
+copy_4_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (R10)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, R10
|
|
|
|
+ SUBQ $0x10, AX
|
|
|
|
+ JAE copy_4_loop
|
|
|
|
+ LEAQ 16(R14)(AX*1), R14
|
|
|
|
+ LEAQ 16(R10)(AX*1), R10
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(R10)
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_small:
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ JE copy_4_move_3
|
|
|
|
+ CMPQ R13, $0x08
|
|
|
|
+ JB copy_4_move_4through7
|
|
|
|
+ JMP copy_4_move_8through16
|
|
|
|
+
|
|
|
|
+copy_4_move_3:
|
|
|
|
+ MOVW (R14), AX
|
|
|
|
+ MOVB 2(R14), CL
|
|
|
|
+ MOVW AX, (R10)
|
|
|
|
+ MOVB CL, 2(R10)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_4through7:
|
|
|
|
+ MOVL (R14), AX
|
|
|
|
+ MOVL -4(R14)(R13*1), CX
|
|
|
|
+ MOVL AX, (R10)
|
|
|
|
+ MOVL CX, -4(R10)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_8through16:
|
|
|
|
+ MOVQ (R14), AX
|
|
|
|
+ MOVQ -8(R14)(R13*1), CX
|
|
|
|
+ MOVQ AX, (R10)
|
|
|
|
+ MOVQ CX, -8(R10)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+
|
|
|
|
+copy_4_end:
|
|
|
|
+ ADDQ R13, R12
|
|
|
|
+ JMP handle_loop
|
|
|
|
+ JMP loop_finished
|
|
|
|
+
|
|
|
|
+copy_all_from_history:
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JB copy_5_small
|
|
|
|
+
|
|
|
|
+copy_5_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (R10)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, R10
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JAE copy_5_loop
|
|
|
|
+ LEAQ 16(R14)(R15*1), R14
|
|
|
|
+ LEAQ 16(R10)(R15*1), R10
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(R10)
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_small:
|
|
|
|
+ CMPQ AX, $0x03
|
|
|
|
+ JE copy_5_move_3
|
|
|
|
+ JB copy_5_move_1or2
|
|
|
|
+ CMPQ AX, $0x08
|
|
|
|
+ JB copy_5_move_4through7
|
|
|
|
+ JMP copy_5_move_8through16
|
|
|
|
+
|
|
|
|
+copy_5_move_1or2:
|
|
|
|
+ MOVB (R14), R15
|
|
|
|
+ MOVB -1(R14)(AX*1), BP
|
|
|
|
+ MOVB R15, (R10)
|
|
|
|
+ MOVB BP, -1(R10)(AX*1)
|
|
|
|
+ ADDQ AX, R14
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_3:
|
|
|
|
+ MOVW (R14), R15
|
|
|
|
+ MOVB 2(R14), BP
|
|
|
|
+ MOVW R15, (R10)
|
|
|
|
+ MOVB BP, 2(R10)
|
|
|
|
+ ADDQ AX, R14
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_4through7:
|
|
|
|
+ MOVL (R14), R15
|
|
|
|
+ MOVL -4(R14)(AX*1), BP
|
|
|
|
+ MOVL R15, (R10)
|
|
|
|
+ MOVL BP, -4(R10)(AX*1)
|
|
|
|
+ ADDQ AX, R14
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_8through16:
|
|
|
|
+ MOVQ (R14), R15
|
|
|
|
+ MOVQ -8(R14)(AX*1), BP
|
|
|
|
+ MOVQ R15, (R10)
|
|
|
|
+ MOVQ BP, -8(R10)(AX*1)
|
|
|
|
+ ADDQ AX, R14
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+
|
|
|
|
+copy_5_end:
|
|
|
|
+ ADDQ AX, R12
|
|
|
|
+ SUBQ AX, R13
|
|
|
|
+
|
|
|
|
+ // Copy match from the current buffer
|
|
|
|
+copy_match:
|
|
|
|
+ MOVQ R10, AX
|
|
|
|
+ SUBQ CX, AX
|
|
|
|
+
|
|
|
|
+ // ml <= mo
|
|
|
|
+ CMPQ R13, CX
|
|
|
|
+ JA copy_overlapping_match
|
|
|
|
+
|
|
|
|
+ // Copy non-overlapping match
|
|
|
|
+ ADDQ R13, R12
|
|
|
|
+ MOVQ R10, CX
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+
|
|
|
|
+copy_2:
|
|
|
|
+ MOVUPS (AX), X0
|
|
|
|
+ MOVUPS X0, (CX)
|
|
|
|
+ ADDQ $0x10, AX
|
|
|
|
+ ADDQ $0x10, CX
|
|
|
|
+ SUBQ $0x10, R13
|
|
|
|
+ JHI copy_2
|
|
|
|
+ JMP handle_loop
|
|
|
|
+
|
|
|
|
+ // Copy overlapping match
|
|
|
|
+copy_overlapping_match:
|
|
|
|
+ ADDQ R13, R12
|
|
|
|
+
|
|
|
|
+copy_slow_3:
|
|
|
|
+ MOVB (AX), CL
|
|
|
|
+ MOVB CL, (R10)
|
|
|
|
+ INCQ AX
|
|
|
|
+ INCQ R10
|
|
|
|
+ DECQ R13
|
|
|
|
+ JNZ copy_slow_3
|
|
|
|
+
|
|
|
|
+handle_loop:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ DECQ 96(AX)
|
|
|
|
+ JNS sequenceDecs_decodeSync_amd64_main_loop
|
|
|
|
+
|
|
|
|
+loop_finished:
|
|
|
|
+ MOVQ br+8(FP), AX
|
|
|
|
+ MOVQ DX, 32(AX)
|
|
|
|
+ MOVB BL, 40(AX)
|
|
|
|
+ MOVQ SI, 24(AX)
|
|
|
|
+
|
|
|
|
+ // Update the context
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ R12, 136(AX)
|
|
|
|
+ MOVQ 144(AX), CX
|
|
|
|
+ SUBQ CX, R11
|
|
|
|
+ MOVQ R11, 168(AX)
|
|
|
|
+
|
|
|
|
+ // Return success
|
|
|
|
+ MOVQ $0x00000000, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match length error
|
|
|
|
+sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
|
|
|
|
+ MOVQ 16(SP), AX
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ AX, 216(CX)
|
|
|
|
+ MOVQ $0x00000001, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match too long error
|
|
|
|
+sequenceDecs_decodeSync_amd64_error_match_len_too_big:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 16(SP), CX
|
|
|
|
+ MOVQ CX, 216(AX)
|
|
|
|
+ MOVQ $0x00000002, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match offset too long error
|
|
|
|
+error_match_off_too_big:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 8(SP), CX
|
|
|
|
+ MOVQ CX, 224(AX)
|
|
|
|
+ MOVQ R12, 136(AX)
|
|
|
|
+ MOVQ $0x00000003, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough literals error
|
|
|
|
+error_not_enough_literals:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 24(SP), CX
|
|
|
|
+ MOVQ CX, 208(AX)
|
|
|
|
+ MOVQ $0x00000004, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough output space error
|
|
|
|
+error_not_enough_space:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 24(SP), CX
|
|
|
|
+ MOVQ CX, 208(AX)
|
|
|
|
+ MOVQ 16(SP), CX
|
|
|
|
+ MOVQ CX, 216(AX)
|
|
|
|
+ MOVQ R12, 136(AX)
|
|
|
|
+ MOVQ $0x00000005, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
|
|
|
|
+// Requires: BMI, BMI2, CMOV, SSE
|
|
|
|
+TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
|
|
|
|
+ MOVQ br+8(FP), CX
|
|
|
|
+ MOVQ 32(CX), AX
|
|
|
|
+ MOVBQZX 40(CX), DX
|
|
|
|
+ MOVQ 24(CX), BX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ ADDQ BX, CX
|
|
|
|
+ MOVQ CX, (SP)
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 72(CX), SI
|
|
|
|
+ MOVQ 80(CX), DI
|
|
|
|
+ MOVQ 88(CX), R8
|
|
|
|
+ XORQ R9, R9
|
|
|
|
+ MOVQ R9, 8(SP)
|
|
|
|
+ MOVQ R9, 16(SP)
|
|
|
|
+ MOVQ R9, 24(SP)
|
|
|
|
+ MOVQ 112(CX), R9
|
|
|
|
+ MOVQ 128(CX), R10
|
|
|
|
+ MOVQ R10, 32(SP)
|
|
|
|
+ MOVQ 144(CX), R10
|
|
|
|
+ MOVQ 136(CX), R11
|
|
|
|
+ MOVQ 200(CX), R12
|
|
|
|
+ MOVQ R12, 56(SP)
|
|
|
|
+ MOVQ 176(CX), R12
|
|
|
|
+ MOVQ R12, 48(SP)
|
|
|
|
+ MOVQ 184(CX), CX
|
|
|
|
+ MOVQ CX, 40(SP)
|
|
|
|
+ MOVQ 40(SP), CX
|
|
|
|
+ ADDQ CX, 48(SP)
|
|
|
|
+
|
|
|
|
+ // Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
|
|
|
|
+ ADDQ R9, 32(SP)
|
|
|
|
+
|
|
|
|
+ // outBase += outPosition
|
|
|
|
+ ADDQ R11, R9
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_main_loop:
|
|
|
|
+ MOVQ (SP), R12
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the offset and match length.
|
|
|
|
+ CMPQ BX, $0x08
|
|
|
|
+ JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
|
|
|
|
+ MOVQ DX, CX
|
|
|
|
+ SHRQ $0x03, CX
|
|
|
|
+ SUBQ CX, R12
|
|
|
|
+ MOVQ (R12), AX
|
|
|
|
+ SUBQ CX, BX
|
|
|
|
+ ANDQ $0x07, DX
|
|
|
|
+ JMP sequenceDecs_decodeSync_bmi2_fill_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
|
|
|
|
+ CMPQ BX, $0x00
|
|
|
|
+ JLE sequenceDecs_decodeSync_bmi2_fill_end
|
|
|
|
+ CMPQ DX, $0x07
|
|
|
|
+ JLE sequenceDecs_decodeSync_bmi2_fill_end
|
|
|
|
+ SHLQ $0x08, AX
|
|
|
|
+ SUBQ $0x01, R12
|
|
|
|
+ SUBQ $0x01, BX
|
|
|
|
+ SUBQ $0x08, DX
|
|
|
|
+ MOVBQZX (R12), CX
|
|
|
|
+ ORQ CX, AX
|
|
|
|
+ JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_fill_end:
|
|
|
|
+ // Update offset
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, R8, R13
|
|
|
|
+ MOVQ AX, R14
|
|
|
|
+ LEAQ (DX)(R13*1), CX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ BZHIQ R13, R14, R14
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ MOVQ R8, CX
|
|
|
|
+ SHRQ $0x20, CX
|
|
|
|
+ ADDQ R14, CX
|
|
|
|
+ MOVQ CX, 8(SP)
|
|
|
|
+
|
|
|
|
+ // Update match length
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, DI, R13
|
|
|
|
+ MOVQ AX, R14
|
|
|
|
+ LEAQ (DX)(R13*1), CX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ BZHIQ R13, R14, R14
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ MOVQ DI, CX
|
|
|
|
+ SHRQ $0x20, CX
|
|
|
|
+ ADDQ R14, CX
|
|
|
|
+ MOVQ CX, 16(SP)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the remaining
|
|
|
|
+ CMPQ BX, $0x08
|
|
|
|
+ JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
|
|
|
|
+ MOVQ DX, CX
|
|
|
|
+ SHRQ $0x03, CX
|
|
|
|
+ SUBQ CX, R12
|
|
|
|
+ MOVQ (R12), AX
|
|
|
|
+ SUBQ CX, BX
|
|
|
|
+ ANDQ $0x07, DX
|
|
|
|
+ JMP sequenceDecs_decodeSync_bmi2_fill_2_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
|
|
|
|
+ CMPQ BX, $0x00
|
|
|
|
+ JLE sequenceDecs_decodeSync_bmi2_fill_2_end
|
|
|
|
+ CMPQ DX, $0x07
|
|
|
|
+ JLE sequenceDecs_decodeSync_bmi2_fill_2_end
|
|
|
|
+ SHLQ $0x08, AX
|
|
|
|
+ SUBQ $0x01, R12
|
|
|
|
+ SUBQ $0x01, BX
|
|
|
|
+ SUBQ $0x08, DX
|
|
|
|
+ MOVBQZX (R12), CX
|
|
|
|
+ ORQ CX, AX
|
|
|
|
+ JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_fill_2_end:
|
|
|
|
+ // Update literal length
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, SI, R13
|
|
|
|
+ MOVQ AX, R14
|
|
|
|
+ LEAQ (DX)(R13*1), CX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ BZHIQ R13, R14, R14
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ MOVQ SI, CX
|
|
|
|
+ SHRQ $0x20, CX
|
|
|
|
+ ADDQ R14, CX
|
|
|
|
+ MOVQ CX, 24(SP)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader for state updates
|
|
|
|
+ MOVQ R12, (SP)
|
|
|
|
+ MOVQ $0x00000808, CX
|
|
|
|
+ BEXTRQ CX, R8, R12
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ CMPQ 96(CX), $0x00
|
|
|
|
+ JZ sequenceDecs_decodeSync_bmi2_skip_update
|
|
|
|
+ LEAQ (SI)(DI*1), R13
|
|
|
|
+ ADDQ R8, R13
|
|
|
|
+ MOVBQZX R13, R13
|
|
|
|
+ LEAQ (DX)(R13*1), CX
|
|
|
|
+ MOVQ AX, R14
|
|
|
|
+ MOVQ CX, DX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ BZHIQ R13, R14, R14
|
|
|
|
+
|
|
|
|
+ // Update Offset State
|
|
|
|
+ BZHIQ R8, R14, CX
|
|
|
|
+ SHRXQ R8, R14, R14
|
|
|
|
+ MOVQ $0x00001010, R13
|
|
|
|
+ BEXTRQ R13, R8, R8
|
|
|
|
+ ADDQ CX, R8
|
|
|
|
+
|
|
|
|
+ // Load ctx.ofTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 48(CX), CX
|
|
|
|
+ MOVQ (CX)(R8*8), R8
|
|
|
|
+
|
|
|
|
+ // Update Match Length State
|
|
|
|
+ BZHIQ DI, R14, CX
|
|
|
|
+ SHRXQ DI, R14, R14
|
|
|
|
+ MOVQ $0x00001010, R13
|
|
|
|
+ BEXTRQ R13, DI, DI
|
|
|
|
+ ADDQ CX, DI
|
|
|
|
+
|
|
|
|
+ // Load ctx.mlTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 24(CX), CX
|
|
|
|
+ MOVQ (CX)(DI*8), DI
|
|
|
|
+
|
|
|
|
+ // Update Literal Length State
|
|
|
|
+ BZHIQ SI, R14, CX
|
|
|
|
+ MOVQ $0x00001010, R13
|
|
|
|
+ BEXTRQ R13, SI, SI
|
|
|
|
+ ADDQ CX, SI
|
|
|
|
+
|
|
|
|
+ // Load ctx.llTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ MOVQ (CX)(SI*8), SI
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_skip_update:
|
|
|
|
+ // Adjust offset
|
|
|
|
+ MOVQ s+0(FP), CX
|
|
|
|
+ MOVQ 8(SP), R13
|
|
|
|
+ CMPQ R12, $0x01
|
|
|
|
+ JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
|
|
|
|
+ MOVUPS 144(CX), X0
|
|
|
|
+ MOVQ R13, 144(CX)
|
|
|
|
+ MOVUPS X0, 152(CX)
|
|
|
|
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
|
|
|
|
+ CMPQ 24(SP), $0x00000000
|
|
|
|
+ JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
|
|
|
|
+ INCQ R13
|
|
|
|
+ JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
|
|
|
|
+ MOVQ 144(CX), R13
|
|
|
|
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
|
|
|
|
+ MOVQ R13, R12
|
|
|
|
+ XORQ R14, R14
|
|
|
|
+ MOVQ $-1, R15
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ CMOVQEQ R14, R12
|
|
|
|
+ CMOVQEQ R15, R14
|
|
|
|
+ ADDQ 144(CX)(R12*8), R14
|
|
|
|
+ JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
|
|
|
|
+ MOVQ $0x00000001, R14
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
|
|
|
|
+ CMPQ R13, $0x01
|
|
|
|
+ JZ sequenceDecs_decodeSync_bmi2_adjust_skip
|
|
|
|
+ MOVQ 152(CX), R12
|
|
|
|
+ MOVQ R12, 160(CX)
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_adjust_skip:
|
|
|
|
+ MOVQ 144(CX), R12
|
|
|
|
+ MOVQ R12, 152(CX)
|
|
|
|
+ MOVQ R14, 144(CX)
|
|
|
|
+ MOVQ R14, R13
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_after_adjust:
|
|
|
|
+ MOVQ R13, 8(SP)
|
|
|
|
+
|
|
|
|
+ // Check values
|
|
|
|
+ MOVQ 16(SP), CX
|
|
|
|
+ MOVQ 24(SP), R12
|
|
|
|
+ LEAQ (CX)(R12*1), R14
|
|
|
|
+ MOVQ s+0(FP), R15
|
|
|
|
+ ADDQ R14, 256(R15)
|
|
|
|
+ MOVQ ctx+16(FP), R14
|
|
|
|
+ SUBQ R12, 104(R14)
|
|
|
|
+ JS error_not_enough_literals
|
|
|
|
+ CMPQ CX, $0x00020002
|
|
|
|
+ JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
|
|
|
|
+ MOVQ 24(SP), CX
|
|
|
|
+ MOVQ 8(SP), R12
|
|
|
|
+ MOVQ 16(SP), R13
|
|
|
|
+
|
|
|
|
+ // Check if we have enough space in s.out
|
|
|
|
+ LEAQ (CX)(R13*1), R14
|
|
|
|
+ ADDQ R9, R14
|
|
|
|
+ CMPQ R14, 32(SP)
|
|
|
|
+ JA error_not_enough_space
|
|
|
|
+
|
|
|
|
+ // Copy literals
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ check_offset
|
|
|
|
+ XORQ R14, R14
|
|
|
|
+
|
|
|
|
+copy_1:
|
|
|
|
+ MOVUPS (R10)(R14*1), X0
|
|
|
|
+ MOVUPS X0, (R9)(R14*1)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ CMPQ R14, CX
|
|
|
|
+ JB copy_1
|
|
|
|
+ ADDQ CX, R10
|
|
|
|
+ ADDQ CX, R9
|
|
|
|
+ ADDQ CX, R11
|
|
|
|
+
|
|
|
|
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
|
|
|
|
+check_offset:
|
|
|
|
+ MOVQ R11, CX
|
|
|
|
+ ADDQ 40(SP), CX
|
|
|
|
+ CMPQ R12, CX
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+ CMPQ R12, 56(SP)
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+
|
|
|
|
+ // Copy match from history
|
|
|
|
+ MOVQ R12, CX
|
|
|
|
+ SUBQ R11, CX
|
|
|
|
+ JLS copy_match
|
|
|
|
+ MOVQ 48(SP), R14
|
|
|
|
+ SUBQ CX, R14
|
|
|
|
+ CMPQ R13, CX
|
|
|
|
+ JG copy_all_from_history
|
|
|
|
+ MOVQ R13, CX
|
|
|
|
+ SUBQ $0x10, CX
|
|
|
|
+ JB copy_4_small
|
|
|
|
+
|
|
|
|
+copy_4_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (R9)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, R9
|
|
|
|
+ SUBQ $0x10, CX
|
|
|
|
+ JAE copy_4_loop
|
|
|
|
+ LEAQ 16(R14)(CX*1), R14
|
|
|
|
+ LEAQ 16(R9)(CX*1), R9
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(R9)
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_small:
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ JE copy_4_move_3
|
|
|
|
+ CMPQ R13, $0x08
|
|
|
|
+ JB copy_4_move_4through7
|
|
|
|
+ JMP copy_4_move_8through16
|
|
|
|
+
|
|
|
|
+copy_4_move_3:
|
|
|
|
+ MOVW (R14), CX
|
|
|
|
+ MOVB 2(R14), R12
|
|
|
|
+ MOVW CX, (R9)
|
|
|
|
+ MOVB R12, 2(R9)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, R9
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_4through7:
|
|
|
|
+ MOVL (R14), CX
|
|
|
|
+ MOVL -4(R14)(R13*1), R12
|
|
|
|
+ MOVL CX, (R9)
|
|
|
|
+ MOVL R12, -4(R9)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, R9
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_8through16:
|
|
|
|
+ MOVQ (R14), CX
|
|
|
|
+ MOVQ -8(R14)(R13*1), R12
|
|
|
|
+ MOVQ CX, (R9)
|
|
|
|
+ MOVQ R12, -8(R9)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, R9
|
|
|
|
+
|
|
|
|
+copy_4_end:
|
|
|
|
+ ADDQ R13, R11
|
|
|
|
+ JMP handle_loop
|
|
|
|
+ JMP loop_finished
|
|
|
|
+
|
|
|
|
+copy_all_from_history:
|
|
|
|
+ MOVQ CX, R15
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JB copy_5_small
|
|
|
|
+
|
|
|
|
+copy_5_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (R9)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, R9
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JAE copy_5_loop
|
|
|
|
+ LEAQ 16(R14)(R15*1), R14
|
|
|
|
+ LEAQ 16(R9)(R15*1), R9
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(R9)
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_small:
|
|
|
|
+ CMPQ CX, $0x03
|
|
|
|
+ JE copy_5_move_3
|
|
|
|
+ JB copy_5_move_1or2
|
|
|
|
+ CMPQ CX, $0x08
|
|
|
|
+ JB copy_5_move_4through7
|
|
|
|
+ JMP copy_5_move_8through16
|
|
|
|
+
|
|
|
|
+copy_5_move_1or2:
|
|
|
|
+ MOVB (R14), R15
|
|
|
|
+ MOVB -1(R14)(CX*1), BP
|
|
|
|
+ MOVB R15, (R9)
|
|
|
|
+ MOVB BP, -1(R9)(CX*1)
|
|
|
|
+ ADDQ CX, R14
|
|
|
|
+ ADDQ CX, R9
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_3:
|
|
|
|
+ MOVW (R14), R15
|
|
|
|
+ MOVB 2(R14), BP
|
|
|
|
+ MOVW R15, (R9)
|
|
|
|
+ MOVB BP, 2(R9)
|
|
|
|
+ ADDQ CX, R14
|
|
|
|
+ ADDQ CX, R9
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_4through7:
|
|
|
|
+ MOVL (R14), R15
|
|
|
|
+ MOVL -4(R14)(CX*1), BP
|
|
|
|
+ MOVL R15, (R9)
|
|
|
|
+ MOVL BP, -4(R9)(CX*1)
|
|
|
|
+ ADDQ CX, R14
|
|
|
|
+ ADDQ CX, R9
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_8through16:
|
|
|
|
+ MOVQ (R14), R15
|
|
|
|
+ MOVQ -8(R14)(CX*1), BP
|
|
|
|
+ MOVQ R15, (R9)
|
|
|
|
+ MOVQ BP, -8(R9)(CX*1)
|
|
|
|
+ ADDQ CX, R14
|
|
|
|
+ ADDQ CX, R9
|
|
|
|
+
|
|
|
|
+copy_5_end:
|
|
|
|
+ ADDQ CX, R11
|
|
|
|
+ SUBQ CX, R13
|
|
|
|
+
|
|
|
|
+ // Copy match from the current buffer
|
|
|
|
+copy_match:
|
|
|
|
+ MOVQ R9, CX
|
|
|
|
+ SUBQ R12, CX
|
|
|
|
+
|
|
|
|
+ // ml <= mo
|
|
|
|
+ CMPQ R13, R12
|
|
|
|
+ JA copy_overlapping_match
|
|
|
|
+
|
|
|
|
+ // Copy non-overlapping match
|
|
|
|
+ ADDQ R13, R11
|
|
|
|
+ MOVQ R9, R12
|
|
|
|
+ ADDQ R13, R9
|
|
|
|
+
|
|
|
|
+copy_2:
|
|
|
|
+ MOVUPS (CX), X0
|
|
|
|
+ MOVUPS X0, (R12)
|
|
|
|
+ ADDQ $0x10, CX
|
|
|
|
+ ADDQ $0x10, R12
|
|
|
|
+ SUBQ $0x10, R13
|
|
|
|
+ JHI copy_2
|
|
|
|
+ JMP handle_loop
|
|
|
|
+
|
|
|
|
+ // Copy overlapping match
|
|
|
|
+copy_overlapping_match:
|
|
|
|
+ ADDQ R13, R11
|
|
|
|
+
|
|
|
|
+copy_slow_3:
|
|
|
|
+ MOVB (CX), R12
|
|
|
|
+ MOVB R12, (R9)
|
|
|
|
+ INCQ CX
|
|
|
|
+ INCQ R9
|
|
|
|
+ DECQ R13
|
|
|
|
+ JNZ copy_slow_3
|
|
|
|
+
|
|
|
|
+handle_loop:
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ DECQ 96(CX)
|
|
|
|
+ JNS sequenceDecs_decodeSync_bmi2_main_loop
|
|
|
|
+
|
|
|
|
+loop_finished:
|
|
|
|
+ MOVQ br+8(FP), CX
|
|
|
|
+ MOVQ AX, 32(CX)
|
|
|
|
+ MOVB DL, 40(CX)
|
|
|
|
+ MOVQ BX, 24(CX)
|
|
|
|
+
|
|
|
|
+ // Update the context
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ R11, 136(AX)
|
|
|
|
+ MOVQ 144(AX), CX
|
|
|
|
+ SUBQ CX, R10
|
|
|
|
+ MOVQ R10, 168(AX)
|
|
|
|
+
|
|
|
|
+ // Return success
|
|
|
|
+ MOVQ $0x00000000, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match length error
|
|
|
|
+sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
|
|
|
|
+ MOVQ 16(SP), AX
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ AX, 216(CX)
|
|
|
|
+ MOVQ $0x00000001, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match too long error
|
|
|
|
+sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 16(SP), CX
|
|
|
|
+ MOVQ CX, 216(AX)
|
|
|
|
+ MOVQ $0x00000002, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match offset too long error
|
|
|
|
+error_match_off_too_big:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 8(SP), CX
|
|
|
|
+ MOVQ CX, 224(AX)
|
|
|
|
+ MOVQ R11, 136(AX)
|
|
|
|
+ MOVQ $0x00000003, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough literals error
|
|
|
|
+error_not_enough_literals:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 24(SP), CX
|
|
|
|
+ MOVQ CX, 208(AX)
|
|
|
|
+ MOVQ $0x00000004, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough output space error
|
|
|
|
+error_not_enough_space:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 24(SP), CX
|
|
|
|
+ MOVQ CX, 208(AX)
|
|
|
|
+ MOVQ 16(SP), CX
|
|
|
|
+ MOVQ CX, 216(AX)
|
|
|
|
+ MOVQ R11, 136(AX)
|
|
|
|
+ MOVQ $0x00000005, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
|
|
|
|
+// Requires: CMOV, SSE
|
|
|
|
+TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
|
|
|
|
+ MOVQ br+8(FP), AX
|
|
|
|
+ MOVQ 32(AX), DX
|
|
|
|
+ MOVBQZX 40(AX), BX
|
|
|
|
+ MOVQ 24(AX), SI
|
|
|
|
+ MOVQ (AX), AX
|
|
|
|
+ ADDQ SI, AX
|
|
|
|
+ MOVQ AX, (SP)
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 72(AX), DI
|
|
|
|
+ MOVQ 80(AX), R8
|
|
|
|
+ MOVQ 88(AX), R9
|
|
|
|
+ XORQ CX, CX
|
|
|
|
+ MOVQ CX, 8(SP)
|
|
|
|
+ MOVQ CX, 16(SP)
|
|
|
|
+ MOVQ CX, 24(SP)
|
|
|
|
+ MOVQ 112(AX), R10
|
|
|
|
+ MOVQ 128(AX), CX
|
|
|
|
+ MOVQ CX, 32(SP)
|
|
|
|
+ MOVQ 144(AX), R11
|
|
|
|
+ MOVQ 136(AX), R12
|
|
|
|
+ MOVQ 200(AX), CX
|
|
|
|
+ MOVQ CX, 56(SP)
|
|
|
|
+ MOVQ 176(AX), CX
|
|
|
|
+ MOVQ CX, 48(SP)
|
|
|
|
+ MOVQ 184(AX), AX
|
|
|
|
+ MOVQ AX, 40(SP)
|
|
|
|
+ MOVQ 40(SP), AX
|
|
|
|
+ ADDQ AX, 48(SP)
|
|
|
|
+
|
|
|
|
+ // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
|
|
|
|
+ ADDQ R10, 32(SP)
|
|
|
|
+
|
|
|
|
+ // outBase += outPosition
|
|
|
|
+ ADDQ R12, R10
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_main_loop:
|
|
|
|
+ MOVQ (SP), R13
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the offset and match length.
|
|
|
|
+ CMPQ SI, $0x08
|
|
|
|
+ JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
|
|
|
|
+ MOVQ BX, AX
|
|
|
|
+ SHRQ $0x03, AX
|
|
|
|
+ SUBQ AX, R13
|
|
|
|
+ MOVQ (R13), DX
|
|
|
|
+ SUBQ AX, SI
|
|
|
|
+ ANDQ $0x07, BX
|
|
|
|
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
|
|
|
|
+ CMPQ SI, $0x00
|
|
|
|
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_end
|
|
|
|
+ CMPQ BX, $0x07
|
|
|
|
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_end
|
|
|
|
+ SHLQ $0x08, DX
|
|
|
|
+ SUBQ $0x01, R13
|
|
|
|
+ SUBQ $0x01, SI
|
|
|
|
+ SUBQ $0x08, BX
|
|
|
|
+ MOVBQZX (R13), AX
|
|
|
|
+ ORQ AX, DX
|
|
|
|
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_fill_end:
|
|
|
|
+ // Update offset
|
|
|
|
+ MOVQ R9, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ SHLQ CL, R14
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R14
|
|
|
|
+ ADDQ R14, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_of_update_zero:
|
|
|
|
+ MOVQ AX, 8(SP)
|
|
|
|
+
|
|
|
|
+ // Update match length
|
|
|
|
+ MOVQ R8, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ SHLQ CL, R14
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R14
|
|
|
|
+ ADDQ R14, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
|
|
|
|
+ MOVQ AX, 16(SP)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader to have enough for the remaining
|
|
|
|
+ CMPQ SI, $0x08
|
|
|
|
+ JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
|
|
|
|
+ MOVQ BX, AX
|
|
|
|
+ SHRQ $0x03, AX
|
|
|
|
+ SUBQ AX, R13
|
|
|
|
+ MOVQ (R13), DX
|
|
|
|
+ SUBQ AX, SI
|
|
|
|
+ ANDQ $0x07, BX
|
|
|
|
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
|
|
|
|
+ CMPQ SI, $0x00
|
|
|
|
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
|
|
|
|
+ CMPQ BX, $0x07
|
|
|
|
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
|
|
|
|
+ SHLQ $0x08, DX
|
|
|
|
+ SUBQ $0x01, R13
|
|
|
|
+ SUBQ $0x01, SI
|
|
|
|
+ SUBQ $0x08, BX
|
|
|
|
+ MOVBQZX (R13), AX
|
|
|
|
+ ORQ AX, DX
|
|
|
|
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_fill_2_end:
|
|
|
|
+ // Update literal length
|
|
|
|
+ MOVQ DI, AX
|
|
|
|
+ MOVQ BX, CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ SHLQ CL, R14
|
|
|
|
+ MOVB AH, CL
|
|
|
|
+ SHRQ $0x20, AX
|
|
|
|
+ TESTQ CX, CX
|
|
|
|
+ JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
|
|
|
|
+ ADDQ CX, BX
|
|
|
|
+ CMPQ BX, $0x40
|
|
|
|
+ JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
|
|
|
|
+ CMPQ CX, $0x40
|
|
|
|
+ JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
|
|
|
|
+ NEGQ CX
|
|
|
|
+ SHRQ CL, R14
|
|
|
|
+ ADDQ R14, AX
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
|
|
|
|
+ MOVQ AX, 24(SP)
|
|
|
|
+
|
|
|
|
+ // Fill bitreader for state updates
|
|
|
|
+ MOVQ R13, (SP)
|
|
|
|
+ MOVQ R9, AX
|
|
|
|
+ SHRQ $0x08, AX
|
|
|
|
+ MOVBQZX AL, AX
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ CMPQ 96(CX), $0x00
|
|
|
|
+ JZ sequenceDecs_decodeSync_safe_amd64_skip_update
|
|
|
|
+
|
|
|
|
+ // Update Literal Length State
|
|
|
|
+ MOVBQZX DI, R13
|
|
|
|
+ SHRQ $0x10, DI
|
|
|
|
+ MOVWQZX DI, DI
|
|
|
|
+ LEAQ (BX)(R13*1), CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ MOVL $0x00000001, R15
|
|
|
|
+ MOVB R13, CL
|
|
|
|
+ SHLL CL, R15
|
|
|
|
+ DECL R15
|
|
|
|
+ ANDQ R15, R14
|
|
|
|
+ ADDQ R14, DI
|
|
|
|
+
|
|
|
|
+ // Load ctx.llTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ (CX), CX
|
|
|
|
+ MOVQ (CX)(DI*8), DI
|
|
|
|
+
|
|
|
|
+ // Update Match Length State
|
|
|
|
+ MOVBQZX R8, R13
|
|
|
|
+ SHRQ $0x10, R8
|
|
|
|
+ MOVWQZX R8, R8
|
|
|
|
+ LEAQ (BX)(R13*1), CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ MOVL $0x00000001, R15
|
|
|
|
+ MOVB R13, CL
|
|
|
|
+ SHLL CL, R15
|
|
|
|
+ DECL R15
|
|
|
|
+ ANDQ R15, R14
|
|
|
|
+ ADDQ R14, R8
|
|
|
|
+
|
|
|
|
+ // Load ctx.mlTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 24(CX), CX
|
|
|
|
+ MOVQ (CX)(R8*8), R8
|
|
|
|
+
|
|
|
|
+ // Update Offset State
|
|
|
|
+ MOVBQZX R9, R13
|
|
|
|
+ SHRQ $0x10, R9
|
|
|
|
+ MOVWQZX R9, R9
|
|
|
|
+ LEAQ (BX)(R13*1), CX
|
|
|
|
+ MOVQ DX, R14
|
|
|
|
+ MOVQ CX, BX
|
|
|
|
+ ROLQ CL, R14
|
|
|
|
+ MOVL $0x00000001, R15
|
|
|
|
+ MOVB R13, CL
|
|
|
|
+ SHLL CL, R15
|
|
|
|
+ DECL R15
|
|
|
|
+ ANDQ R15, R14
|
|
|
|
+ ADDQ R14, R9
|
|
|
|
+
|
|
|
|
+ // Load ctx.ofTable
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ 48(CX), CX
|
|
|
|
+ MOVQ (CX)(R9*8), R9
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_skip_update:
|
|
|
|
+ // Adjust offset
|
|
|
|
+ MOVQ s+0(FP), CX
|
|
|
|
+ MOVQ 8(SP), R13
|
|
|
|
+ CMPQ AX, $0x01
|
|
|
|
+ JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
|
|
|
|
+ MOVUPS 144(CX), X0
|
|
|
|
+ MOVQ R13, 144(CX)
|
|
|
|
+ MOVUPS X0, 152(CX)
|
|
|
|
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
|
|
|
|
+ CMPQ 24(SP), $0x00000000
|
|
|
|
+ JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
|
|
|
|
+ INCQ R13
|
|
|
|
+ JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
|
|
|
|
+ MOVQ 144(CX), R13
|
|
|
|
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
|
|
|
|
+ MOVQ R13, AX
|
|
|
|
+ XORQ R14, R14
|
|
|
|
+ MOVQ $-1, R15
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ CMOVQEQ R14, AX
|
|
|
|
+ CMOVQEQ R15, R14
|
|
|
|
+ ADDQ 144(CX)(AX*8), R14
|
|
|
|
+ JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
|
|
|
|
+ MOVQ $0x00000001, R14
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
|
|
|
|
+ CMPQ R13, $0x01
|
|
|
|
+ JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
|
|
|
|
+ MOVQ 152(CX), AX
|
|
|
|
+ MOVQ AX, 160(CX)
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_adjust_skip:
|
|
|
|
+ MOVQ 144(CX), AX
|
|
|
|
+ MOVQ AX, 152(CX)
|
|
|
|
+ MOVQ R14, 144(CX)
|
|
|
|
+ MOVQ R14, R13
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_after_adjust:
|
|
|
|
+ MOVQ R13, 8(SP)
|
|
|
|
+
|
|
|
|
+ // Check values
|
|
|
|
+ MOVQ 16(SP), AX
|
|
|
|
+ MOVQ 24(SP), CX
|
|
|
|
+ LEAQ (AX)(CX*1), R14
|
|
|
|
+ MOVQ s+0(FP), R15
|
|
|
|
+ ADDQ R14, 256(R15)
|
|
|
|
+ MOVQ ctx+16(FP), R14
|
|
|
|
+ SUBQ CX, 104(R14)
|
|
|
|
+ JS error_not_enough_literals
|
|
|
|
+ CMPQ AX, $0x00020002
|
|
|
|
+ JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
|
|
|
|
+ TESTQ R13, R13
|
|
|
|
+ JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
|
|
|
|
+ TESTQ AX, AX
|
|
|
|
+ JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
|
|
|
|
+
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
|
|
|
|
+ MOVQ 24(SP), AX
|
|
|
|
+ MOVQ 8(SP), CX
|
|
|
|
+ MOVQ 16(SP), R13
|
|
|
|
+
|
|
|
|
+ // Check if we have enough space in s.out
|
|
|
|
+ LEAQ (AX)(R13*1), R14
|
|
|
|
+ ADDQ R10, R14
|
|
|
|
+ CMPQ R14, 32(SP)
|
|
|
|
+ JA error_not_enough_space
|
|
|
|
+
|
|
|
|
+ // Copy literals
|
|
|
|
+ TESTQ AX, AX
|
|
|
|
+ JZ check_offset
|
|
|
|
+ MOVQ AX, R14
|
|
|
|
+ SUBQ $0x10, R14
|
|
|
|
+ JB copy_1_small
|
|
|
|
+
|
|
|
|
+copy_1_loop:
|
|
|
|
+ MOVUPS (R11), X0
|
|
|
|
+ MOVUPS X0, (R10)
|
|
|
|
+ ADDQ $0x10, R11
|
|
|
|
+ ADDQ $0x10, R10
|
|
|
|
+ SUBQ $0x10, R14
|
|
|
|
+ JAE copy_1_loop
|
|
|
|
+ LEAQ 16(R11)(R14*1), R11
|
|
|
|
+ LEAQ 16(R10)(R14*1), R10
|
|
|
|
+ MOVUPS -16(R11), X0
|
|
|
|
+ MOVUPS X0, -16(R10)
|
|
|
|
+ JMP copy_1_end
|
|
|
|
+
|
|
|
|
+copy_1_small:
|
|
|
|
+ CMPQ AX, $0x03
|
|
|
|
+ JE copy_1_move_3
|
|
|
|
+ JB copy_1_move_1or2
|
|
|
|
+ CMPQ AX, $0x08
|
|
|
|
+ JB copy_1_move_4through7
|
|
|
|
+ JMP copy_1_move_8through16
|
|
|
|
+
|
|
|
|
+copy_1_move_1or2:
|
|
|
|
+ MOVB (R11), R14
|
|
|
|
+ MOVB -1(R11)(AX*1), R15
|
|
|
|
+ MOVB R14, (R10)
|
|
|
|
+ MOVB R15, -1(R10)(AX*1)
|
|
|
|
+ ADDQ AX, R11
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ JMP copy_1_end
|
|
|
|
+
|
|
|
|
+copy_1_move_3:
|
|
|
|
+ MOVW (R11), R14
|
|
|
|
+ MOVB 2(R11), R15
|
|
|
|
+ MOVW R14, (R10)
|
|
|
|
+ MOVB R15, 2(R10)
|
|
|
|
+ ADDQ AX, R11
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ JMP copy_1_end
|
|
|
|
+
|
|
|
|
+copy_1_move_4through7:
|
|
|
|
+ MOVL (R11), R14
|
|
|
|
+ MOVL -4(R11)(AX*1), R15
|
|
|
|
+ MOVL R14, (R10)
|
|
|
|
+ MOVL R15, -4(R10)(AX*1)
|
|
|
|
+ ADDQ AX, R11
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ JMP copy_1_end
|
|
|
|
+
|
|
|
|
+copy_1_move_8through16:
|
|
|
|
+ MOVQ (R11), R14
|
|
|
|
+ MOVQ -8(R11)(AX*1), R15
|
|
|
|
+ MOVQ R14, (R10)
|
|
|
|
+ MOVQ R15, -8(R10)(AX*1)
|
|
|
|
+ ADDQ AX, R11
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+
|
|
|
|
+copy_1_end:
|
|
|
|
+ ADDQ AX, R12
|
|
|
|
+
|
|
|
|
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
|
|
|
|
+check_offset:
|
|
|
|
+ MOVQ R12, AX
|
|
|
|
+ ADDQ 40(SP), AX
|
|
|
|
+ CMPQ CX, AX
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+ CMPQ CX, 56(SP)
|
|
|
|
+ JG error_match_off_too_big
|
|
|
|
+
|
|
|
|
+ // Copy match from history
|
|
|
|
+ MOVQ CX, AX
|
|
|
|
+ SUBQ R12, AX
|
|
|
|
+ JLS copy_match
|
|
|
|
+ MOVQ 48(SP), R14
|
|
|
|
+ SUBQ AX, R14
|
|
|
|
+ CMPQ R13, AX
|
|
|
|
+ JG copy_all_from_history
|
|
|
|
+ MOVQ R13, AX
|
|
|
|
+ SUBQ $0x10, AX
|
|
|
|
+ JB copy_4_small
|
|
|
|
+
|
|
|
|
+copy_4_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (R10)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, R10
|
|
|
|
+ SUBQ $0x10, AX
|
|
|
|
+ JAE copy_4_loop
|
|
|
|
+ LEAQ 16(R14)(AX*1), R14
|
|
|
|
+ LEAQ 16(R10)(AX*1), R10
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(R10)
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_small:
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ JE copy_4_move_3
|
|
|
|
+ CMPQ R13, $0x08
|
|
|
|
+ JB copy_4_move_4through7
|
|
|
|
+ JMP copy_4_move_8through16
|
|
|
|
+
|
|
|
|
+copy_4_move_3:
|
|
|
|
+ MOVW (R14), AX
|
|
|
|
+ MOVB 2(R14), CL
|
|
|
|
+ MOVW AX, (R10)
|
|
|
|
+ MOVB CL, 2(R10)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_4through7:
|
|
|
|
+ MOVL (R14), AX
|
|
|
|
+ MOVL -4(R14)(R13*1), CX
|
|
|
|
+ MOVL AX, (R10)
|
|
|
|
+ MOVL CX, -4(R10)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+ JMP copy_4_end
|
|
|
|
+
|
|
|
|
+copy_4_move_8through16:
|
|
|
|
+ MOVQ (R14), AX
|
|
|
|
+ MOVQ -8(R14)(R13*1), CX
|
|
|
|
+ MOVQ AX, (R10)
|
|
|
|
+ MOVQ CX, -8(R10)(R13*1)
|
|
|
|
+ ADDQ R13, R14
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+
|
|
|
|
+copy_4_end:
|
|
|
|
+ ADDQ R13, R12
|
|
|
|
+ JMP handle_loop
|
|
|
|
+ JMP loop_finished
|
|
|
|
+
|
|
|
|
+copy_all_from_history:
|
|
|
|
+ MOVQ AX, R15
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JB copy_5_small
|
|
|
|
+
|
|
|
|
+copy_5_loop:
|
|
|
|
+ MOVUPS (R14), X0
|
|
|
|
+ MOVUPS X0, (R10)
|
|
|
|
+ ADDQ $0x10, R14
|
|
|
|
+ ADDQ $0x10, R10
|
|
|
|
+ SUBQ $0x10, R15
|
|
|
|
+ JAE copy_5_loop
|
|
|
|
+ LEAQ 16(R14)(R15*1), R14
|
|
|
|
+ LEAQ 16(R10)(R15*1), R10
|
|
|
|
+ MOVUPS -16(R14), X0
|
|
|
|
+ MOVUPS X0, -16(R10)
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_small:
|
|
|
|
+ CMPQ AX, $0x03
|
|
|
|
+ JE copy_5_move_3
|
|
|
|
+ JB copy_5_move_1or2
|
|
|
|
+ CMPQ AX, $0x08
|
|
|
|
+ JB copy_5_move_4through7
|
|
|
|
+ JMP copy_5_move_8through16
|
|
|
|
+
|
|
|
|
+copy_5_move_1or2:
|
|
|
|
+ MOVB (R14), R15
|
|
|
|
+ MOVB -1(R14)(AX*1), BP
|
|
|
|
+ MOVB R15, (R10)
|
|
|
|
+ MOVB BP, -1(R10)(AX*1)
|
|
|
|
+ ADDQ AX, R14
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_3:
|
|
|
|
+ MOVW (R14), R15
|
|
|
|
+ MOVB 2(R14), BP
|
|
|
|
+ MOVW R15, (R10)
|
|
|
|
+ MOVB BP, 2(R10)
|
|
|
|
+ ADDQ AX, R14
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_4through7:
|
|
|
|
+ MOVL (R14), R15
|
|
|
|
+ MOVL -4(R14)(AX*1), BP
|
|
|
|
+ MOVL R15, (R10)
|
|
|
|
+ MOVL BP, -4(R10)(AX*1)
|
|
|
|
+ ADDQ AX, R14
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+ JMP copy_5_end
|
|
|
|
+
|
|
|
|
+copy_5_move_8through16:
|
|
|
|
+ MOVQ (R14), R15
|
|
|
|
+ MOVQ -8(R14)(AX*1), BP
|
|
|
|
+ MOVQ R15, (R10)
|
|
|
|
+ MOVQ BP, -8(R10)(AX*1)
|
|
|
|
+ ADDQ AX, R14
|
|
|
|
+ ADDQ AX, R10
|
|
|
|
+
|
|
|
|
+copy_5_end:
|
|
|
|
+ ADDQ AX, R12
|
|
|
|
+ SUBQ AX, R13
|
|
|
|
+
|
|
|
|
+ // Copy match from the current buffer
|
|
|
|
+copy_match:
|
|
|
|
+ MOVQ R10, AX
|
|
|
|
+ SUBQ CX, AX
|
|
|
|
+
|
|
|
|
+ // ml <= mo
|
|
|
|
+ CMPQ R13, CX
|
|
|
|
+ JA copy_overlapping_match
|
|
|
|
+
|
|
|
|
+ // Copy non-overlapping match
|
|
|
|
+ ADDQ R13, R12
|
|
|
|
+ MOVQ R13, CX
|
|
|
|
+ SUBQ $0x10, CX
|
|
|
|
+ JB copy_2_small
|
|
|
|
+
|
|
|
|
+copy_2_loop:
|
|
|
|
+ MOVUPS (AX), X0
|
|
|
|
+ MOVUPS X0, (R10)
|
|
|
|
+ ADDQ $0x10, AX
|
|
|
|
+ ADDQ $0x10, R10
|
|
|
|
+ SUBQ $0x10, CX
|
|
|
|
+ JAE copy_2_loop
|
|
|
|
+ LEAQ 16(AX)(CX*1), AX
|
|
|
|
+ LEAQ 16(R10)(CX*1), R10
|
|
|
|
+ MOVUPS -16(AX), X0
|
|
|
|
+ MOVUPS X0, -16(R10)
|
|
|
|
+ JMP copy_2_end
|
|
|
|
+
|
|
|
|
+copy_2_small:
|
|
|
|
+ CMPQ R13, $0x03
|
|
|
|
+ JE copy_2_move_3
|
|
|
|
+ JB copy_2_move_1or2
|
|
|
|
+ CMPQ R13, $0x08
|
|
|
|
+ JB copy_2_move_4through7
|
|
|
|
+ JMP copy_2_move_8through16
|
|
|
|
+
|
|
|
|
+copy_2_move_1or2:
|
|
|
|
+ MOVB (AX), CL
|
|
|
|
+ MOVB -1(AX)(R13*1), R14
|
|
|
|
+ MOVB CL, (R10)
|
|
|
|
+ MOVB R14, -1(R10)(R13*1)
|
|
|
|
+ ADDQ R13, AX
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+ JMP copy_2_end
|
|
|
|
+
|
|
|
|
+copy_2_move_3:
|
|
|
|
+ MOVW (AX), CX
|
|
|
|
+ MOVB 2(AX), R14
|
|
|
|
+ MOVW CX, (R10)
|
|
|
|
+ MOVB R14, 2(R10)
|
|
|
|
+ ADDQ R13, AX
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+ JMP copy_2_end
|
|
|
|
+
|
|
|
|
+copy_2_move_4through7:
|
|
|
|
+ MOVL (AX), CX
|
|
|
|
+ MOVL -4(AX)(R13*1), R14
|
|
|
|
+ MOVL CX, (R10)
|
|
|
|
+ MOVL R14, -4(R10)(R13*1)
|
|
|
|
+ ADDQ R13, AX
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+ JMP copy_2_end
|
|
|
|
+
|
|
|
|
+copy_2_move_8through16:
|
|
|
|
+ MOVQ (AX), CX
|
|
|
|
+ MOVQ -8(AX)(R13*1), R14
|
|
|
|
+ MOVQ CX, (R10)
|
|
|
|
+ MOVQ R14, -8(R10)(R13*1)
|
|
|
|
+ ADDQ R13, AX
|
|
|
|
+ ADDQ R13, R10
|
|
|
|
+
|
|
|
|
+copy_2_end:
|
|
|
|
+ JMP handle_loop
|
|
|
|
+
|
|
|
|
+ // Copy overlapping match
|
|
|
|
+copy_overlapping_match:
|
|
|
|
+ ADDQ R13, R12
|
|
|
|
+
|
|
|
|
+copy_slow_3:
|
|
|
|
+ MOVB (AX), CL
|
|
|
|
+ MOVB CL, (R10)
|
|
|
|
+ INCQ AX
|
|
|
|
+ INCQ R10
|
|
|
|
+ DECQ R13
|
|
|
|
+ JNZ copy_slow_3
|
|
|
|
+
|
|
|
|
+handle_loop:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ DECQ 96(AX)
|
|
|
|
+ JNS sequenceDecs_decodeSync_safe_amd64_main_loop
|
|
|
|
+
|
|
|
|
+loop_finished:
|
|
|
|
+ MOVQ br+8(FP), AX
|
|
|
|
+ MOVQ DX, 32(AX)
|
|
|
|
+ MOVB BL, 40(AX)
|
|
|
|
+ MOVQ SI, 24(AX)
|
|
|
|
+
|
|
|
|
+ // Update the context
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ R12, 136(AX)
|
|
|
|
+ MOVQ 144(AX), CX
|
|
|
|
+ SUBQ CX, R11
|
|
|
|
+ MOVQ R11, 168(AX)
|
|
|
|
+
|
|
|
|
+ // Return success
|
|
|
|
+ MOVQ $0x00000000, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match length error
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
|
|
|
|
+ MOVQ 16(SP), AX
|
|
|
|
+ MOVQ ctx+16(FP), CX
|
|
|
|
+ MOVQ AX, 216(CX)
|
|
|
|
+ MOVQ $0x00000001, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match too long error
|
|
|
|
+sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 16(SP), CX
|
|
|
|
+ MOVQ CX, 216(AX)
|
|
|
|
+ MOVQ $0x00000002, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with match offset too long error
|
|
|
|
+error_match_off_too_big:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 8(SP), CX
|
|
|
|
+ MOVQ CX, 224(AX)
|
|
|
|
+ MOVQ R12, 136(AX)
|
|
|
|
+ MOVQ $0x00000003, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough literals error
|
|
|
|
+error_not_enough_literals:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 24(SP), CX
|
|
|
|
+ MOVQ CX, 208(AX)
|
|
|
|
+ MOVQ $0x00000004, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
+ // Return with not enough output space error
|
|
|
|
+error_not_enough_space:
|
|
|
|
+ MOVQ ctx+16(FP), AX
|
|
|
|
+ MOVQ 24(SP), CX
|
|
|
|
+ MOVQ CX, 208(AX)
|
|
|
|
+ MOVQ 16(SP), CX
|
|
|
|
+ MOVQ CX, 216(AX)
|
|
|
|
+ MOVQ R12, 136(AX)
|
|
|
|
+ MOVQ $0x00000005, ret+24(FP)
|
|
|
|
+ RET
|
|
|
|
+
|
|
|
|
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// Decodes zstd sequences and executes them (literal copy + match copy) in a
// single loop, using BMI2 bit-manipulation instructions (BEXTRQ/BZHIQ/SHRXQ)
// for the FSE bit extraction.  The "safe" variant performs exact-length
// copies (no 16-byte over-copy), so it is usable near buffer ends.
//
// Register / stack-slot usage, as established by the loads below
// (generated code; field offsets are into the Go structs):
//   AX  - bit buffer (br.value)        DX - bits already consumed (br.bitsRead)
//   BX  - br.off (input bytes left)    SI - literal-length FSE state
//   DI  - match-length FSE state       R8 - offset FSE state
//   R9  - output write pointer         R10 - literals read pointer
//   R11 - output position              (SP) - current input read pointer
//   8(SP)  - decoded offset            16(SP) - decoded match length
//   24(SP) - decoded literal length    32(SP) - past-end pointer of s.out
//   40(SP) - len(hist)                 48(SP) - past-end pointer of history
//   56(SP) - s.windowSize
//
// Returns 0 on success, or error codes 1..5 (see the labelled exits at the
// bottom), storing diagnostic values into ctx before returning.
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
	// Load bitReader fields and compute the current read pointer
	// (data base + off) into (SP).
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX
	ADDQ BX, CX
	MOVQ CX, (SP)

	// Load decoding state from ctx; zero the three per-sequence scratch
	// slots (offset / match length / literal length).
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	XORQ R9, R9
	MOVQ R9, 8(SP)
	MOVQ R9, 16(SP)
	MOVQ R9, 24(SP)
	MOVQ 112(CX), R9
	MOVQ 128(CX), R10
	MOVQ R10, 32(SP)
	MOVQ 144(CX), R10
	MOVQ 136(CX), R11
	MOVQ 200(CX), R12
	MOVQ R12, 56(SP)
	MOVQ 176(CX), R12
	MOVQ R12, 48(SP)
	MOVQ 184(CX), CX
	MOVQ CX, 40(SP)
	MOVQ 40(SP), CX
	ADDQ CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)

	// outBase += outPosition
	ADDQ R11, R9

	// Main loop: each iteration decodes one sequence and executes it.
sequenceDecs_decodeSync_safe_bmi2_main_loop:
	MOVQ (SP), R12

	// Fill bitreader to have enough for the offset and match length.
	// Fast path: >= 8 bytes of input left, refill with one 8-byte load.
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_end

	// Slow path: refill one byte at a time while input remains and at
	// least one whole byte (8 bits) of the buffer has been consumed.
sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_end:
	// Update offset: BEXTRQ with control 0x0808 extracts the state's
	// 8-bit "additional bits" count (bits 8..15 of R8); read that many
	// bits from the bit buffer and add them to the value kept in the
	// high 32 bits of the state word.  Result -> 8(SP).
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 8(SP)

	// Update match length (same extraction, from DI).  Result -> 16(SP).
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end

sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
	// Update literal length (same extraction, from SI).  Result -> 24(SP).
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 24(SP)

	// Fill bitreader for state updates
	MOVQ R12, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R12
	MOVQ ctx+16(FP), CX
	// Skip the FSE state transitions on the final sequence
	// (ctx.iteration == 0 at 96(CX)).
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
	// Read the combined ll+ml+of state bits in one rotate, then peel
	// them off per state below.
	LEAQ (SI)(DI*1), R13
	ADDQ R8, R13
	MOVBQZX R13, R13
	LEAQ (DX)(R13*1), CX
	MOVQ AX, R14
	MOVQ CX, DX
	ROLQ CL, R14
	BZHIQ R13, R14, R14

	// Update Offset State
	BZHIQ R8, R14, CX
	SHRXQ R8, R14, R14
	MOVQ $0x00001010, R13
	BEXTRQ R13, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R14, CX
	SHRXQ DI, R14, R14
	MOVQ $0x00001010, R13
	BEXTRQ R13, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R14, CX
	MOVQ $0x00001010, R13
	BEXTRQ R13, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_safe_bmi2_skip_update:
	// Adjust offset: resolve repeat-offset codes against the three
	// previous-offset slots at 144/152/160(s).  R12 still holds the
	// offset state's addBits count; >1 extra bit means a real offset.
	MOVQ s+0(FP), CX
	MOVQ 8(SP), R13
	CMPQ R12, $0x01
	JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
	// Real offset: shift the previous-offset history down and store
	// the new offset in slot 0 (one 16-byte move does slots 1 and 2).
	MOVUPS 144(CX), X0
	MOVQ R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust

sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
	// Repeat offset; a literal length of zero shifts the repeat index
	// by one (zstd spec rule).
	CMPQ 24(SP), $0x00000000
	JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
	INCQ R13
	JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
	// Offset code 0 with non-zero literal length: reuse prevOffset[0].
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
	MOVQ 144(CX), R13
	JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust

sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
	// Branchless selection: index 3 means prevOffset[0]-1 (CMOVs pick
	// slot 0 and a -1 adjustment).  The JNZ consumes the flags from
	// ADDQ: a zero result falls through to the temp=1 fix-up.
	MOVQ R13, R12
	XORQ R14, R14
	MOVQ $-1, R15
	CMPQ R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	ADDQ 144(CX)(R12*8), R14
	JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R14

sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
	// Rotate the previous-offset history; slot 2 is only touched when
	// the repeat index was not 1.
	CMPQ R13, $0x01
	JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
	MOVQ 152(CX), R12
	MOVQ R12, 160(CX)

sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
	MOVQ 144(CX), R12
	MOVQ R12, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_bmi2_after_adjust:
	MOVQ R13, 8(SP)

	// Check values: accumulate ll+ml into 256(s), deduct the literal
	// length from the remaining-literals counter at 104(ctx), and
	// validate match length / offset combinations.
	MOVQ 16(SP), CX
	MOVQ 24(SP), R12
	LEAQ (CX)(R12*1), R14
	MOVQ s+0(FP), R15
	ADDQ R14, 256(R15)
	MOVQ ctx+16(FP), R14
	SUBQ R12, 104(R14)
	JS error_not_enough_literals
	// maxMatchLen check (131074 = 0x20002).
	CMPQ CX, $0x00020002
	JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
	// Zero offset is only legal with a zero match length.
	TESTQ CX, CX
	JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
	// CX = literal length, R12 = offset, R13 = match length.
	MOVQ 24(SP), CX
	MOVQ 8(SP), R12
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (CX)(R13*1), R14
	ADDQ R9, R14
	CMPQ R14, 32(SP)
	JA error_not_enough_space

	// Copy literals (R10 -> R9), exact length CX.
	TESTQ CX, CX
	JZ check_offset
	MOVQ CX, R14
	SUBQ $0x10, R14
	JB copy_1_small

	// 16 bytes at a time, then one final (possibly overlapping with the
	// previous store) 16-byte chunk aligned to the end.
copy_1_loop:
	MOVUPS (R10), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R10
	ADDQ $0x10, R9
	SUBQ $0x10, R14
	JAE copy_1_loop
	LEAQ 16(R10)(R14*1), R10
	LEAQ 16(R9)(R14*1), R9
	MOVUPS -16(R10), X0
	MOVUPS X0, -16(R9)
	JMP copy_1_end

	// < 16 bytes: dispatch on size to an exact-width copy (no over-copy).
copy_1_small:
	CMPQ CX, $0x03
	JE copy_1_move_3
	JB copy_1_move_1or2
	CMPQ CX, $0x08
	JB copy_1_move_4through7
	JMP copy_1_move_8through16

copy_1_move_1or2:
	// First and last byte; for length 1 they are the same byte.
	MOVB (R10), R14
	MOVB -1(R10)(CX*1), R15
	MOVB R14, (R9)
	MOVB R15, -1(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_3:
	MOVW (R10), R14
	MOVB 2(R10), R15
	MOVW R14, (R9)
	MOVB R15, 2(R9)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_4through7:
	// Two possibly-overlapping 4-byte moves cover 4..7 bytes.
	MOVL (R10), R14
	MOVL -4(R10)(CX*1), R15
	MOVL R14, (R9)
	MOVL R15, -4(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_8through16:
	// Two possibly-overlapping 8-byte moves cover 8..16 bytes.
	MOVQ (R10), R14
	MOVQ -8(R10)(CX*1), R15
	MOVQ R14, (R9)
	MOVQ R15, -8(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9

copy_1_end:
	ADDQ CX, R11

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R11, CX
	ADDQ 40(SP), CX
	CMPQ R12, CX
	JG error_match_off_too_big
	CMPQ R12, 56(SP)
	JG error_match_off_too_big

	// Copy match from history: CX = mo - t; <= 0 means the whole match
	// is inside the current output buffer.
	MOVQ R12, CX
	SUBQ R11, CX
	JLS copy_match
	// R14 = history past-end - CX; if ml > CX the match straddles the
	// history/output boundary.
	MOVQ 48(SP), R14
	SUBQ CX, R14
	CMPQ R13, CX
	JG copy_all_from_history
	// Match lies entirely in history: copy R13 bytes from R14 to R9.
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, CX
	JAE copy_4_loop
	LEAQ 16(R14)(CX*1), R14
	LEAQ 16(R9)(CX*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), CX
	MOVB 2(R14), R12
	MOVW CX, (R9)
	MOVB R12, 2(R9)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), CX
	MOVL -4(R14)(R13*1), R12
	MOVL CX, (R9)
	MOVL R12, -4(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), CX
	MOVQ -8(R14)(R13*1), R12
	MOVQ CX, (R9)
	MOVQ R12, -8(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9

copy_4_end:
	ADDQ R13, R11
	JMP handle_loop
	JMP loop_finished // unreachable; emitted by the generator

	// Match starts in history and continues into the output buffer:
	// copy the CX history bytes here, then fall through to copy_match
	// for the remainder.
copy_all_from_history:
	MOVQ CX, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(R9)(R15*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_5_end

copy_5_small:
	CMPQ CX, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ CX, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(CX*1), BP
	MOVB R15, (R9)
	MOVB BP, -1(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R9)
	MOVB BP, 2(R9)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(CX*1), BP
	MOVL R15, (R9)
	MOVL BP, -4(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(CX*1), BP
	MOVQ R15, (R9)
	MOVQ BP, -8(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9

copy_5_end:
	// Account for the bytes already copied from history.
	ADDQ CX, R11
	SUBQ CX, R13

	// Copy match from the current buffer
copy_match:
	// CX = source = out pointer - offset.
	MOVQ R9, CX
	SUBQ R12, CX

	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R11
	MOVQ R13, R12
	SUBQ $0x10, R12
	JB copy_2_small

copy_2_loop:
	MOVUPS (CX), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, CX
	ADDQ $0x10, R9
	SUBQ $0x10, R12
	JAE copy_2_loop
	LEAQ 16(CX)(R12*1), CX
	LEAQ 16(R9)(R12*1), R9
	MOVUPS -16(CX), X0
	MOVUPS X0, -16(R9)
	JMP copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE copy_2_move_3
	JB copy_2_move_1or2
	CMPQ R13, $0x08
	JB copy_2_move_4through7
	JMP copy_2_move_8through16

copy_2_move_1or2:
	MOVB (CX), R12
	MOVB -1(CX)(R13*1), R14
	MOVB R12, (R9)
	MOVB R14, -1(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_3:
	MOVW (CX), R12
	MOVB 2(CX), R14
	MOVW R12, (R9)
	MOVB R14, 2(R9)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_4through7:
	MOVL (CX), R12
	MOVL -4(CX)(R13*1), R14
	MOVL R12, (R9)
	MOVL R14, -4(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_8through16:
	MOVQ (CX), R12
	MOVQ -8(CX)(R13*1), R14
	MOVQ R12, (R9)
	MOVQ R14, -8(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// Overlap (ml > mo): must copy byte by byte so earlier output
	// bytes are observed by later reads.
	ADDQ R13, R11

copy_slow_3:
	MOVB (CX), R12
	MOVB R12, (R9)
	INCQ CX
	INCQ R9
	DECQ R13
	JNZ copy_slow_3

	// Decrement the sequence counter at 96(ctx); loop while >= 0.
handle_loop:
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decodeSync_safe_bmi2_main_loop

	// Write the bitReader state back (value, bitsRead, off).
loop_finished:
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R11, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R10
	MOVQ R10, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
	// Store the offending match length into 216(ctx).
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	// Store the offending offset (224(ctx)) and the current output
	// position (136(ctx)).
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET
|