From 60ac254df62408bf1831d28c9a712ee3d27a1366 Mon Sep 17 00:00:00 2001 From: Tim Schumacher Date: Fri, 2 Jun 2023 13:37:47 +0200 Subject: [PATCH] AK: Use hashing to accelerate searching a `CircularBuffer` --- AK/CircularBuffer.cpp | 255 +++++++++++++++++++----- AK/CircularBuffer.h | 19 +- Tests/AK/TestCircularBuffer.cpp | 88 ++++---- Userland/Libraries/LibCompress/Lzma.cpp | 8 +- 4 files changed, 269 insertions(+), 101 deletions(-) diff --git a/AK/CircularBuffer.cpp b/AK/CircularBuffer.cpp index 362ba0310bd..43b08412685 100644 --- a/AK/CircularBuffer.cpp +++ b/AK/CircularBuffer.cpp @@ -296,10 +296,34 @@ ErrorOr SearchableCircularBuffer::create_initialized(B circular_buffer.m_used_space = circular_buffer.m_buffer.size(); + for (size_t i = 0; i + HASH_CHUNK_SIZE <= circular_buffer.m_buffer.size(); i++) + TRY(circular_buffer.insert_location_hash(circular_buffer.m_buffer.span().slice(i, HASH_CHUNK_SIZE), i)); + return circular_buffer; } -ErrorOr> SearchableCircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length) const +ErrorOr SearchableCircularBuffer::read(Bytes bytes) +{ + auto read_bytes_span = CircularBuffer::read(bytes); + TRY(hash_last_bytes(read_bytes_span.size())); + return read_bytes_span; +} + +ErrorOr SearchableCircularBuffer::discard(size_t discarded_bytes) +{ + TRY(CircularBuffer::discard(discarded_bytes)); + TRY(hash_last_bytes(discarded_bytes)); + return {}; +} + +ErrorOr SearchableCircularBuffer::flush_to_stream(Stream& stream) +{ + auto flushed_byte_count = TRY(CircularBuffer::flush_to_stream(stream)); + TRY(hash_last_bytes(flushed_byte_count)); + return flushed_byte_count; +} + +Optional SearchableCircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length) { VERIFY(minimum_length > 0); @@ -308,71 +332,137 @@ ErrorOr> SearchableCircularBuffer::find_ maximum_length = m_used_space; if (maximum_length < minimum_length) - return Vector {}; + return {}; - Vector matches; + Optional best_match; - // Use memmem to find the initial matches. - size_t haystack_offset_from_start = 0; - Vector haystack; - haystack.append(next_search_span(search_limit())); - if (haystack[0].size() < search_limit()) - haystack.append(next_search_span(search_limit() - haystack[0].size())); + Array needle_storage; + auto needle = needle_storage.span().trim(min(HASH_CHUNK_SIZE, maximum_length)); - auto needle = next_read_span().trim(minimum_length); + { + auto needle_read_bytes = MUST(read_with_seekback(needle, used_space())); + VERIFY(needle_read_bytes.size() == needle.size()); + } - auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle); - while (memmem_match.has_value()) { - auto match_offset = memmem_match.release_value(); + // Try an efficient hash-based search first. + if (needle.size() >= HASH_CHUNK_SIZE) { + auto needle_hash = StringView { needle }.hash(); - // Add the match to the list of matches to work with. - TRY(matches.try_empend(m_seekback_limit - used_space() - haystack_offset_from_start - match_offset, minimum_length)); + auto maybe_starting_offset = m_hash_location_map.get(needle_hash); - auto size_to_discard = match_offset + 1; + if (maybe_starting_offset.has_value()) { + Optional previous_buffer_offset; + auto current_buffer_offset = maybe_starting_offset.value(); - // Trim away the already processed bytes from the haystack. - haystack_offset_from_start += size_to_discard; - while (size_to_discard > 0) { - if (haystack[0].size() < size_to_discard) { - size_to_discard -= haystack[0].size(); - haystack.remove(0); - } else { - haystack[0] = haystack[0].slice(size_to_discard); - break; + while (true) { + auto current_search_offset = (capacity() + m_reading_head - current_buffer_offset) % capacity(); + + // Validate the hash. In case it is invalid, we can discard the rest of the chain, as the data (and everything older) got updated. + Array hash_chunk_at_offset; + auto hash_chunk_at_offset_span = MUST(read_with_seekback(hash_chunk_at_offset, current_search_offset + used_space())); + VERIFY(hash_chunk_at_offset_span.size() == HASH_CHUNK_SIZE); + auto found_chunk_hash = StringView { hash_chunk_at_offset }.hash(); + if (needle_hash != found_chunk_hash) { + if (!previous_buffer_offset.has_value()) + m_hash_location_map.remove(needle_hash); + else + m_location_chain_map.remove(*previous_buffer_offset); + break; + } + + // Validate the match through the set-distance-based implementation. + auto maybe_new_match = find_copy_in_seekback(Array { current_search_offset }, maximum_length, HASH_CHUNK_SIZE); + + // If we found a match, record it. + // If we haven't found a match, we simply got a hash collision, so skip. + if (maybe_new_match.has_value()) { + auto new_match = maybe_new_match.release_value(); + + if (!best_match.has_value() || best_match->length < new_match.length) { + best_match = new_match; + + // If we already found a result with the best possible length, then stop searching. + if (best_match->length >= maximum_length) + break; + } + } + + // Get the next location with the same hash from the location chain. + auto maybe_next_buffer_offset = m_location_chain_map.get(current_buffer_offset); + + // End of the chain, nothing more to check. + if (!maybe_next_buffer_offset.has_value()) + break; + + previous_buffer_offset = current_buffer_offset; + current_buffer_offset = maybe_next_buffer_offset.release_value(); } + + // If we found a match, return it now. + if (best_match.has_value()) + return best_match; } - - if (haystack.size() == 0) - break; - - // Try and find the next match. - memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle); } - // From now on, all matches that we have stored have at least a length of `minimum_length` and they all refer to the same value. - // For the remaining part, we will keep checking the next byte incrementally and keep eliminating matches until we eliminated all of them. - Vector next_matches; + // Try a plain memory search for smaller values. + // Note: This overlaps with the hash search for chunks of size HASH_CHUNK_SIZE for the purpose of validation. + if (minimum_length <= HASH_CHUNK_SIZE) { + size_t haystack_offset_from_start = 0; + Vector haystack; + haystack.append(next_search_span(search_limit())); + if (haystack[0].size() < search_limit()) + haystack.append(next_search_span(search_limit() - haystack[0].size())); - for (size_t offset = minimum_length; offset < maximum_length; offset++) { - auto needle_data = m_buffer[(capacity() + m_reading_head + offset) % capacity()]; + // TODO: `memmem` searches the memory in "natural" order, which means that it finds matches with a greater distance first. + // Hash-based searching finds the shortest distances first, which is most likely better for encoding and memory efficiency. + // Look into creating a `memmem_reverse`, which starts searching from the end. + auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle); + while (memmem_match.has_value()) { + auto match_offset = memmem_match.release_value(); + auto corrected_match_distance = search_limit() - haystack_offset_from_start - match_offset; - for (auto const& match : matches) { - auto haystack_data = m_buffer[(capacity() + m_reading_head - match.distance + offset) % capacity()]; + // Validate the match through the set-distance-based implementation and extend it to the largest size possible. + auto maybe_new_match = find_copy_in_seekback(Array { corrected_match_distance }, min(maximum_length, HASH_CHUNK_SIZE), minimum_length); - if (haystack_data != needle_data) - continue; + // If we weren't able to validate the match at all, either our memmem search returned garbage or our validation function is incorrect. Investigate. + VERIFY(maybe_new_match.has_value()); - TRY(next_matches.try_empend(match.distance, match.length + 1)); + auto new_match = maybe_new_match.release_value(); + + if (!best_match.has_value() || best_match->length < new_match.length) { + best_match = new_match; + + // If we already found a result with the best possible length, then stop searching. + if (best_match->length >= maximum_length) + break; + } + + auto size_to_discard = match_offset + 1; + + // Trim away the already processed bytes from the haystack. + haystack_offset_from_start += size_to_discard; + while (size_to_discard > 0) { + if (haystack[0].size() < size_to_discard) { + size_to_discard -= haystack[0].size(); + haystack.remove(0); + } else { + haystack[0] = haystack[0].slice(size_to_discard); + break; + } + } + + if (haystack.size() == 0) + break; + + // Try and find the next match. + memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle); } - if (next_matches.size() == 0) - return matches; - - swap(matches, next_matches); - next_matches.clear_with_capacity(); + // If we found a match of size HASH_CHUNK_SIZE, we should have already found that using the hash search. Investigate. + VERIFY(!best_match.has_value() || best_match->length < HASH_CHUNK_SIZE); } - return matches; + return best_match; } Optional SearchableCircularBuffer::find_copy_in_seekback(ReadonlySpan distances, size_t maximum_length, size_t minimum_length) const @@ -422,4 +512,75 @@ Optional SearchableCircularBuffer::find_copy_in return best_match; } +ErrorOr SearchableCircularBuffer::insert_location_hash(ReadonlyBytes value, size_t raw_offset) +{ + VERIFY(value.size() == HASH_CHUNK_SIZE); + + auto value_hash = StringView { value }.hash(); + + // Discard any old entries for this offset first. This should eliminate accidental loops by breaking the chain. + // The actual cleanup is done on access, since we can only remove invalid references when actually walking the chain. + m_location_chain_map.remove(raw_offset); + + // Check if we have any existing entries for this hash. + // If so, we need to add it to the location chain map instead, as we will soon replace the entry in the hash location map. + auto existing_entry = m_hash_location_map.get(value_hash); + + if (existing_entry.has_value()) + TRY(m_location_chain_map.try_set(raw_offset, existing_entry.value())); + + TRY(m_hash_location_map.try_set(value_hash, raw_offset)); + + return {}; +} + +ErrorOr SearchableCircularBuffer::hash_last_bytes(size_t count) +{ + // Stop early if we don't have enough data overall to hash a full chunk. + if (search_limit() < HASH_CHUNK_SIZE) + return {}; + + auto remaining_recalculations = count; + while (remaining_recalculations > 0) { + // Note: We offset everything by HASH_CHUNK_SIZE because we have up to HASH_CHUNK_SIZE - 1 bytes that we couldn't hash before (as we had missing data). + // The number of recalculations stays the same, since we now have up to HASH_CHUNK_SIZE - 1 bytes that we can't hash now. + auto recalculation_span = next_search_span(min(remaining_recalculations + HASH_CHUNK_SIZE - 1, search_limit())); + + // If the span is smaller than a hash chunk, we need to manually craft some consecutive data to do the hashing. + if (recalculation_span.size() < HASH_CHUNK_SIZE) { + auto auxiliary_span = next_seekback_span(remaining_recalculations); + + // Ensure that our math is correct and that both spans are "adjacent". + VERIFY(recalculation_span.data() + recalculation_span.size() == m_buffer.data() + m_buffer.size()); + VERIFY(auxiliary_span.data() == m_buffer.data()); + + while (recalculation_span.size() > 0 && recalculation_span.size() + auxiliary_span.size() >= HASH_CHUNK_SIZE) { + Array temporary_hash_chunk; + + auto copied_from_recalculation_span = recalculation_span.copy_to(temporary_hash_chunk); + VERIFY(copied_from_recalculation_span == recalculation_span.size()); + + auto copied_from_auxiliary_span = auxiliary_span.copy_to(temporary_hash_chunk.span().slice(copied_from_recalculation_span)); + VERIFY(copied_from_recalculation_span + copied_from_auxiliary_span == HASH_CHUNK_SIZE); + + TRY(insert_location_hash(temporary_hash_chunk, recalculation_span.data() - m_buffer.data())); + + recalculation_span = recalculation_span.slice(1); + remaining_recalculations--; + } + + continue; + } + + for (size_t i = 0; i + HASH_CHUNK_SIZE <= recalculation_span.size(); i++) { + auto value = recalculation_span.slice(i, HASH_CHUNK_SIZE); + auto raw_offset = value.data() - m_buffer.data(); + TRY(insert_location_hash(value, raw_offset)); + remaining_recalculations--; + } + } + + return {}; +} + } diff --git a/AK/CircularBuffer.h b/AK/CircularBuffer.h index 52a14b73906..0323b1afcc6 100644 --- a/AK/CircularBuffer.h +++ b/AK/CircularBuffer.h @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -67,6 +68,11 @@ public: [[nodiscard]] size_t search_limit() const; + // These functions update the read pointer, so we need to hash any data that we have processed. + ErrorOr read(Bytes bytes); + ErrorOr discard(size_t discarded_bytes); + ErrorOr flush_to_stream(Stream& stream); + struct Match { size_t distance; size_t length; @@ -74,15 +80,26 @@ public: /// This searches the seekback buffer (between read head and limit) for occurrences where it matches the next `length` bytes from the read buffer. /// Supplying any hints will only consider those distances, in case existing offsets need to be validated. /// Note that, since we only start searching at the read head, the length between read head and write head is excluded from the distance. - ErrorOr> find_copy_in_seekback(size_t maximum_length, size_t minimum_length = 2) const; + Optional find_copy_in_seekback(size_t maximum_length, size_t minimum_length = 2); Optional find_copy_in_seekback(ReadonlySpan distances, size_t maximum_length, size_t minimum_length = 2) const; + // The chunk size for which the hash table holds hashes. + // This is nice for users to know, as picking a minimum match length that is + // equal or greater than this allows us to completely skip a slow memory search. + static constexpr size_t HASH_CHUNK_SIZE = 3; + private: // Note: This function has a similar purpose as next_seekback_span, but they differ in their reference point. // Seekback operations start counting their distance at the write head, while search operations start counting their distance at the read head. [[nodiscard]] ReadonlyBytes next_search_span(size_t distance) const; SearchableCircularBuffer(ByteBuffer); + + HashMap m_hash_location_map; + HashMap m_location_chain_map; + + ErrorOr insert_location_hash(ReadonlyBytes value, size_t raw_offset); + ErrorOr hash_last_bytes(size_t count); }; } diff --git a/Tests/AK/TestCircularBuffer.cpp b/Tests/AK/TestCircularBuffer.cpp index 43e26bf3b95..3ee57e0f026 100644 --- a/Tests/AK/TestCircularBuffer.cpp +++ b/Tests/AK/TestCircularBuffer.cpp @@ -342,72 +342,62 @@ TEST_CASE(find_copy_in_seekback) auto written_needle_bytes = buffer.write(needle); VERIFY(written_needle_bytes == needle.size()); + // Note: As of now, the preference during a tie is determined by which algorithm found the match. + // Hash-based matching finds the shortest distance first, while memmem finds the greatest distance first. + // A matching TODO can be found in CircularBuffer.cpp. + { - // Find the largest matches with a length between 1 and 1 (all "A"). - auto matches = MUST(buffer.find_copy_in_seekback(1, 1)); - EXPECT_EQ(matches.size(), 4ul); - EXPECT_EQ(matches[0].distance, 11ul); - EXPECT_EQ(matches[0].length, 1ul); - EXPECT_EQ(matches[1].distance, 9ul); - EXPECT_EQ(matches[1].length, 1ul); - EXPECT_EQ(matches[2].distance, 6ul); - EXPECT_EQ(matches[2].length, 1ul); - EXPECT_EQ(matches[3].distance, 2ul); - EXPECT_EQ(matches[3].length, 1ul); + // Find the largest match with a length between 1 and 1 (all "A"). + auto match = buffer.find_copy_in_seekback(1, 1); + EXPECT(match.has_value()); + EXPECT_EQ(match.value().distance, 11ul); + EXPECT_EQ(match.value().length, 1ul); } { - // Find the largest matches with a length between 1 and 2 (all "AB", everything smaller gets eliminated). - auto matches = MUST(buffer.find_copy_in_seekback(2, 1)); - EXPECT_EQ(matches.size(), 4ul); - EXPECT_EQ(matches[0].distance, 11ul); - EXPECT_EQ(matches[0].length, 2ul); - EXPECT_EQ(matches[1].distance, 9ul); - EXPECT_EQ(matches[1].length, 2ul); - EXPECT_EQ(matches[2].distance, 6ul); - EXPECT_EQ(matches[2].length, 2ul); - EXPECT_EQ(matches[3].distance, 2ul); - EXPECT_EQ(matches[3].length, 2ul); + // Find the largest match with a length between 1 and 2 (all "AB", everything smaller gets eliminated). + auto match = buffer.find_copy_in_seekback(2, 1); + EXPECT(match.has_value()); + EXPECT_EQ(match.value().distance, 11ul); + EXPECT_EQ(match.value().length, 2ul); } { - // Find the largest matches with a length between 1 and 3 (all "ABC", everything smaller gets eliminated). - auto matches = MUST(buffer.find_copy_in_seekback(3, 1)); - EXPECT_EQ(matches.size(), 2ul); - EXPECT_EQ(matches[0].distance, 9ul); - EXPECT_EQ(matches[0].length, 3ul); - EXPECT_EQ(matches[1].distance, 6ul); - EXPECT_EQ(matches[1].length, 3ul); + // Find the largest match with a length between 1 and 3 (all "ABC", everything smaller gets eliminated). + auto match = buffer.find_copy_in_seekback(3, 1); + EXPECT(match.has_value()); + EXPECT_EQ(match.value().distance, 6ul); + EXPECT_EQ(match.value().length, 3ul); } { - // Find the largest matches with a length between 1 and 4 (all "ABCD", everything smaller gets eliminated). - auto matches = MUST(buffer.find_copy_in_seekback(4, 1)); - EXPECT_EQ(matches.size(), 1ul); - EXPECT_EQ(matches[0].distance, 6ul); - EXPECT_EQ(matches[0].length, 4ul); + // Find the largest match with a length between 1 and 4 (all "ABCD", everything smaller gets eliminated). + auto match = buffer.find_copy_in_seekback(4, 1); + EXPECT(match.has_value()); + EXPECT_EQ(match.value().distance, 6ul); + EXPECT_EQ(match.value().length, 4ul); } { - // Find the largest matches with a length between 1 and 5 (all "ABCD", everything smaller gets eliminated, and nothing larger exists). - auto matches = MUST(buffer.find_copy_in_seekback(5, 1)); - EXPECT_EQ(matches.size(), 1ul); - EXPECT_EQ(matches[0].distance, 6ul); - EXPECT_EQ(matches[0].length, 4ul); + // Find the largest match with a length between 1 and 5 (all "ABCD", everything smaller gets eliminated, and nothing larger exists). + auto match = buffer.find_copy_in_seekback(5, 1); + EXPECT(match.has_value()); + EXPECT_EQ(match.value().distance, 6ul); + EXPECT_EQ(match.value().length, 4ul); } { - // Find the largest matches with a length between 4 and 5 (all "ABCD", everything smaller never gets found, nothing larger exists). - auto matches = MUST(buffer.find_copy_in_seekback(5, 4)); - EXPECT_EQ(matches.size(), 1ul); - EXPECT_EQ(matches[0].distance, 6ul); - EXPECT_EQ(matches[0].length, 4ul); + // Find the largest match with a length between 4 and 5 (all "ABCD", everything smaller never gets found, nothing larger exists). + auto match = buffer.find_copy_in_seekback(5, 4); + EXPECT(match.has_value()); + EXPECT_EQ(match.value().distance, 6ul); + EXPECT_EQ(match.value().length, 4ul); } { - // Find the largest matches with a length between 5 and 5 (nothing is found). - auto matches = MUST(buffer.find_copy_in_seekback(5, 5)); - EXPECT_EQ(matches.size(), 0ul); + // Find the largest match with a length between 5 and 5 (nothing is found). + auto match = buffer.find_copy_in_seekback(5, 5); + EXPECT(!match.has_value()); } { @@ -432,8 +422,8 @@ TEST_CASE(find_copy_in_seekback) { // Check that we don't find anything for a minimum length beyond the whole buffer size. - auto matches = MUST(buffer.find_copy_in_seekback(12, 13)); - EXPECT_EQ(matches.size(), 0ul); + auto match = buffer.find_copy_in_seekback(12, 13); + EXPECT(!match.has_value()); } } diff --git a/Userland/Libraries/LibCompress/Lzma.cpp b/Userland/Libraries/LibCompress/Lzma.cpp index 123354e2235..04a0b740129 100644 --- a/Userland/Libraries/LibCompress/Lzma.cpp +++ b/Userland/Libraries/LibCompress/Lzma.cpp @@ -1007,17 +1007,17 @@ ErrorOr LzmaCompressor::encode_once() } // If we weren't able to find any viable existing offsets, we now have to search the rest of the dictionary for possible new offsets. - auto new_distance_results = TRY(m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset)); + auto new_distance_result = m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset); - if (new_distance_results.size() > 0) { - auto selected_match = new_distance_results[0]; + if (new_distance_result.has_value()) { + auto selected_match = new_distance_result.release_value(); TRY(encode_new_match(selected_match.distance, selected_match.length)); return {}; } // If we weren't able to find any matches, we don't have any other choice than to encode the next byte as a literal. u8 next_byte { 0 }; - m_dictionary->read({ &next_byte, sizeof(next_byte) }); + TRY(m_dictionary->read({ &next_byte, sizeof(next_byte) })); TRY(encode_literal(next_byte)); return {}; }