AK: Use hashing to accelerate searching a CircularBuffer

Author: https://github.com/timschumi Commit: https://github.com/SerenityOS/serenity/commit/60ac254df6 Pull-request: https://github.com/SerenityOS/serenity/pull/19332 Reviewed-by: https://github.com/linusg ✅
2024-11-21 15:10:19 +00:00 · 2023-06-02 13:37:47 +02:00 · 2023-06-02 13:37:47 +02:00 · 60ac254df6 · 2024-07-18 05:01:22 +09:00
commit 60ac254df6
parent 4a10cf1506
4 changed files with 269 additions and 101 deletions
--- a/AK/CircularBuffer.cpp
+++ b/AK/CircularBuffer.cpp
@ -296,10 +296,34 @@ ErrorOr<SearchableCircularBuffer> SearchableCircularBuffer::create_initialized(B

    circular_buffer.m_used_space = circular_buffer.m_buffer.size();

+    for (size_t i = 0; i + HASH_CHUNK_SIZE <= circular_buffer.m_buffer.size(); i++)
+        TRY(circular_buffer.insert_location_hash(circular_buffer.m_buffer.span().slice(i, HASH_CHUNK_SIZE), i));
+
    return circular_buffer;
 }

-ErrorOr<Vector<SearchableCircularBuffer::Match>> SearchableCircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length) const
+ErrorOr<Bytes> SearchableCircularBuffer::read(Bytes bytes)
+{
+    auto read_bytes_span = CircularBuffer::read(bytes);
+    TRY(hash_last_bytes(read_bytes_span.size()));
+    return read_bytes_span;
+}
+
+ErrorOr<void> SearchableCircularBuffer::discard(size_t discarded_bytes)
+{
+    TRY(CircularBuffer::discard(discarded_bytes));
+    TRY(hash_last_bytes(discarded_bytes));
+    return {};
+}
+
+ErrorOr<size_t> SearchableCircularBuffer::flush_to_stream(Stream& stream)
+{
+    auto flushed_byte_count = TRY(CircularBuffer::flush_to_stream(stream));
+    TRY(hash_last_bytes(flushed_byte_count));
+    return flushed_byte_count;
+}
+
+Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length)
 {
    VERIFY(minimum_length > 0);

@ -308,71 +332,137 @@ ErrorOr<Vector<SearchableCircularBuffer::Match>> SearchableCircularBuffer::find_
        maximum_length = m_used_space;

    if (maximum_length < minimum_length)
-        return Vector<Match> {};
+        return {};

-    Vector<Match> matches;
+    Optional<Match> best_match;

-    // Use memmem to find the initial matches.
-    size_t haystack_offset_from_start = 0;
-    Vector<ReadonlyBytes, 2> haystack;
-    haystack.append(next_search_span(search_limit()));
-    if (haystack[0].size() < search_limit())
-        haystack.append(next_search_span(search_limit() - haystack[0].size()));
+    Array<u8, HASH_CHUNK_SIZE> needle_storage;
+    auto needle = needle_storage.span().trim(min(HASH_CHUNK_SIZE, maximum_length));

-    auto needle = next_read_span().trim(minimum_length);
+    {
+        auto needle_read_bytes = MUST(read_with_seekback(needle, used_space()));
+        VERIFY(needle_read_bytes.size() == needle.size());
+    }

-    auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
-    while (memmem_match.has_value()) {
-        auto match_offset = memmem_match.release_value();
+    // Try an efficient hash-based search first.
+    if (needle.size() >= HASH_CHUNK_SIZE) {
+        auto needle_hash = StringView { needle }.hash();

-        // Add the match to the list of matches to work with.
-        TRY(matches.try_empend(m_seekback_limit - used_space() - haystack_offset_from_start - match_offset, minimum_length));
+        auto maybe_starting_offset = m_hash_location_map.get(needle_hash);

-        auto size_to_discard = match_offset + 1;
+        if (maybe_starting_offset.has_value()) {
+            Optional<size_t> previous_buffer_offset;
+            auto current_buffer_offset = maybe_starting_offset.value();

-        // Trim away the already processed bytes from the haystack.
-        haystack_offset_from_start += size_to_discard;
-        while (size_to_discard > 0) {
-            if (haystack[0].size() < size_to_discard) {
-                size_to_discard -= haystack[0].size();
-                haystack.remove(0);
-            } else {
-                haystack[0] = haystack[0].slice(size_to_discard);
-                break;
+            while (true) {
+                auto current_search_offset = (capacity() + m_reading_head - current_buffer_offset) % capacity();
+
+                // Validate the hash. In case it is invalid, we can discard the rest of the chain, as the data (and everything older) got updated.
+                Array<u8, HASH_CHUNK_SIZE> hash_chunk_at_offset;
+                auto hash_chunk_at_offset_span = MUST(read_with_seekback(hash_chunk_at_offset, current_search_offset + used_space()));
+                VERIFY(hash_chunk_at_offset_span.size() == HASH_CHUNK_SIZE);
+                auto found_chunk_hash = StringView { hash_chunk_at_offset }.hash();
+                if (needle_hash != found_chunk_hash) {
+                    if (!previous_buffer_offset.has_value())
+                        m_hash_location_map.remove(needle_hash);
+                    else
+                        m_location_chain_map.remove(*previous_buffer_offset);
+                    break;
+                }
+
+                // Validate the match through the set-distance-based implementation.
+                auto maybe_new_match = find_copy_in_seekback(Array { current_search_offset }, maximum_length, HASH_CHUNK_SIZE);
+
+                // If we found a match, record it.
+                // If we haven't found a match, we simply got a hash collision, so skip.
+                if (maybe_new_match.has_value()) {
+                    auto new_match = maybe_new_match.release_value();
+
+                    if (!best_match.has_value() || best_match->length < new_match.length) {
+                        best_match = new_match;
+
+                        // If we already found a result with the best possible length, then stop searching.
+                        if (best_match->length >= maximum_length)
+                            break;
+                    }
+                }
+
+                // Get the next location with the same hash from the location chain.
+                auto maybe_next_buffer_offset = m_location_chain_map.get(current_buffer_offset);
+
+                // End of the chain, nothing more to check.
+                if (!maybe_next_buffer_offset.has_value())
+                    break;
+
+                previous_buffer_offset = current_buffer_offset;
+                current_buffer_offset = maybe_next_buffer_offset.release_value();
            }
+
+            // If we found a match, return it now.
+            if (best_match.has_value())
+                return best_match;
        }
-
-        if (haystack.size() == 0)
-            break;
-
-        // Try and find the next match.
-        memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
    }

-    // From now on, all matches that we have stored have at least a length of `minimum_length` and they all refer to the same value.
-    // For the remaining part, we will keep checking the next byte incrementally and keep eliminating matches until we eliminated all of them.
-    Vector<Match> next_matches;
+    // Try a plain memory search for smaller values.
+    // Note: This overlaps with the hash search for chunks of size HASH_CHUNK_SIZE for the purpose of validation.
+    if (minimum_length <= HASH_CHUNK_SIZE) {
+        size_t haystack_offset_from_start = 0;
+        Vector<ReadonlyBytes, 2> haystack;
+        haystack.append(next_search_span(search_limit()));
+        if (haystack[0].size() < search_limit())
+            haystack.append(next_search_span(search_limit() - haystack[0].size()));

-    for (size_t offset = minimum_length; offset < maximum_length; offset++) {
-        auto needle_data = m_buffer[(capacity() + m_reading_head + offset) % capacity()];
+        // TODO: `memmem` searches the memory in "natural" order, which means that it finds matches with a greater distance first.
+        //       Hash-based searching finds the shortest distances first, which is most likely better for encoding and memory efficiency.
+        //       Look into creating a `memmem_reverse`, which starts searching from the end.
+        auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
+        while (memmem_match.has_value()) {
+            auto match_offset = memmem_match.release_value();
+            auto corrected_match_distance = search_limit() - haystack_offset_from_start - match_offset;

-        for (auto const& match : matches) {
-            auto haystack_data = m_buffer[(capacity() + m_reading_head - match.distance + offset) % capacity()];
+            // Validate the match through the set-distance-based implementation and extend it to the largest size possible.
+            auto maybe_new_match = find_copy_in_seekback(Array { corrected_match_distance }, min(maximum_length, HASH_CHUNK_SIZE), minimum_length);

-            if (haystack_data != needle_data)
-                continue;
+            // If we weren't able to validate the match at all, either our memmem search returned garbage or our validation function is incorrect. Investigate.
+            VERIFY(maybe_new_match.has_value());

-            TRY(next_matches.try_empend(match.distance, match.length + 1));
+            auto new_match = maybe_new_match.release_value();
+
+            if (!best_match.has_value() || best_match->length < new_match.length) {
+                best_match = new_match;
+
+                // If we already found a result with the best possible length, then stop searching.
+                if (best_match->length >= maximum_length)
+                    break;
+            }
+
+            auto size_to_discard = match_offset + 1;
+
+            // Trim away the already processed bytes from the haystack.
+            haystack_offset_from_start += size_to_discard;
+            while (size_to_discard > 0) {
+                if (haystack[0].size() < size_to_discard) {
+                    size_to_discard -= haystack[0].size();
+                    haystack.remove(0);
+                } else {
+                    haystack[0] = haystack[0].slice(size_to_discard);
+                    break;
+                }
+            }
+
+            if (haystack.size() == 0)
+                break;
+
+            // Try and find the next match.
+            memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
        }

-        if (next_matches.size() == 0)
-            return matches;
-
-        swap(matches, next_matches);
-        next_matches.clear_with_capacity();
+        // If we found a match of size HASH_CHUNK_SIZE, we should have already found that using the hash search. Investigate.
+        VERIFY(!best_match.has_value() || best_match->length < HASH_CHUNK_SIZE);
    }

-    return matches;
+    return best_match;
 }

 Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in_seekback(ReadonlySpan<size_t> distances, size_t maximum_length, size_t minimum_length) const
@ -422,4 +512,75 @@ Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in
    return best_match;
 }

+ErrorOr<void> SearchableCircularBuffer::insert_location_hash(ReadonlyBytes value, size_t raw_offset)
+{
+    VERIFY(value.size() == HASH_CHUNK_SIZE);
+
+    auto value_hash = StringView { value }.hash();
+
+    // Discard any old entries for this offset first. This should eliminate accidental loops by breaking the chain.
+    // The actual cleanup is done on access, since we can only remove invalid references when actually walking the chain.
+    m_location_chain_map.remove(raw_offset);
+
+    // Check if we have any existing entries for this hash.
+    // If so, we need to add it to the location chain map instead, as we will soon replace the entry in the hash location map.
+    auto existing_entry = m_hash_location_map.get(value_hash);
+
+    if (existing_entry.has_value())
+        TRY(m_location_chain_map.try_set(raw_offset, existing_entry.value()));
+
+    TRY(m_hash_location_map.try_set(value_hash, raw_offset));
+
+    return {};
+}
+
+ErrorOr<void> SearchableCircularBuffer::hash_last_bytes(size_t count)
+{
+    // Stop early if we don't have enough data overall to hash a full chunk.
+    if (search_limit() < HASH_CHUNK_SIZE)
+        return {};
+
+    auto remaining_recalculations = count;
+    while (remaining_recalculations > 0) {
+        // Note: We offset everything by HASH_CHUNK_SIZE because we have up to HASH_CHUNK_SIZE - 1 bytes that we couldn't hash before (as we had missing data).
+        //       The number of recalculations stays the same, since we now have up to HASH_CHUNK_SIZE - 1 bytes that we can't hash now.
+        auto recalculation_span = next_search_span(min(remaining_recalculations + HASH_CHUNK_SIZE - 1, search_limit()));
+
+        // If the span is smaller than a hash chunk, we need to manually craft some consecutive data to do the hashing.
+        if (recalculation_span.size() < HASH_CHUNK_SIZE) {
+            auto auxiliary_span = next_seekback_span(remaining_recalculations);
+
+            // Ensure that our math is correct and that both spans are "adjacent".
+            VERIFY(recalculation_span.data() + recalculation_span.size() == m_buffer.data() + m_buffer.size());
+            VERIFY(auxiliary_span.data() == m_buffer.data());
+
+            while (recalculation_span.size() > 0 && recalculation_span.size() + auxiliary_span.size() >= HASH_CHUNK_SIZE) {
+                Array<u8, HASH_CHUNK_SIZE> temporary_hash_chunk;
+
+                auto copied_from_recalculation_span = recalculation_span.copy_to(temporary_hash_chunk);
+                VERIFY(copied_from_recalculation_span == recalculation_span.size());
+
+                auto copied_from_auxiliary_span = auxiliary_span.copy_to(temporary_hash_chunk.span().slice(copied_from_recalculation_span));
+                VERIFY(copied_from_recalculation_span + copied_from_auxiliary_span == HASH_CHUNK_SIZE);
+
+                TRY(insert_location_hash(temporary_hash_chunk, recalculation_span.data() - m_buffer.data()));
+
+                recalculation_span = recalculation_span.slice(1);
+                remaining_recalculations--;
+            }
+
+            continue;
+        }
+
+        for (size_t i = 0; i + HASH_CHUNK_SIZE <= recalculation_span.size(); i++) {
+            auto value = recalculation_span.slice(i, HASH_CHUNK_SIZE);
+            auto raw_offset = value.data() - m_buffer.data();
+            TRY(insert_location_hash(value, raw_offset));
+            remaining_recalculations--;
+        }
+    }
+
+    return {};
+}
+
 }
--- a/AK/CircularBuffer.h
+++ b/AK/CircularBuffer.h
@ -8,6 +8,7 @@

 #include <AK/ByteBuffer.h>
 #include <AK/Error.h>
+#include <AK/HashMap.h>
 #include <AK/Noncopyable.h>
 #include <AK/Vector.h>

@ -67,6 +68,11 @@ public:

    [[nodiscard]] size_t search_limit() const;

+    // These functions update the read pointer, so we need to hash any data that we have processed.
+    ErrorOr<Bytes> read(Bytes bytes);
+    ErrorOr<void> discard(size_t discarded_bytes);
+    ErrorOr<size_t> flush_to_stream(Stream& stream);
+
    struct Match {
        size_t distance;
        size_t length;
@ -74,15 +80,26 @@ public:
    /// This searches the seekback buffer (between read head and limit) for occurrences where it matches the next `length` bytes from the read buffer.
    /// Supplying any hints will only consider those distances, in case existing offsets need to be validated.
    /// Note that, since we only start searching at the read head, the length between read head and write head is excluded from the distance.
-    ErrorOr<Vector<Match>> find_copy_in_seekback(size_t maximum_length, size_t minimum_length = 2) const;
+    Optional<Match> find_copy_in_seekback(size_t maximum_length, size_t minimum_length = 2);
    Optional<Match> find_copy_in_seekback(ReadonlySpan<size_t> distances, size_t maximum_length, size_t minimum_length = 2) const;

+    // The chunk size for which the hash table holds hashes.
+    // This is nice for users to know, as picking a minimum match length that is
+    // equal or greater than this allows us to completely skip a slow memory search.
+    static constexpr size_t HASH_CHUNK_SIZE = 3;
+
 private:
    // Note: This function has a similar purpose as next_seekback_span, but they differ in their reference point.
    //       Seekback operations start counting their distance at the write head, while search operations start counting their distance at the read head.
    [[nodiscard]] ReadonlyBytes next_search_span(size_t distance) const;

    SearchableCircularBuffer(ByteBuffer);
+
+    HashMap<unsigned, size_t> m_hash_location_map;
+    HashMap<size_t, size_t> m_location_chain_map;
+
+    ErrorOr<void> insert_location_hash(ReadonlyBytes value, size_t raw_offset);
+    ErrorOr<void> hash_last_bytes(size_t count);
 };

 }
--- a/Tests/AK/TestCircularBuffer.cpp
+++ b/Tests/AK/TestCircularBuffer.cpp
@ -342,72 +342,62 @@ TEST_CASE(find_copy_in_seekback)
    auto written_needle_bytes = buffer.write(needle);
    VERIFY(written_needle_bytes == needle.size());

+    // Note: As of now, the preference during a tie is determined by which algorithm found the match.
+    //       Hash-based matching finds the shortest distance first, while memmem finds the greatest distance first.
+    //       A matching TODO can be found in CircularBuffer.cpp.
+
    {
-        // Find the largest matches with a length between 1 and 1 (all "A").
-        auto matches = MUST(buffer.find_copy_in_seekback(1, 1));
-        EXPECT_EQ(matches.size(), 4ul);
-        EXPECT_EQ(matches[0].distance, 11ul);
-        EXPECT_EQ(matches[0].length, 1ul);
-        EXPECT_EQ(matches[1].distance, 9ul);
-        EXPECT_EQ(matches[1].length, 1ul);
-        EXPECT_EQ(matches[2].distance, 6ul);
-        EXPECT_EQ(matches[2].length, 1ul);
-        EXPECT_EQ(matches[3].distance, 2ul);
-        EXPECT_EQ(matches[3].length, 1ul);
+        // Find the largest match with a length between 1 and 1 (all "A").
+        auto match = buffer.find_copy_in_seekback(1, 1);
+        EXPECT(match.has_value());
+        EXPECT_EQ(match.value().distance, 11ul);
+        EXPECT_EQ(match.value().length, 1ul);
    }

    {
-        // Find the largest matches with a length between 1 and 2 (all "AB", everything smaller gets eliminated).
-        auto matches = MUST(buffer.find_copy_in_seekback(2, 1));
-        EXPECT_EQ(matches.size(), 4ul);
-        EXPECT_EQ(matches[0].distance, 11ul);
-        EXPECT_EQ(matches[0].length, 2ul);
-        EXPECT_EQ(matches[1].distance, 9ul);
-        EXPECT_EQ(matches[1].length, 2ul);
-        EXPECT_EQ(matches[2].distance, 6ul);
-        EXPECT_EQ(matches[2].length, 2ul);
-        EXPECT_EQ(matches[3].distance, 2ul);
-        EXPECT_EQ(matches[3].length, 2ul);
+        // Find the largest match with a length between 1 and 2 (all "AB", everything smaller gets eliminated).
+        auto match = buffer.find_copy_in_seekback(2, 1);
+        EXPECT(match.has_value());
+        EXPECT_EQ(match.value().distance, 11ul);
+        EXPECT_EQ(match.value().length, 2ul);
    }

    {
-        // Find the largest matches with a length between 1 and 3 (all "ABC", everything smaller gets eliminated).
-        auto matches = MUST(buffer.find_copy_in_seekback(3, 1));
-        EXPECT_EQ(matches.size(), 2ul);
-        EXPECT_EQ(matches[0].distance, 9ul);
-        EXPECT_EQ(matches[0].length, 3ul);
-        EXPECT_EQ(matches[1].distance, 6ul);
-        EXPECT_EQ(matches[1].length, 3ul);
+        // Find the largest match with a length between 1 and 3 (all "ABC", everything smaller gets eliminated).
+        auto match = buffer.find_copy_in_seekback(3, 1);
+        EXPECT(match.has_value());
+        EXPECT_EQ(match.value().distance, 6ul);
+        EXPECT_EQ(match.value().length, 3ul);
    }

    {
-        // Find the largest matches with a length between 1 and 4 (all "ABCD", everything smaller gets eliminated).
-        auto matches = MUST(buffer.find_copy_in_seekback(4, 1));
-        EXPECT_EQ(matches.size(), 1ul);
-        EXPECT_EQ(matches[0].distance, 6ul);
-        EXPECT_EQ(matches[0].length, 4ul);
+        // Find the largest match with a length between 1 and 4 (all "ABCD", everything smaller gets eliminated).
+        auto match = buffer.find_copy_in_seekback(4, 1);
+        EXPECT(match.has_value());
+        EXPECT_EQ(match.value().distance, 6ul);
+        EXPECT_EQ(match.value().length, 4ul);
    }

    {
-        // Find the largest matches with a length between 1 and 5 (all "ABCD", everything smaller gets eliminated, and nothing larger exists).
-        auto matches = MUST(buffer.find_copy_in_seekback(5, 1));
-        EXPECT_EQ(matches.size(), 1ul);
-        EXPECT_EQ(matches[0].distance, 6ul);
-        EXPECT_EQ(matches[0].length, 4ul);
+        // Find the largest match with a length between 1 and 5 (all "ABCD", everything smaller gets eliminated, and nothing larger exists).
+        auto match = buffer.find_copy_in_seekback(5, 1);
+        EXPECT(match.has_value());
+        EXPECT_EQ(match.value().distance, 6ul);
+        EXPECT_EQ(match.value().length, 4ul);
    }

    {
-        // Find the largest matches with a length between 4 and 5 (all "ABCD", everything smaller never gets found, nothing larger exists).
-        auto matches = MUST(buffer.find_copy_in_seekback(5, 4));
-        EXPECT_EQ(matches.size(), 1ul);
-        EXPECT_EQ(matches[0].distance, 6ul);
-        EXPECT_EQ(matches[0].length, 4ul);
+        // Find the largest match with a length between 4 and 5 (all "ABCD", everything smaller never gets found, nothing larger exists).
+        auto match = buffer.find_copy_in_seekback(5, 4);
+        EXPECT(match.has_value());
+        EXPECT_EQ(match.value().distance, 6ul);
+        EXPECT_EQ(match.value().length, 4ul);
    }

    {
-        // Find the largest matches with a length between 5 and 5 (nothing is found).
-        auto matches = MUST(buffer.find_copy_in_seekback(5, 5));
-        EXPECT_EQ(matches.size(), 0ul);
+        // Find the largest match with a length between 5 and 5 (nothing is found).
+        auto match = buffer.find_copy_in_seekback(5, 5);
+        EXPECT(!match.has_value());
    }

    {
@ -432,8 +422,8 @@ TEST_CASE(find_copy_in_seekback)

    {
        // Check that we don't find anything for a minimum length beyond the whole buffer size.
-        auto matches = MUST(buffer.find_copy_in_seekback(12, 13));
-        EXPECT_EQ(matches.size(), 0ul);
+        auto match = buffer.find_copy_in_seekback(12, 13);
+        EXPECT(!match.has_value());
    }
 }

--- a/Userland/Libraries/LibCompress/Lzma.cpp
+++ b/Userland/Libraries/LibCompress/Lzma.cpp
@ -1007,17 +1007,17 @@ ErrorOr<void> LzmaCompressor::encode_once()
    }

    // If we weren't able to find any viable existing offsets, we now have to search the rest of the dictionary for possible new offsets.
-    auto new_distance_results = TRY(m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset));
+    auto new_distance_result = m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset);

-    if (new_distance_results.size() > 0) {
-        auto selected_match = new_distance_results[0];
+    if (new_distance_result.has_value()) {
+        auto selected_match = new_distance_result.release_value();
        TRY(encode_new_match(selected_match.distance, selected_match.length));
        return {};
    }

    // If we weren't able to find any matches, we don't have any other choice than to encode the next byte as a literal.
    u8 next_byte { 0 };
-    m_dictionary->read({ &next_byte, sizeof(next_byte) });
+    TRY(m_dictionary->read({ &next_byte, sizeof(next_byte) }));
    TRY(encode_literal(next_byte));
    return {};
 }