AK: Redesign HashTable to use closed hashing

Instead of each hash bucket being a SinglyLinkedList, switch to using
closed hashing (open addressing). Buckets are chained together via
double hashing (hashing the hash until we find an unused bucket.)

This greatly reduces malloc traffic, since each added element no longer
allocates a new linked list node.

Appears performance neutral on test-js. Can definitely be tuned and
could use proper management of load factor, etc.
This commit is contained in:
Andreas Kling 2020-10-15 23:34:07 +02:00
parent 4387590e65
commit 4e50079f36
Notes: sideshowbarker 2024-07-19 01:53:32 +09:00
2 changed files with 248 additions and 267 deletions

View file

@ -39,6 +39,16 @@ inline unsigned int_hash(u32 key)
return key;
}
inline unsigned double_hash(u32 key)
{
key = ~key + (key >> 23);
key ^= (key << 12);
key ^= (key >> 7);
key ^= (key << 2);
key ^= (key >> 20);
return key;
}
inline unsigned pair_int_hash(u32 key1, u32 key2)
{
return int_hash((int_hash(key1) * 209) ^ (int_hash(key2 * 413)));

View file

@ -26,11 +26,11 @@
#pragma once
#include <AK/Assertions.h>
#include <AK/SinglyLinkedList.h>
#include <AK/HashFunctions.h>
#include <AK/LogStream.h>
#include <AK/StdLibExtras.h>
#include <AK/TemporaryChange.h>
#include <AK/Traits.h>
#include <AK/Types.h>
#include <AK/kmalloc.h>
namespace AK {
@ -39,137 +39,102 @@ enum class HashSetResult {
ReplacedExistingEntry
};
template<typename T, typename>
class HashTable;
template<typename HashTableType, typename ElementType, typename BucketIteratorType>
template<typename HashTableType, typename T, typename BucketType>
class HashTableIterator {
public:
bool operator!=(const HashTableIterator& other) const
{
if (m_is_end && other.m_is_end)
return false;
return &m_table != &other.m_table
|| m_is_end != other.m_is_end
|| m_bucket_index != other.m_bucket_index
|| m_bucket_iterator != other.m_bucket_iterator;
}
bool operator==(const HashTableIterator& other) const { return !(*this != other); }
ElementType& operator*() { return *m_bucket_iterator; }
ElementType* operator->() { return m_bucket_iterator.operator->(); }
HashTableIterator& operator++()
{
skip_to_next();
return *this;
}
void skip_to_next()
{
while (!m_is_end) {
if (m_bucket_iterator.is_end()) {
++m_bucket_index;
if (m_bucket_index >= m_table.capacity()) {
m_is_end = true;
return;
}
m_bucket_iterator = m_table.bucket(m_bucket_index).begin();
} else {
++m_bucket_iterator;
}
if (!m_bucket_iterator.is_end())
return;
}
}
private:
friend HashTableType;
explicit HashTableIterator(HashTableType& table, bool is_end, BucketIteratorType bucket_iterator = {}, size_t bucket_index = 0)
: m_table(table)
, m_bucket_index(bucket_index)
, m_is_end(is_end)
, m_bucket_iterator(bucket_iterator)
public:
bool operator==(const HashTableIterator& other) const { return m_bucket == other.m_bucket; }
bool operator!=(const HashTableIterator& other) const { return m_bucket != other.m_bucket; }
T& operator*() { return *m_bucket->slot(); }
T* operator->() { return m_bucket->slot(); }
void operator++() { skip_to_next(); }
private:
void skip_to_next()
{
ASSERT(!table.m_clearing);
ASSERT(!table.m_rehashing);
if (!is_end && !m_table.is_empty() && m_bucket_iterator.is_end()) {
m_bucket_iterator = m_table.bucket(0).begin();
if (m_bucket_iterator.is_end())
skip_to_next();
}
if (!m_bucket)
return;
do {
++m_bucket;
if (m_bucket->used)
return;
} while (!m_bucket->end);
if (m_bucket->end)
m_bucket = nullptr;
}
HashTableType& m_table;
size_t m_bucket_index { 0 };
bool m_is_end { false };
BucketIteratorType m_bucket_iterator;
explicit HashTableIterator(BucketType* bucket)
: m_bucket(bucket)
{
}
BucketType* m_bucket { nullptr };
};
template<typename T, typename TraitsForT>
class HashTable {
private:
using Bucket = SinglyLinkedList<T>;
struct Bucket {
bool used;
bool deleted;
bool end;
alignas(T) u8 storage[sizeof(T)];
T* slot() { return reinterpret_cast<T*>(storage); }
const T* slot() const { return reinterpret_cast<const T*>(storage); }
};
public:
HashTable() { }
HashTable(size_t capacity)
: m_buckets(new Bucket[capacity])
, m_capacity(capacity)
{
}
HashTable(size_t capacity) { rehash(capacity); }
~HashTable() { clear(); }
HashTable(const HashTable& other)
{
ensure_capacity(other.size());
rehash(other.capacity());
for (auto& it : other)
set(it);
}
HashTable& operator=(const HashTable& other)
{
if (this != &other) {
clear();
ensure_capacity(other.size());
rehash(other.capacity());
for (auto& it : other)
set(it);
}
return *this;
}
HashTable(HashTable&& other)
: m_buckets(other.m_buckets)
, m_size(other.m_size)
, m_capacity(other.m_capacity)
, m_deleted_count(other.m_deleted_count)
{
other.m_size = 0;
other.m_capacity = 0;
other.m_deleted_count = 0;
other.m_buckets = nullptr;
}
HashTable& operator=(HashTable&& other)
{
if (this != &other) {
clear();
m_buckets = other.m_buckets;
m_size = other.m_size;
m_capacity = other.m_capacity;
other.m_size = 0;
other.m_capacity = 0;
other.m_buckets = nullptr;
m_buckets = exchange(other.m_buckets, nullptr);
m_size = exchange(other.m_size, 0);
m_capacity = exchange(other.m_capacity, 0);
m_deleted_count = exchange(other.m_deleted_count, 0);
}
return *this;
}
~HashTable() { clear(); }
bool is_empty() const { return !m_size; }
size_t size() const { return m_size; }
size_t capacity() const { return m_capacity; }
void ensure_capacity(size_t capacity)
{
ASSERT(capacity >= size());
rehash(capacity);
}
HashSetResult set(const T&);
HashSetResult set(T&&);
template<typename U, size_t N>
void set_from(U (&from_array)[N])
{
@ -178,43 +143,93 @@ public:
}
}
bool contains(const T&) const;
void clear();
void ensure_capacity(size_t capacity)
{
ASSERT(capacity >= size());
rehash(capacity * 2);
}
using Iterator = HashTableIterator<HashTable, T, typename Bucket::Iterator>;
friend Iterator;
Iterator begin() { return Iterator(*this, is_empty()); }
Iterator end() { return Iterator(*this, true); }
bool contains(const T& value) const
{
return find(value) != end();
}
using ConstIterator = HashTableIterator<const HashTable, const T, typename Bucket::ConstIterator>;
friend ConstIterator;
ConstIterator begin() const { return ConstIterator(*this, is_empty()); }
ConstIterator end() const { return ConstIterator(*this, true); }
using Iterator = HashTableIterator<HashTable, T, Bucket>;
Iterator begin()
{
for (size_t i = 0; i < m_capacity; ++i) {
if (m_buckets[i].used)
return Iterator(&m_buckets[i]);
}
return end();
}
Iterator end()
{
return Iterator(nullptr);
}
using ConstIterator = HashTableIterator<const HashTable, const T, const Bucket>;
ConstIterator begin() const
{
for (size_t i = 0; i < m_capacity; ++i) {
if (m_buckets[i].used)
return ConstIterator(&m_buckets[i]);
}
return end();
}
ConstIterator end() const
{
return ConstIterator(nullptr);
}
void clear()
{
if (!m_buckets)
return;
for (size_t i = 0; i < m_capacity; ++i) {
if (m_buckets[i].used)
m_buckets[i].slot()->~T();
}
kfree(m_buckets);
m_buckets = nullptr;
m_capacity = 0;
m_size = 0;
m_deleted_count = 0;
}
HashSetResult set(T&& value)
{
auto& bucket = lookup_for_writing(value);
if (bucket.used) {
(*bucket.slot()) = move(value);
return HashSetResult::ReplacedExistingEntry;
}
new (bucket.slot()) T(move(value));
bucket.used = true;
if (bucket.deleted) {
bucket.deleted = false;
--m_deleted_count;
}
++m_size;
return HashSetResult::InsertedNewEntry;
}
HashSetResult set(const T& value)
{
return set(T(value));
}
template<typename Finder>
Iterator find(unsigned hash, Finder finder)
{
if (is_empty())
return end();
size_t bucket_index;
auto& bucket = lookup_with_hash(hash, &bucket_index);
auto bucket_iterator = bucket.find(finder);
if (bucket_iterator != bucket.end())
return Iterator(*this, false, bucket_iterator, bucket_index);
return end();
}
template<typename Finder>
ConstIterator find(unsigned hash, Finder finder) const
{
if (is_empty())
return end();
size_t bucket_index;
auto& bucket = lookup_with_hash(hash, &bucket_index);
auto bucket_iterator = bucket.find(finder);
if (bucket_iterator != bucket.end())
return ConstIterator(*this, false, bucket_iterator, bucket_index);
return end();
return Iterator(lookup_with_hash(hash, move(finder)));
}
Iterator find(const T& value)
@ -222,6 +237,12 @@ public:
return find(TraitsForT::hash(value), [&](auto& other) { return TraitsForT::equals(value, other); });
}
template<typename Finder>
ConstIterator find(unsigned hash, Finder finder) const
{
return ConstIterator(lookup_with_hash(hash, move(finder)));
}
ConstIterator find(const T& value) const
{
return find(TraitsForT::hash(value), [&](auto& other) { return TraitsForT::equals(value, other); });
@ -237,170 +258,120 @@ public:
return false;
}
void remove(Iterator);
void remove(Iterator iterator)
{
ASSERT(iterator.m_bucket);
auto& bucket = *iterator.m_bucket;
ASSERT(bucket.used);
ASSERT(!bucket.end);
ASSERT(!bucket.deleted);
bucket.slot()->~T();
bucket.used = false;
bucket.deleted = true;
--m_size;
++m_deleted_count;
}
private:
Bucket& lookup(const T&, size_t* bucket_index = nullptr);
const Bucket& lookup(const T&, size_t* bucket_index = nullptr) const;
Bucket& lookup_with_hash(unsigned hash, size_t* bucket_index)
void insert_during_rehash(T&& value)
{
if (bucket_index)
*bucket_index = hash % m_capacity;
return m_buckets[hash % m_capacity];
auto& bucket = lookup_for_writing(value);
new (bucket.slot()) T(move(value));
bucket.used = true;
}
const Bucket& lookup_with_hash(unsigned hash, size_t* bucket_index) const
void rehash(size_t new_capacity)
{
if (bucket_index)
*bucket_index = hash % m_capacity;
return m_buckets[hash % m_capacity];
new_capacity = max(new_capacity, static_cast<size_t>(4));
auto* old_buckets = m_buckets;
auto old_capacity = m_capacity;
m_buckets = (Bucket*)kmalloc(sizeof(Bucket) * (new_capacity + 1));
__builtin_memset(m_buckets, 0, sizeof(Bucket) * (new_capacity + 1));
m_capacity = new_capacity;
m_deleted_count = 0;
m_buckets[m_capacity].end = true;
if (!old_buckets)
return;
for (size_t i = 0; i < old_capacity; ++i) {
auto& old_bucket = old_buckets[i];
if (old_bucket.used) {
insert_during_rehash(move(*old_bucket.slot()));
old_bucket.slot()->~T();
}
}
kfree(old_buckets);
}
void rehash(size_t capacity);
void insert(const T&);
void insert(T&&);
template<typename Finder>
Bucket* lookup_with_hash(unsigned hash, Finder finder, Bucket** usable_bucket_for_writing = nullptr) const
{
if (is_empty())
return nullptr;
size_t bucket_index = hash % m_capacity;
for (;;) {
auto& bucket = m_buckets[bucket_index];
Bucket& bucket(size_t index) { return m_buckets[index]; }
const Bucket& bucket(size_t index) const { return m_buckets[index]; }
if (usable_bucket_for_writing && !*usable_bucket_for_writing && !bucket.used) {
*usable_bucket_for_writing = &bucket;
}
if (bucket.used && finder(*bucket.slot()))
return &bucket;
if (!bucket.used && !bucket.deleted)
return nullptr;
hash = double_hash(hash);
bucket_index = hash % m_capacity;
}
}
const Bucket* lookup_for_reading(const T& value) const
{
return lookup_with_hash(TraitsForT::hash(value), [&value](auto& entry) { return TraitsForT::equals(entry, value); });
}
Bucket& lookup_for_writing(const T& value)
{
auto hash = TraitsForT::hash(value);
Bucket* usable_bucket_for_writing = nullptr;
if (auto* bucket_for_reading = lookup_with_hash(
hash,
[&value](auto& entry) { return TraitsForT::equals(entry, value); },
&usable_bucket_for_writing)) {
return *const_cast<Bucket*>(bucket_for_reading);
}
if ((used_bucket_count() + 1) >= m_capacity)
rehash((size() + 1) * 2);
else if (usable_bucket_for_writing)
return *usable_bucket_for_writing;
size_t bucket_index = hash % m_capacity;
for (;;) {
auto& bucket = m_buckets[bucket_index];
if (!bucket.used)
return bucket;
hash = double_hash(hash);
bucket_index = hash % m_capacity;
}
}
size_t used_bucket_count() const { return m_size + m_deleted_count; }
Bucket* m_buckets { nullptr };
size_t m_size { 0 };
size_t m_capacity { 0 };
bool m_clearing { false };
bool m_rehashing { false };
size_t m_deleted_count { 0 };
};
template<typename T, typename TraitsForT>
HashSetResult HashTable<T, TraitsForT>::set(T&& value)
{
if (!m_capacity)
rehash(1);
auto& bucket = lookup(value);
for (auto& e : bucket) {
if (TraitsForT::equals(e, value)) {
e = move(value);
return HashSetResult::ReplacedExistingEntry;
}
}
if (size() >= capacity()) {
rehash(size() + 1);
insert(move(value));
} else {
bucket.append(move(value));
}
m_size++;
return HashSetResult::InsertedNewEntry;
}
template<typename T, typename TraitsForT>
HashSetResult HashTable<T, TraitsForT>::set(const T& value)
{
if (!m_capacity)
rehash(1);
auto& bucket = lookup(value);
for (auto& e : bucket) {
if (TraitsForT::equals(e, value)) {
e = value;
return HashSetResult::ReplacedExistingEntry;
}
}
if (size() >= capacity()) {
rehash(size() + 1);
insert(value);
} else {
bucket.append(value);
}
m_size++;
return HashSetResult::InsertedNewEntry;
}
template<typename T, typename TraitsForT>
void HashTable<T, TraitsForT>::rehash(size_t new_capacity)
{
TemporaryChange<bool> change(m_rehashing, true);
new_capacity *= 2;
auto* new_buckets = new Bucket[new_capacity];
auto* old_buckets = m_buckets;
size_t old_capacity = m_capacity;
m_buckets = new_buckets;
m_capacity = new_capacity;
for (size_t i = 0; i < old_capacity; ++i) {
for (auto& value : old_buckets[i]) {
insert(move(value));
}
}
delete[] old_buckets;
}
template<typename T, typename TraitsForT>
void HashTable<T, TraitsForT>::clear()
{
TemporaryChange<bool> change(m_clearing, true);
if (m_buckets) {
delete[] m_buckets;
m_buckets = nullptr;
}
m_capacity = 0;
m_size = 0;
}
template<typename T, typename TraitsForT>
void HashTable<T, TraitsForT>::insert(T&& value)
{
auto& bucket = lookup(value);
bucket.append(move(value));
}
template<typename T, typename TraitsForT>
void HashTable<T, TraitsForT>::insert(const T& value)
{
auto& bucket = lookup(value);
bucket.append(value);
}
template<typename T, typename TraitsForT>
bool HashTable<T, TraitsForT>::contains(const T& value) const
{
if (is_empty())
return false;
auto& bucket = lookup(value);
for (auto& e : bucket) {
if (TraitsForT::equals(e, value))
return true;
}
return false;
}
template<typename T, typename TraitsForT>
void HashTable<T, TraitsForT>::remove(Iterator it)
{
ASSERT(!is_empty());
m_buckets[it.m_bucket_index].remove(it.m_bucket_iterator);
--m_size;
}
template<typename T, typename TraitsForT>
auto HashTable<T, TraitsForT>::lookup(const T& value, size_t* bucket_index) -> Bucket&
{
unsigned hash = TraitsForT::hash(value);
if (bucket_index)
*bucket_index = hash % m_capacity;
return m_buckets[hash % m_capacity];
}
template<typename T, typename TraitsForT>
auto HashTable<T, TraitsForT>::lookup(const T& value, size_t* bucket_index) const -> const Bucket&
{
unsigned hash = TraitsForT::hash(value);
if (bucket_index)
*bucket_index = hash % m_capacity;
return m_buckets[hash % m_capacity];
}
}
using AK::HashTable;