From 1d4f287582f43acfd76661d20bc2915455809f41 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 11 Jan 2023 08:26:49 -0500 Subject: [PATCH] AK: Implement FlyString for the new String class This implements a FlyString that will de-duplicate String instances. The FlyString will store the raw encoded data of the String instance: If the String is a short string, FlyString holds the String::ShortString bytes; otherwise FlyString holds a pointer to the Detail::StringData. FlyString itself does not know about String's storage or how to refcount its Detail::StringData. It defers to String to implement these details. --- AK/CMakeLists.txt | 1 + AK/FlyString.cpp | 169 +++++++++++++++++++++++++++++++++++++ AK/FlyString.h | 73 ++++++++++++++++ AK/Forward.h | 1 + AK/String.cpp | 65 +++++++++++++- AK/String.h | 16 ++++ Tests/AK/CMakeLists.txt | 1 + Tests/AK/TestFlyString.cpp | 120 ++++++++++++++++++++++++++ 8 files changed, 445 insertions(+), 1 deletion(-) create mode 100644 AK/FlyString.cpp create mode 100644 AK/FlyString.h create mode 100644 Tests/AK/TestFlyString.cpp diff --git a/AK/CMakeLists.txt b/AK/CMakeLists.txt index 778593085eb..cd2bcd0e026 100644 --- a/AK/CMakeLists.txt +++ b/AK/CMakeLists.txt @@ -5,6 +5,7 @@ set(AK_SOURCES DeprecatedFlyString.cpp DeprecatedString.cpp FloatingPointStringConversions.cpp + FlyString.cpp Format.cpp FuzzyMatch.cpp GenericLexer.cpp diff --git a/AK/FlyString.cpp b/AK/FlyString.cpp new file mode 100644 index 00000000000..a475ea39999 --- /dev/null +++ b/AK/FlyString.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2023, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include +#include + +namespace AK { + +static auto& all_fly_strings() +{ + static Singleton> table; + return *table; +} + +FlyString::FlyString() + : m_data(String {}.to_fly_string_data({})) +{ +} + +FlyString::~FlyString() +{ + String::unref_fly_string_data({}, m_data); +} + +ErrorOr FlyString::from_utf8(StringView string) +{ + return FlyString { TRY(String::from_utf8(string)) }; +} + +FlyString::FlyString(String const& string) +{ + if (string.is_short_string()) { + m_data = string.to_fly_string_data({}); + return; + } + + auto it = all_fly_strings().find(string.bytes_as_string_view()); + if (it == all_fly_strings().end()) { + m_data = string.to_fly_string_data({}); + + all_fly_strings().set(string.bytes_as_string_view(), m_data); + string.did_create_fly_string({}); + } else { + m_data = it->value; + } + + String::ref_fly_string_data({}, m_data); +} + +FlyString::FlyString(FlyString const& other) + : m_data(other.m_data) +{ + String::ref_fly_string_data({}, m_data); +} + +FlyString& FlyString::operator=(FlyString const& other) +{ + if (this != &other) { + m_data = other.m_data; + String::ref_fly_string_data({}, m_data); + } + + return *this; +} + +FlyString::FlyString(FlyString&& other) + : m_data(other.m_data) +{ + other.m_data = String {}.to_fly_string_data({}); +} + +FlyString& FlyString::operator=(FlyString&& other) +{ + m_data = other.m_data; + other.m_data = String {}.to_fly_string_data({}); + + return *this; +} + +bool FlyString::is_empty() const +{ + return bytes_as_string_view().is_empty(); +} + +unsigned FlyString::hash() const +{ + return bytes_as_string_view().hash(); +} + +FlyString::operator String() const +{ + return to_string(); +} + +String FlyString::to_string() const +{ + return String::fly_string_data_to_string({}, m_data); +} + +Utf8View FlyString::code_points() const +{ + return Utf8View { bytes_as_string_view() }; +} + +ReadonlyBytes FlyString::bytes() const +{ + return bytes_as_string_view().bytes(); +} + +StringView FlyString::bytes_as_string_view() const +{ + return String::fly_string_data_to_string_view({}, m_data); +} + +bool FlyString::operator==(FlyString const& other) const +{ + return m_data == other.m_data; +} + +bool FlyString::operator==(String const& other) const +{ + if (m_data == other.to_fly_string_data({})) + return true; + + return bytes_as_string_view() == other.bytes_as_string_view(); +} + +bool FlyString::operator==(StringView string) const +{ + return bytes_as_string_view() == string; +} + +bool FlyString::operator==(char const* string) const +{ + return bytes_as_string_view() == string; +} + +void FlyString::did_destroy_fly_string_data(Badge, StringView string_data) +{ + all_fly_strings().remove(string_data); +} + +uintptr_t FlyString::data(Badge) const +{ + return m_data; +} + +size_t FlyString::number_of_fly_strings() +{ + return all_fly_strings().size(); +} + +unsigned Traits::hash(FlyString const& fly_string) +{ + return fly_string.bytes_as_string_view().hash(); +} + +ErrorOr Formatter::format(FormatBuilder& builder, FlyString const& fly_string) +{ + return Formatter::format(builder, fly_string.bytes_as_string_view()); +} + +} diff --git a/AK/FlyString.h b/AK/FlyString.h new file mode 100644 index 00000000000..1052c64ba42 --- /dev/null +++ b/AK/FlyString.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace AK { + +class FlyString { +public: + FlyString(); + ~FlyString(); + + static ErrorOr from_utf8(StringView); + explicit FlyString(String const&); + + FlyString(FlyString const&); + FlyString& operator=(FlyString const&); + + FlyString(FlyString&&); + FlyString& operator=(FlyString&&); + + [[nodiscard]] bool is_empty() const; + [[nodiscard]] unsigned hash() const; + + explicit operator String() const; + String to_string() const; + + [[nodiscard]] Utf8View code_points() const; + [[nodiscard]] ReadonlyBytes bytes() const; + [[nodiscard]] StringView bytes_as_string_view() const; + + [[nodiscard]] bool operator==(FlyString const& other) const; + [[nodiscard]] bool operator==(String const&) const; + [[nodiscard]] bool operator==(StringView) const; + [[nodiscard]] bool operator==(char const*) const; + + static void did_destroy_fly_string_data(Badge, StringView); + [[nodiscard]] uintptr_t data(Badge) const; + + // This is primarily interesting to unit tests. + [[nodiscard]] static size_t number_of_fly_strings(); + +private: + // This will hold either the pointer to the Detail::StringData it represents or the raw bytes of + // an inlined short string. + uintptr_t m_data { 0 }; +}; + +template<> +struct Traits : public GenericTraits { + static unsigned hash(FlyString const&); +}; + +template<> +struct Formatter : Formatter { + ErrorOr format(FormatBuilder&, FlyString const&); +}; + +} + +#if USING_AK_GLOBALLY +using AK::FlyString; +#endif diff --git a/AK/Forward.h b/AK/Forward.h index 6805f659b12..e6db52c42db 100644 --- a/AK/Forward.h +++ b/AK/Forward.h @@ -21,6 +21,7 @@ class Bitmap; using ByteBuffer = Detail::ByteBuffer<32>; class CircularBuffer; class Error; +class FlyString; class GenericLexer; class IPv4Address; class JsonArray; diff --git a/AK/String.cpp b/AK/String.cpp index 2243968b3da..828adcdce15 100644 --- a/AK/String.cpp +++ b/AK/String.cpp @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -59,6 +60,9 @@ public: return m_hash; } + bool is_fly_string() const { return m_is_fly_string; } + void set_fly_string(bool is_fly_string) { m_is_fly_string = is_fly_string; } + private: explicit StringData(size_t byte_count); StringData(StringData const& superstring, size_t start, size_t byte_count); @@ -69,6 +73,7 @@ private: mutable unsigned m_hash { 0 }; mutable bool m_has_hash { false }; bool m_substring { false }; + bool m_is_fly_string { false }; u8 m_bytes_or_substring_data[0]; }; @@ -95,6 +100,8 @@ StringData::StringData(StringData const& superstring, size_t start, size_t byte_ StringData::~StringData() { + if (m_is_fly_string) + FlyString::did_destroy_fly_string_data({}, bytes_as_string_view()); if (m_substring) substring_data().superstring->unref(); } @@ -255,6 +262,13 @@ bool String::operator==(String const& other) const return bytes_as_string_view() == other.bytes_as_string_view(); } +bool String::operator==(FlyString const& other) const +{ + if (reinterpret_cast(m_data) == other.data({})) + return true; + return bytes_as_string_view() == other.bytes_as_string_view(); +} + bool String::operator==(StringView other) const { return bytes_as_string_view() == other; @@ -307,7 +321,7 @@ ErrorOr String::replace(StringView needle, StringView replacement, Repla bool String::is_short_string() const { - return reinterpret_cast(m_data) & SHORT_STRING_FLAG; + return has_short_string_bit(reinterpret_cast(m_data)); } ReadonlyBytes String::ShortString::bytes() const @@ -325,6 +339,55 @@ unsigned Traits::hash(String const& string) return string.hash(); } +String String::fly_string_data_to_string(Badge, uintptr_t const& data) +{ + if (has_short_string_bit(data)) + return String { *reinterpret_cast(&data) }; + + auto const* string_data = reinterpret_cast(data); + return String { NonnullRefPtr(*string_data) }; +} + +StringView String::fly_string_data_to_string_view(Badge, uintptr_t const& data) +{ + if (has_short_string_bit(data)) { + auto const* short_string = reinterpret_cast(&data); + return short_string->bytes(); + } + + auto const* string_data = reinterpret_cast(data); + return string_data->bytes_as_string_view(); +} + +uintptr_t String::to_fly_string_data(Badge) const +{ + return reinterpret_cast(m_data); +} + +void String::ref_fly_string_data(Badge, uintptr_t data) +{ + if (has_short_string_bit(data)) + return; + + auto const* string_data = reinterpret_cast(data); + string_data->ref(); +} + +void String::unref_fly_string_data(Badge, uintptr_t data) +{ + if (has_short_string_bit(data)) + return; + + auto const* string_data = reinterpret_cast(data); + string_data->unref(); +} + +void String::did_create_fly_string(Badge) const +{ + VERIFY(!is_short_string()); + m_data->set_fly_string(true); +} + DeprecatedString String::to_deprecated_string() const { return DeprecatedString(bytes_as_string_view()); diff --git a/AK/String.h b/AK/String.h index 7a8eecce15c..b0d4f3e2a12 100644 --- a/AK/String.h +++ b/AK/String.h @@ -73,6 +73,9 @@ public: [[nodiscard]] bool operator==(String const&) const; [[nodiscard]] bool operator!=(String const& other) const { return !(*this == other); } + [[nodiscard]] bool operator==(FlyString const&) const; + [[nodiscard]] bool operator!=(FlyString const& other) const { return !(*this == other); } + [[nodiscard]] bool operator==(StringView) const; [[nodiscard]] bool operator!=(StringView other) const { return !(*this == other); } @@ -102,6 +105,14 @@ public: // NOTE: This is primarily interesting to unit tests. [[nodiscard]] bool is_short_string() const; + [[nodiscard]] static String fly_string_data_to_string(Badge, uintptr_t const&); + [[nodiscard]] static StringView fly_string_data_to_string_view(Badge, uintptr_t const&); + [[nodiscard]] uintptr_t to_fly_string_data(Badge) const; + + static void ref_fly_string_data(Badge, uintptr_t); + static void unref_fly_string_data(Badge, uintptr_t); + void did_create_fly_string(Badge) const; + // FIXME: Remove these once all code has been ported to String [[nodiscard]] DeprecatedString to_deprecated_string() const; static ErrorOr from_deprecated_string(DeprecatedString const&); @@ -110,6 +121,11 @@ private: // NOTE: If the least significant bit of the pointer is set, this is a short string. static constexpr uintptr_t SHORT_STRING_FLAG = 1; + static constexpr bool has_short_string_bit(uintptr_t data) + { + return (data & SHORT_STRING_FLAG) != 0; + } + struct ShortString { ReadonlyBytes bytes() const; size_t byte_count() const; diff --git a/Tests/AK/CMakeLists.txt b/Tests/AK/CMakeLists.txt index 957304cb778..ef7b72b22c9 100644 --- a/Tests/AK/CMakeLists.txt +++ b/Tests/AK/CMakeLists.txt @@ -30,6 +30,7 @@ set(AK_TEST_SOURCES TestFixedPoint.cpp TestFloatingPoint.cpp TestFloatingPointParsing.cpp + TestFlyString.cpp TestFormat.cpp TestGenericLexer.cpp TestHashFunctions.cpp diff --git a/Tests/AK/TestFlyString.cpp b/Tests/AK/TestFlyString.cpp new file mode 100644 index 00000000000..f2eb60b232d --- /dev/null +++ b/Tests/AK/TestFlyString.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include + +#include +#include +#include + +TEST_CASE(empty_string) +{ + FlyString fly {}; + EXPECT(fly.is_empty()); + EXPECT_EQ(fly, ""sv); + + // Short strings do not get stored in the fly string table. + EXPECT_EQ(FlyString::number_of_fly_strings(), 0u); +} + +TEST_CASE(short_string) +{ + FlyString fly1 { MUST(String::from_utf8("foo"sv)) }; + EXPECT_EQ(fly1, "foo"sv); + + FlyString fly2 { MUST(String::from_utf8("foo"sv)) }; + EXPECT_EQ(fly2, "foo"sv); + + FlyString fly3 { MUST(String::from_utf8("bar"sv)) }; + EXPECT_EQ(fly3, "bar"sv); + + EXPECT_EQ(fly1, fly2); + EXPECT_NE(fly1, fly3); + EXPECT_NE(fly2, fly3); + + EXPECT(fly1.to_string().is_short_string()); + EXPECT(fly2.to_string().is_short_string()); + EXPECT(fly3.to_string().is_short_string()); + + // Short strings do not get stored in the fly string table. + EXPECT_EQ(FlyString::number_of_fly_strings(), 0u); +} + +TEST_CASE(long_string) +{ + FlyString fly1 { MUST(String::from_utf8("thisisdefinitelymorethan7bytes"sv)) }; + EXPECT_EQ(fly1, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(FlyString::number_of_fly_strings(), 1u); + + FlyString fly2 { MUST(String::from_utf8("thisisdefinitelymorethan7bytes"sv)) }; + EXPECT_EQ(fly2, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(FlyString::number_of_fly_strings(), 1u); + + FlyString fly3 { MUST(String::from_utf8("thisisalsoforsuremorethan7bytes"sv)) }; + EXPECT_EQ(fly3, "thisisalsoforsuremorethan7bytes"sv); + EXPECT_EQ(FlyString::number_of_fly_strings(), 2u); + + EXPECT_EQ(fly1, fly2); + EXPECT_NE(fly1, fly3); + EXPECT_NE(fly2, fly3); + + EXPECT(!fly1.to_string().is_short_string()); + EXPECT(!fly2.to_string().is_short_string()); + EXPECT(!fly3.to_string().is_short_string()); +} + +TEST_CASE(from_string_view) +{ + auto fly1 = MUST(FlyString::from_utf8("thisisdefinitelymorethan7bytes"sv)); + EXPECT_EQ(fly1, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(FlyString::number_of_fly_strings(), 1u); + + auto fly2 = MUST(FlyString::from_utf8("thisisdefinitelymorethan7bytes"sv)); + EXPECT_EQ(fly2, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(FlyString::number_of_fly_strings(), 1u); + + auto fly3 = MUST(FlyString::from_utf8("foo"sv)); + EXPECT_EQ(fly3, "foo"sv); + EXPECT_EQ(FlyString::number_of_fly_strings(), 1u); + + EXPECT_EQ(fly1, fly2); + EXPECT_NE(fly1, fly3); + EXPECT_NE(fly2, fly3); +} + +TEST_CASE(fly_string_keep_string_data_alive) +{ + EXPECT_EQ(FlyString::number_of_fly_strings(), 0u); + { + FlyString fly {}; + { + auto string = MUST(String::from_utf8("thisisdefinitelymorethan7bytes"sv)); + fly = FlyString { string }; + EXPECT_EQ(FlyString::number_of_fly_strings(), 1u); + } + + EXPECT_EQ(fly, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(FlyString::number_of_fly_strings(), 1u); + } + + EXPECT_EQ(FlyString::number_of_fly_strings(), 0u); +} + +TEST_CASE(moved_fly_string_becomes_empty) +{ + FlyString fly1 {}; + EXPECT(fly1.is_empty()); + + FlyString fly2 { MUST(String::from_utf8("thisisdefinitelymorethan7bytes"sv)) }; + EXPECT_EQ(fly2, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(FlyString::number_of_fly_strings(), 1u); + + fly1 = move(fly2); + + EXPECT(fly2.is_empty()); + EXPECT_EQ(fly1, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(FlyString::number_of_fly_strings(), 1u); +}