AK: Construct Strings from StringBuilder without re-allocating the data

Currently, invoking StringBuilder::to_string will re-allocate the string
data to construct the String. This is wasteful both in terms of memory
and speed.

The goal here is to simply hand the string buffer over to String, and
let String take ownership of that buffer. To do this, StringBuilder must
have the same memory layout as Detail::StringData. This layout is just
the members of the StringData class followed by the string itself.

So when a StringBuilder is created, we reserve sizeof(StringData) bytes
at the front of the buffer. StringData can then construct itself into
the buffer with placement new.

Things to note:
* StringData must now be aware of the actual capacity of its buffer, as
  that can be larger than the string size.
* We must take care not to pass ownership of inlined string buffers, as
  these live on the stack.
This commit is contained in:
Timothy Flynn 2024-07-19 15:38:41 -04:00 committed by Andreas Kling
parent 77eef8a8f6
commit 29879a69a4
Notes: github-actions[bot] 2024-07-20 07:31:38 +00:00
9 changed files with 139 additions and 28 deletions

View file

@ -8,6 +8,7 @@
#pragma once
#include <AK/Assertions.h>
#include <AK/Badge.h>
#include <AK/Error.h>
#include <AK/Span.h>
#include <AK/Types.h>
@ -301,6 +302,23 @@ public:
operator ReadonlyBytes() const { return bytes(); }
ALWAYS_INLINE size_t capacity() const { return m_inline ? inline_capacity : m_outline_capacity; }
ALWAYS_INLINE bool is_inline() const { return m_inline; }
struct OutlineBuffer {
Bytes buffer;
size_t capacity { 0 };
};
Optional<OutlineBuffer> leak_outline_buffer(Badge<StringBuilder>)
{
if (m_inline)
return {};
auto buffer = bytes();
m_inline = true;
m_size = 0;
return OutlineBuffer { buffer, capacity() };
}
private:
void move_from(ByteBuffer&& other)

View file

@ -16,6 +16,8 @@ namespace AK {
namespace Detail {
template<size_t inline_capacity>
class ByteBuffer;
class StringData;
}
enum class TrailingCodePointTransformation : u8;

View file

@ -96,6 +96,23 @@ ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
return result;
}
ErrorOr<String> String::from_string_builder(Badge<StringBuilder>, StringBuilder& builder)
{
if (!Utf8View { builder.string_view() }.validate())
return Error::from_string_literal("String::from_string_builder: Input was not valid UTF-8");
String result;
result.replace_with_string_builder(builder);
return result;
}
String String::from_string_builder_without_validation(Badge<StringBuilder>, StringBuilder& builder)
{
String result;
result.replace_with_string_builder(builder);
return result;
}
ErrorOr<String> String::repeated(u32 code_point, size_t count)
{
VERIFY(is_unicode(code_point));

View file

@ -57,6 +57,9 @@ public:
[[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes);
static ErrorOr<String> from_string_builder(Badge<StringBuilder>, StringBuilder&);
[[nodiscard]] static String from_string_builder_without_validation(Badge<StringBuilder>, StringBuilder&);
// Creates a new String from a sequence of UTF-16 encoded code points.
static ErrorOr<String> from_utf16(Utf16View const&);

View file

@ -90,6 +90,19 @@ bool StringBase::operator==(StringBase const& other) const
return bytes() == other.bytes();
}
void StringBase::replace_with_string_builder(StringBuilder& builder)
{
if (builder.length() <= MAX_SHORT_STRING_BYTE_COUNT) {
return replace_with_new_short_string(builder.length(), [&](Bytes buffer) {
builder.string_view().bytes().copy_to(buffer);
});
}
destroy_string();
m_data = &StringData::create_from_string_builder(builder).leak_ref();
}
ErrorOr<Bytes> StringBase::replace_with_uninitialized_buffer(size_t byte_count)
{
if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT)

View file

@ -92,6 +92,8 @@ protected:
callback(buffer);
}
void replace_with_string_builder(StringBuilder&);
// This is not a trivial operation with storage, so it does not belong here. Unfortunately, it
// is impossible to implement it without access to StringData.
ErrorOr<StringBase> substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const;

View file

@ -11,6 +11,7 @@
#include <AK/FlyString.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/StringData.h>
#include <AK/StringView.h>
#include <AK/UnicodeUtils.h>
#include <AK/Utf16View.h>
@ -18,16 +19,33 @@
namespace AK {
static constexpr auto STRING_BASE_PREFIX_SIZE = sizeof(Detail::StringData);
static ErrorOr<StringBuilder::Buffer> create_buffer(size_t capacity)
{
StringBuilder::Buffer buffer;
if (capacity > StringBuilder::inline_capacity)
TRY(buffer.try_ensure_capacity(STRING_BASE_PREFIX_SIZE + capacity));
TRY(buffer.try_resize(STRING_BASE_PREFIX_SIZE));
return buffer;
}
ErrorOr<StringBuilder> StringBuilder::create(size_t initial_capacity)
{
StringBuilder builder;
TRY(builder.m_buffer.try_ensure_capacity(initial_capacity));
return builder;
auto buffer = TRY(create_buffer(initial_capacity));
return StringBuilder { move(buffer) };
}
StringBuilder::StringBuilder(size_t initial_capacity)
: m_buffer(MUST(create_buffer(initial_capacity)))
{
}
StringBuilder::StringBuilder(Buffer buffer)
: m_buffer(move(buffer))
{
m_buffer.ensure_capacity(initial_capacity);
}
inline ErrorOr<void> StringBuilder::will_append(size_t size)
@ -47,12 +65,12 @@ inline ErrorOr<void> StringBuilder::will_append(size_t size)
size_t StringBuilder::length() const
{
return m_buffer.size();
return m_buffer.size() - STRING_BASE_PREFIX_SIZE;
}
bool StringBuilder::is_empty() const
{
return m_buffer.is_empty();
return length() == 0;
}
void StringBuilder::trim(size_t count)
@ -122,14 +140,18 @@ ByteString StringBuilder::to_byte_string() const
return ByteString((char const*)data(), length());
}
ErrorOr<String> StringBuilder::to_string() const
ErrorOr<String> StringBuilder::to_string()
{
return String::from_utf8(string_view());
if (m_buffer.is_inline())
return String::from_utf8(string_view());
return String::from_string_builder({}, *this);
}
String StringBuilder::to_string_without_validation() const
String StringBuilder::to_string_without_validation()
{
return String::from_utf8_without_validation(string_view().bytes());
if (m_buffer.is_inline())
return String::from_utf8_without_validation(string_view().bytes());
return String::from_string_builder_without_validation({}, *this);
}
FlyString StringBuilder::to_fly_string_without_validation() const
@ -144,22 +166,22 @@ ErrorOr<FlyString> StringBuilder::to_fly_string() const
u8* StringBuilder::data()
{
return m_buffer.data();
return m_buffer.data() + STRING_BASE_PREFIX_SIZE;
}
u8 const* StringBuilder::data() const
{
return m_buffer.data();
return m_buffer.data() + STRING_BASE_PREFIX_SIZE;
}
StringView StringBuilder::string_view() const
{
return StringView { data(), m_buffer.size() };
return m_buffer.span().slice(STRING_BASE_PREFIX_SIZE);
}
void StringBuilder::clear()
{
m_buffer.clear();
m_buffer.resize(STRING_BASE_PREFIX_SIZE);
}
ErrorOr<void> StringBuilder::try_append_code_point(u32 code_point)
@ -272,4 +294,14 @@ ErrorOr<void> StringBuilder::try_append_escaped_for_json(StringView string)
return {};
}
auto StringBuilder::leak_buffer_for_string_construction(Badge<Detail::StringData>) -> Optional<Buffer::OutlineBuffer>
{
if (auto buffer = m_buffer.leak_outline_buffer({}); buffer.has_value()) {
clear();
return buffer;
}
return {};
}
}

View file

@ -18,6 +18,7 @@ class StringBuilder {
public:
static constexpr size_t inline_capacity = 256;
using Buffer = Detail::ByteBuffer<inline_capacity>;
using OutputType = ByteString;
static ErrorOr<StringBuilder> create(size_t initial_capacity = inline_capacity);
@ -61,8 +62,8 @@ public:
[[nodiscard]] ByteString to_byte_string() const;
[[nodiscard]] String to_string_without_validation() const;
ErrorOr<String> to_string() const;
[[nodiscard]] String to_string_without_validation();
ErrorOr<String> to_string();
[[nodiscard]] FlyString to_fly_string_without_validation() const;
ErrorOr<FlyString> to_fly_string() const;
@ -95,12 +96,16 @@ public:
return {};
}
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction(Badge<Detail::StringData>);
private:
explicit StringBuilder(Buffer);
ErrorOr<void> will_append(size_t);
u8* data();
u8 const* data() const;
Detail::ByteBuffer<inline_capacity> m_buffer;
Buffer m_buffer;
};
}

View file

@ -11,6 +11,7 @@
#include <AK/NonnullRefPtr.h>
#include <AK/RefCounted.h>
#include <AK/StringBase.h>
#include <AK/StringBuilder.h>
#include <AK/kmalloc.h>
namespace AK::Detail {
@ -20,25 +21,39 @@ public:
static ErrorOr<NonnullRefPtr<StringData>> create_uninitialized(size_t byte_count, u8*& buffer)
{
VERIFY(byte_count);
void* slot = malloc(allocation_size_for_string_data(byte_count));
if (!slot) {
auto capacity = allocation_size_for_string_data(byte_count);
void* slot = malloc(capacity);
if (!slot)
return Error::from_errno(ENOMEM);
}
auto new_string_data = adopt_ref(*new (slot) StringData(byte_count));
auto new_string_data = adopt_ref(*new (slot) StringData(byte_count, capacity));
buffer = const_cast<u8*>(new_string_data->bytes().data());
return new_string_data;
}
static NonnullRefPtr<StringData> create_from_string_builder(StringBuilder& builder)
{
auto byte_count = builder.length();
VERIFY(byte_count > MAX_SHORT_STRING_BYTE_COUNT);
auto buffer = builder.leak_buffer_for_string_construction({});
VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined.
return adopt_ref(*new (buffer->buffer.data()) StringData(byte_count, buffer->capacity));
}
static ErrorOr<NonnullRefPtr<StringData>> create_substring(StringData const& superstring, size_t start, size_t byte_count)
{
// Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
VERIFY(byte_count > MAX_SHORT_STRING_BYTE_COUNT);
void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData));
if (!slot) {
auto capacity = sizeof(StringData) + sizeof(StringData::SubstringData);
void* slot = malloc(capacity);
if (!slot)
return Error::from_errno(ENOMEM);
}
return adopt_ref(*new (slot) StringData(superstring, start, byte_count));
return adopt_ref(*new (slot) StringData(superstring, start, byte_count, capacity));
}
struct SubstringData {
@ -48,7 +63,7 @@ public:
void operator delete(void* ptr)
{
kfree_sized(ptr, allocation_size_for_string_data(static_cast<StringData const*>(ptr)->m_byte_count));
kfree_sized(ptr, static_cast<StringData const*>(ptr)->m_capacity);
}
~StringData()
@ -99,13 +114,15 @@ private:
return sizeof(StringData) + (sizeof(char) * length);
}
explicit StringData(size_t byte_count)
StringData(size_t byte_count, size_t capacity)
: m_byte_count(byte_count)
, m_capacity(capacity)
{
}
StringData(StringData const& superstring, size_t start, size_t byte_count)
StringData(StringData const& superstring, size_t start, size_t byte_count, size_t capacity)
: m_byte_count(byte_count)
, m_capacity(capacity)
, m_substring(true)
{
auto& data = const_cast<SubstringData&>(substring_data());
@ -125,6 +142,8 @@ private:
}
u32 m_byte_count { 0 };
u32 m_capacity { 0 };
mutable unsigned m_hash { 0 };
mutable bool m_has_hash { false };
bool m_substring { false };