AK: Introduce the new String, replacement for DeprecatedString

DeprecatedString (formerly String) has been with us since the start,
and it has served us well. However, it has a number of shortcomings
that I'd like to address.

Some of these issues are hard if not impossible to solve incrementally
inside of DeprecatedString, so instead of doing that, let's build a new
String class and then incrementally move over to it instead.

Problems in DeprecatedString:

- It assumes string allocation never fails. This makes it impossible
  to use in allocation-sensitive contexts, and is the reason we had to
  ban DeprecatedString from the kernel entirely.

- The awkward null state. DeprecatedString can be null. It's different
  from the empty state, although null strings are considered empty.
  All code is immediately nicer when using Optional<DeprecatedString>
  but DeprecatedString came before Optional, which is how we ended up
  like this.

- The encoding of the underlying data is ambiguous. For the most part,
  we use it as if it's always UTF-8, but there have been cases where
  we pass around strings in other encodings (e.g ISO8859-1)

- operator[] and length() are used to iterate over DeprecatedString one
  byte at a time. This is done all over the codebase, and will *not*
  give the right results unless the string is all ASCII.

How we solve these issues in the new String:

- Functions that may allocate now return ErrorOr<String> so that ENOMEM
  errors can be passed to the caller.

- String has no null state. Use Optional<String> when needed.

- String is always UTF-8. This is validated when constructing a String.
  We may need to add a bypass for this in the future, for cases where
  you have a known-good string, but for now: validate all the things!

- There is no operator[] or length(). You can get the underlying data
  with bytes(), but for iterating over code points, you should be using
  an UTF-8 iterator.

Furthermore, it has two nifty new features:

- String implements a small string optimization (SSO) for strings that
  can fit entirely within a pointer. This means up to 3 bytes on 32-bit
  platforms, and 7 bytes on 64-bit platforms. Such small strings will
  not be heap-allocated.

- String can create substrings without making a deep copy of the
  substring. Instead, the superstring gets +1 refcount from the
  substring, and it acts like a view into the superstring. To make
  substrings like this, use the substring_with_shared_superstring() API.

One caveat:

- String does not guarantee that the underlying data is null-terminated
  like DeprecatedString does today. While this was nifty in a handful of
  places where we were calling C functions, it did stand in the way of
  shared-superstring substrings.
This commit is contained in:
Andreas Kling 2022-12-01 13:27:43 +01:00
parent d50b9165cd
commit a3e82eaad3
Notes: sideshowbarker 2024-07-17 03:42:41 +09:00
10 changed files with 616 additions and 1 deletions

View file

@ -15,6 +15,7 @@ set(AK_SOURCES
LexicalPath.cpp
Random.cpp
StackInfo.cpp
String.cpp
StringBuilder.cpp
StringFloatingPointConversions.cpp
StringImpl.cpp

View file

@ -31,6 +31,7 @@ class StringView;
class Time;
class URL;
class FlyString;
class String;
class Utf16View;
class Utf32View;
class Utf8CodePointIterator;
@ -188,6 +189,7 @@ using AK::RefPtr;
using AK::SinglyLinkedList;
using AK::Span;
using AK::StackInfo;
using AK::String;
using AK::StringBuilder;
using AK::StringImpl;
using AK::StringView;

337
AK/String.cpp Normal file
View file

@ -0,0 +1,337 @@
/*
* Copyright (c) 2018-2022, Andreas Kling <kling@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Checked.h>
#include <AK/Format.h>
#include <AK/Memory.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Utf8View.h>
#include <stdlib.h>
namespace AK {
namespace Detail {
class StringData final : public RefCounted<StringData> {
public:
static ErrorOr<NonnullRefPtr<StringData>> create_uninitialized(size_t, u8*& buffer);
static ErrorOr<NonnullRefPtr<StringData>> create_substring(StringData const& superstring, size_t start, size_t byte_count);
static ErrorOr<NonnullRefPtr<StringData>> from_utf8(char const* utf8_bytes, size_t);
struct SubstringData {
StringData const* superstring { nullptr };
u32 start_offset { 0 };
};
void operator delete(void* ptr);
~StringData();
SubstringData const& substring_data() const
{
return *reinterpret_cast<SubstringData const*>(m_bytes_or_substring_data);
}
// NOTE: There is no guarantee about null-termination.
ReadonlyBytes bytes() const
{
if (m_substring) {
auto const& data = substring_data();
return data.superstring->bytes().slice(data.start_offset, m_byte_count);
}
return { &m_bytes_or_substring_data[0], m_byte_count };
}
StringView bytes_as_string_view() const { return { bytes() }; }
bool operator==(StringData const& other) const
{
return bytes_as_string_view() == other.bytes_as_string_view();
}
unsigned hash() const
{
if (!m_has_hash)
compute_hash();
return m_hash;
}
private:
explicit StringData(size_t byte_count);
StringData(StringData const& superstring, size_t start, size_t byte_count);
void compute_hash() const;
u32 m_byte_count { 0 };
mutable unsigned m_hash { 0 };
mutable bool m_has_hash { false };
bool m_substring { false };
u8 m_bytes_or_substring_data[0];
};
void StringData::operator delete(void* ptr)
{
free(ptr);
}
StringData::StringData(size_t byte_count)
: m_byte_count(byte_count)
{
}
StringData::StringData(StringData const& superstring, size_t start, size_t byte_count)
: m_byte_count(byte_count)
, m_substring(true)
{
auto& data = const_cast<SubstringData&>(substring_data());
data.start_offset = start;
data.superstring = &superstring;
superstring.ref();
}
StringData::~StringData()
{
if (m_substring)
substring_data().superstring->unref();
}
constexpr size_t allocation_size_for_string_data(size_t length)
{
return sizeof(StringData) + (sizeof(char) * length) + sizeof(char);
}
ErrorOr<NonnullRefPtr<StringData>> StringData::create_uninitialized(size_t byte_count, u8*& buffer)
{
VERIFY(byte_count);
void* slot = malloc(allocation_size_for_string_data(byte_count));
if (!slot) {
return Error::from_errno(ENOMEM);
}
auto new_string_data = adopt_ref(*new (slot) StringData(byte_count));
buffer = const_cast<u8*>(new_string_data->bytes().data());
return new_string_data;
}
ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data, size_t byte_count)
{
// Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
Utf8View view(StringView(utf8_data, byte_count));
if (!view.validate())
return Error::from_string_literal("StringData::from_utf8: Input was not valid UTF-8");
VERIFY(utf8_data);
u8* buffer = nullptr;
auto new_string_data = TRY(create_uninitialized(byte_count, buffer));
memcpy(buffer, utf8_data, byte_count * sizeof(char));
return new_string_data;
}
ErrorOr<NonnullRefPtr<StringData>> StringData::create_substring(StringData const& superstring, size_t start, size_t byte_count)
{
// Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData));
if (!slot) {
return Error::from_errno(ENOMEM);
}
return adopt_ref(*new (slot) StringData(superstring, start, byte_count));
}
void StringData::compute_hash() const
{
auto bytes = this->bytes();
if (bytes.size() == 0)
m_hash = 0;
else
m_hash = string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
m_has_hash = true;
}
}
String::String(NonnullRefPtr<Detail::StringData> data)
: m_data(&data.leak_ref())
{
}
String::String(ShortString short_string)
: m_short_string(short_string)
{
}
String::String(String const& other)
: m_data(other.m_data)
{
if (!is_short_string())
m_data->ref();
}
String::String(String&& other)
: m_data(exchange(other.m_data, nullptr))
{
}
String& String::operator=(String&& other)
{
m_data = exchange(other.m_data, nullptr);
return *this;
}
String& String::operator=(String const& other)
{
if (&other != this) {
m_data = other.m_data;
if (!is_short_string())
m_data->ref();
}
return *this;
}
String::~String()
{
if (!is_short_string() && m_data)
m_data->unref();
}
String::String()
{
// This is an empty string, it's always short and zero-length.
m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG;
}
ErrorOr<String> String::from_utf8(StringView view)
{
if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) {
ShortString short_string;
if (!view.is_empty())
memcpy(short_string.storage, view.characters_without_null_termination(), view.length());
short_string.byte_count_and_short_string_flag = (view.length() << 1) | SHORT_STRING_FLAG;
return String { short_string };
}
auto data = TRY(Detail::StringData::from_utf8(view.characters_without_null_termination(), view.length()));
return String { move(data) };
}
StringView String::bytes_as_string_view() const
{
return StringView(bytes());
}
ReadonlyBytes String::bytes() const
{
if (is_short_string())
return m_short_string.bytes();
return m_data->bytes();
}
bool String::is_empty() const
{
return bytes().size() == 0;
}
ErrorOr<String> String::vformatted(StringView fmtstr, TypeErasedFormatParams& params)
{
StringBuilder builder;
TRY(vformat(builder, fmtstr, params));
return builder.to_string();
}
bool String::operator==(String const& other) const
{
if (is_short_string())
return m_data == other.m_data;
return bytes_as_string_view() == other.bytes_as_string_view();
}
bool String::operator==(StringView other) const
{
return bytes_as_string_view() == other;
}
ErrorOr<String> String::substring_from_byte_offset(size_t start, size_t byte_count) const
{
if (!byte_count)
return String {};
return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
}
ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const
{
if (!byte_count)
return String {};
if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT)
return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
return String { TRY(Detail::StringData::create_substring(*m_data, start, byte_count)) };
}
bool String::operator==(char const* c_string) const
{
return bytes_as_string_view() == c_string;
}
u32 String::hash() const
{
if (is_short_string()) {
auto bytes = this->bytes();
return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
}
return m_data->hash();
}
Utf8View String::code_points() const
{
return Utf8View(bytes_as_string_view());
}
ErrorOr<void> Formatter<String>::format(FormatBuilder& builder, String const& utf8_string)
{
return Formatter<StringView>::format(builder, utf8_string.bytes_as_string_view());
}
ErrorOr<String> String::replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const
{
return StringUtils::replace(*this, needle, replacement, replace_mode);
}
bool String::is_short_string() const
{
return reinterpret_cast<uintptr_t>(m_data) & SHORT_STRING_FLAG;
}
ReadonlyBytes String::ShortString::bytes() const
{
return { storage, byte_count() };
}
size_t String::ShortString::byte_count() const
{
return byte_count_and_short_string_flag >> 1;
}
unsigned Traits<String>::hash(String const& string)
{
return string.hash();
}
DeprecatedString String::to_deprecated_string() const
{
return DeprecatedString(bytes_as_string_view());
}
ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string)
{
Utf8View view(deprecated_string);
if (!view.validate())
return Error::from_string_literal("String::from_deprecated_string: Input was not valid UTF-8");
return String::from_utf8(deprecated_string.view());
}
}

132
AK/String.h Normal file
View file

@ -0,0 +1,132 @@
/*
* Copyright (c) 2018-2022, Andreas Kling <kling@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Format.h>
#include <AK/Forward.h>
#include <AK/RefCounted.h>
#include <AK/Span.h>
#include <AK/StringView.h>
#include <AK/Traits.h>
#include <AK/Types.h>
namespace AK {
namespace Detail {
class StringData;
}
// String is a strongly owned sequence of Unicode code points encoded as UTF-8.
// The data may or may not be heap-allocated, and may or may not be reference counted.
// There is no guarantee that the underlying bytes are null-terminated.
class String {
public:
// NOTE: For short strings, we avoid heap allocations by storing them in the data pointer slot.
static constexpr size_t MAX_SHORT_STRING_BYTE_COUNT = sizeof(Detail::StringData*) - 1;
String(String const&);
String(String&&);
String& operator=(String&&);
String& operator=(String const&);
~String();
// Creates an empty (zero-length) String.
String();
// Creates a new String from a sequence of UTF-8 encoded code points.
static ErrorOr<String> from_utf8(StringView);
// Creates a substring with a deep copy of the specified data window.
ErrorOr<String> substring_from_byte_offset(size_t start, size_t byte_count) const;
// Creates a substring that strongly references the origin superstring instead of making a deep copy of the data.
ErrorOr<String> substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const;
// Returns an iterable view over the Unicode code points.
[[nodiscard]] Utf8View code_points() const;
// Returns the underlying UTF-8 encoded bytes.
// NOTE: There is no guarantee about null-termination.
[[nodiscard]] ReadonlyBytes bytes() const;
// Returns true if the String is zero-length.
[[nodiscard]] bool is_empty() const;
// Returns a StringView covering the full length of the string. Note that iterating this will go byte-at-a-time, not code-point-at-a-time.
[[nodiscard]] StringView bytes_as_string_view() const;
ErrorOr<String> replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const;
[[nodiscard]] bool operator==(String const&) const;
[[nodiscard]] bool operator!=(String const& other) const { return !(*this == other); }
[[nodiscard]] bool operator==(StringView) const;
[[nodiscard]] bool operator!=(StringView other) const { return !(*this == other); }
[[nodiscard]] bool operator==(char const* cstring) const;
[[nodiscard]] bool operator!=(char const* cstring) const { return !(*this == cstring); }
[[nodiscard]] u32 hash() const;
template<typename T>
static ErrorOr<String> number(T value)
requires IsArithmetic<T>
{
return formatted("{}", value);
}
static ErrorOr<String> vformatted(StringView fmtstr, TypeErasedFormatParams&);
template<typename... Parameters>
static ErrorOr<String> formatted(CheckedFormatString<Parameters...>&& fmtstr, Parameters const&... parameters)
{
VariadicFormatParams variadic_format_parameters { parameters... };
return vformatted(fmtstr.view(), variadic_format_parameters);
}
// NOTE: This is primarily interesting to unit tests.
[[nodiscard]] bool is_short_string() const;
// FIXME: Remove these once all code has been ported to String
[[nodiscard]] DeprecatedString to_deprecated_string() const;
static ErrorOr<String> from_deprecated_string(DeprecatedString const&);
private:
// NOTE: If the least significant bit of the pointer is set, this is a short string.
static constexpr uintptr_t SHORT_STRING_FLAG = 1;
struct ShortString {
ReadonlyBytes bytes() const;
size_t byte_count() const;
// NOTE: This is the byte count shifted left 1 step and or'ed with a 1 (the SHORT_STRING_FLAG)
u8 byte_count_and_short_string_flag { 0 };
u8 storage[MAX_SHORT_STRING_BYTE_COUNT] = { 0 };
};
explicit String(NonnullRefPtr<Detail::StringData>);
explicit String(ShortString);
union {
ShortString m_short_string;
Detail::StringData* m_data { nullptr };
};
};
template<>
struct Traits<String> : public GenericTraits<String> {
static unsigned hash(String const&);
};
template<>
struct Formatter<String> : Formatter<StringView> {
ErrorOr<void> format(FormatBuilder&, String const&);
};
}

View file

@ -8,6 +8,7 @@
#include <AK/Checked.h>
#include <AK/PrintfImplementation.h>
#include <AK/StdLibExtras.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/UnicodeUtils.h>
@ -115,6 +116,11 @@ DeprecatedString StringBuilder::build() const
{
return to_deprecated_string();
}
ErrorOr<String> StringBuilder::to_string() const
{
return String::from_utf8(string_view());
}
#endif
StringView StringBuilder::string_view() const

View file

@ -62,7 +62,9 @@ public:
#ifndef KERNEL
[[nodiscard]] DeprecatedString build() const;
[[nodiscard]] DeprecatedString to_deprecated_string() const;
ErrorOr<String> to_string() const;
#endif
[[nodiscard]] ByteBuffer to_byte_buffer() const;
[[nodiscard]] StringView string_view() const;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2020, Andreas Kling <awesomekling@gmail.com>
* Copyright (c) 2018-2022, Andreas Kling <awesomekling@gmail.com>
* Copyright (c) 2020, Fei Wu <f.eiwu@yahoo.com>
*
* SPDX-License-Identifier: BSD-2-Clause
@ -9,6 +9,7 @@
#include <AK/MemMem.h>
#include <AK/Memory.h>
#include <AK/Optional.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/StringUtils.h>
#include <AK/StringView.h>
@ -533,6 +534,35 @@ DeprecatedString replace(StringView str, StringView needle, StringView replaceme
replaced_string.append(str.substring_view(last_position, str.length() - last_position));
return replaced_string.build();
}
ErrorOr<String> replace(String const& haystack, StringView needle, StringView replacement, ReplaceMode replace_mode)
{
if (haystack.is_empty())
return haystack;
// FIXME: Propagate Vector allocation failures (or do this without putting positions in a vector)
Vector<size_t> positions;
if (replace_mode == ReplaceMode::All) {
positions = haystack.bytes_as_string_view().find_all(needle);
if (!positions.size())
return haystack;
} else {
auto pos = haystack.bytes_as_string_view().find(needle);
if (!pos.has_value())
return haystack;
positions.append(pos.value());
}
StringBuilder replaced_string;
size_t last_position = 0;
for (auto& position : positions) {
replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, position - last_position));
replaced_string.append(replacement);
last_position = position + needle.length();
}
replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, haystack.bytes_as_string_view().length() - last_position));
return replaced_string.to_string();
}
#endif
// TODO: Benchmark against KMP (AK/MemMem.h) and switch over if it's faster for short strings too

View file

@ -103,6 +103,8 @@ DeprecatedString to_titlecase(StringView);
DeprecatedString invert_case(StringView);
DeprecatedString replace(StringView, StringView needle, StringView replacement, ReplaceMode);
ErrorOr<String> replace(String const&, StringView needle, StringView replacement, ReplaceMode);
size_t count(StringView, StringView needle);
}

View file

@ -62,6 +62,7 @@ set(AK_TEST_SOURCES
TestSpan.cpp
TestStack.cpp
TestStdLibExtras.cpp
TestString.cpp
TestStringFloatingPointConversions.cpp
TestStringUtils.cpp
TestStringView.cpp

102
Tests/AK/TestString.cpp Normal file
View file

@ -0,0 +1,102 @@
/*
* Copyright (c) 2022, Andreas Kling <kling@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTest/TestCase.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Try.h>
#include <AK/Utf8View.h>
#include <AK/Vector.h>
TEST_CASE(construct_empty)
{
String empty;
EXPECT(empty.is_empty());
EXPECT_EQ(empty.bytes().size(), 0u);
auto empty2 = MUST(String::from_utf8(""sv));
EXPECT(empty2.is_empty());
EXPECT_EQ(empty, empty2);
EXPECT_EQ(empty, ""sv);
}
TEST_CASE(short_strings)
{
#ifdef AK_ARCH_64_BIT
auto string = MUST(String::from_utf8("abcdefg"sv));
EXPECT_EQ(string.is_short_string(), true);
EXPECT_EQ(string.bytes().size(), 7u);
EXPECT_EQ(string.bytes_as_string_view(), "abcdefg"sv);
#else
auto string = MUST(String::from_utf8("abc"sv));
EXPECT_EQ(string.is_short_string(), true);
EXPECT_EQ(string.bytes().size(), 3u);
EXPECT_EQ(string.bytes_as_string_view(), "abc"sv);
#endif
}
TEST_CASE(long_strings)
{
auto string = MUST(String::from_utf8("abcdefgh"sv));
EXPECT_EQ(string.is_short_string(), false);
EXPECT_EQ(string.bytes().size(), 8u);
EXPECT_EQ(string.bytes_as_string_view(), "abcdefgh"sv);
}
TEST_CASE(substring)
{
auto superstring = MUST(String::from_utf8("Hello I am a long string"sv));
auto short_substring = MUST(superstring.substring_from_byte_offset(0, 5));
EXPECT_EQ(short_substring, "Hello"sv);
auto long_substring = MUST(superstring.substring_from_byte_offset(0, 10));
EXPECT_EQ(long_substring, "Hello I am"sv);
}
TEST_CASE(code_points)
{
auto string = MUST(String::from_utf8("🦬🪒"sv));
Vector<u32> code_points;
for (auto code_point : string.code_points())
code_points.append(code_point);
EXPECT_EQ(code_points[0], 0x1f9acu);
EXPECT_EQ(code_points[1], 0x1fa92u);
}
TEST_CASE(string_builder)
{
StringBuilder builder;
builder.append_code_point(0x1f9acu);
builder.append_code_point(0x1fa92u);
auto string = MUST(builder.to_string());
EXPECT_EQ(string, "🦬🪒"sv);
EXPECT_EQ(string.bytes().size(), 8u);
}
TEST_CASE(ak_format)
{
auto foo = MUST(String::formatted("Hello {}", MUST(String::from_utf8("friends"sv))));
EXPECT_EQ(foo, "Hello friends"sv);
}
TEST_CASE(replace)
{
{
auto haystack = MUST(String::from_utf8("Hello enemies"sv));
auto result = MUST(haystack.replace("enemies"sv, "friends"sv, ReplaceMode::All));
EXPECT_EQ(result, "Hello friends"sv);
}
{
auto base_title = MUST(String::from_utf8("anon@courage:~"sv));
auto result = MUST(base_title.replace("[*]"sv, "(*)"sv, ReplaceMode::FirstOnly));
EXPECT_EQ(result, "anon@courage:~"sv);
}
}