LibJS: Resolve rope strings directly to UTF-16 when preferable

When someone calls PrimitiveString::utf16_string() on a rope string,
we know for sure that the client wants a UTF-16 string and may not
be interested in a UTF-8 version at all.

To avoid round-tripping through UTF-8 in this scenario, callers can
now inform resolve_rope_if_needed() about their preferred encoding,
should rope resolution take place. The UTF-16 case is actually a lot
simpler than the UTF-8 case, since we can simply ask for UTF-16 data
for each fiber of the rope, and then concatenate all the fibers.

Since LibJS always uses UTF-16 for regular expression matching, this
avoids round-tripping through UTF-8 whenever the input to a regex test
is already UTF-16. :^)
This commit is contained in:
Andreas Kling 2023-07-13 15:10:57 +02:00
parent e78ea08ed9
commit a3e4535f34
Notes: sideshowbarker 2024-07-17 03:05:16 +09:00
2 changed files with 24 additions and 23 deletions

View file

@ -77,7 +77,7 @@ bool PrimitiveString::is_empty() const
ThrowCompletionOr<String> PrimitiveString::utf8_string() const
{
auto& vm = this->vm();
TRY(resolve_rope_if_needed());
TRY(resolve_rope_if_needed(EncodingPreference::UTF8));
if (!has_utf8_string()) {
if (has_deprecated_string())
@ -99,7 +99,7 @@ ThrowCompletionOr<StringView> PrimitiveString::utf8_string_view() const
ThrowCompletionOr<DeprecatedString> PrimitiveString::deprecated_string() const
{
TRY(resolve_rope_if_needed());
TRY(resolve_rope_if_needed(EncodingPreference::UTF8));
if (!has_deprecated_string()) {
if (has_utf8_string())
@ -115,7 +115,7 @@ ThrowCompletionOr<DeprecatedString> PrimitiveString::deprecated_string() const
ThrowCompletionOr<Utf16String> PrimitiveString::utf16_string() const
{
TRY(resolve_rope_if_needed());
TRY(resolve_rope_if_needed(EncodingPreference::UTF16));
if (!has_utf16_string()) {
if (has_utf8_string()) {
@ -245,31 +245,13 @@ NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& l
return vm.heap().allocate_without_realm<PrimitiveString>(lhs, rhs);
}
ThrowCompletionOr<void> PrimitiveString::resolve_rope_if_needed() const
ThrowCompletionOr<void> PrimitiveString::resolve_rope_if_needed(EncodingPreference preference) const
{
if (!m_is_rope)
return {};
auto& vm = this->vm();
// NOTE: Special case for two concatenated UTF-16 strings.
// This is here as an optimization, although I'm unsure how valuable it is.
if (m_lhs->has_utf16_string() && m_rhs->has_utf16_string()) {
auto const& lhs_string = m_lhs->m_utf16_string.value();
auto const& rhs_string = m_rhs->m_utf16_string.value();
Utf16Data combined;
TRY_OR_THROW_OOM(vm, combined.try_ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units()));
combined.extend(lhs_string.string());
combined.extend(rhs_string.string());
m_utf16_string = TRY(Utf16String::create(vm, move(combined)));
m_is_rope = false;
m_lhs = nullptr;
m_rhs = nullptr;
return {};
}
// This vector will hold all the pieces of the rope that need to be assembled
// into the resolved string.
Vector<PrimitiveString const*> pieces;
@ -289,6 +271,21 @@ ThrowCompletionOr<void> PrimitiveString::resolve_rope_if_needed() const
TRY_OR_THROW_OOM(vm, pieces.try_append(current));
}
if (preference == EncodingPreference::UTF16) {
// The caller wants a UTF-16 string, so we can simply concatenate all the pieces
// into a UTF-16 code unit buffer and create a Utf16String from it.
Utf16Data code_units;
for (auto const* current : pieces)
code_units.extend(TRY(current->utf16_string()).string());
m_utf16_string = TRY(Utf16String::create(vm, move(code_units)));
m_is_rope = false;
m_lhs = nullptr;
m_rhs = nullptr;
return {};
}
// Now that we have all the pieces, we can concatenate them using a StringBuilder.
ThrowableStringBuilder builder(vm);

View file

@ -59,7 +59,11 @@ private:
virtual void visit_edges(Cell::Visitor&) override;
ThrowCompletionOr<void> resolve_rope_if_needed() const;
// Tells resolve_rope_if_needed() which encoding the caller would like the
// resolved string to end up in, should rope resolution actually take place.
enum class EncodingPreference {
// Passed by utf8_string() and deprecated_string().
UTF8,
// Passed by utf16_string(); the rope's pieces are concatenated directly as
// UTF-16 code units instead of going through a UTF-8 string builder.
UTF16,
};
ThrowCompletionOr<void> resolve_rope_if_needed(EncodingPreference) const;
mutable bool m_is_rope { false };