123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353 |
- /*
- * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
- * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #include <AK/CharacterTypes.h>
- #include <AK/FlyString.h>
- #include <AK/StringBuilder.h>
- #include <AK/Utf16View.h>
- #include <AK/Utf8View.h>
- #include <LibJS/Runtime/AbstractOperations.h>
- #include <LibJS/Runtime/GlobalObject.h>
- #include <LibJS/Runtime/PrimitiveString.h>
- #include <LibJS/Runtime/PropertyKey.h>
- #include <LibJS/Runtime/VM.h>
- #include <LibJS/Runtime/Value.h>
- namespace JS {
- PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
- : m_is_rope(true)
- , m_lhs(&lhs)
- , m_rhs(&rhs)
- {
- }
- PrimitiveString::PrimitiveString(String string)
- : m_utf8_string(move(string))
- {
- }
- PrimitiveString::PrimitiveString(DeprecatedString string)
- : m_deprecated_string(move(string))
- {
- }
- PrimitiveString::PrimitiveString(Utf16String string)
- : m_utf16_string(move(string))
- {
- }
- PrimitiveString::~PrimitiveString()
- {
- if (has_utf8_string())
- vm().string_cache().remove(*m_utf8_string);
- if (has_deprecated_string())
- vm().deprecated_string_cache().remove(*m_deprecated_string);
- }
- void PrimitiveString::visit_edges(Cell::Visitor& visitor)
- {
- Base::visit_edges(visitor);
- if (m_is_rope) {
- visitor.visit(m_lhs);
- visitor.visit(m_rhs);
- }
- }
- bool PrimitiveString::is_empty() const
- {
- if (m_is_rope) {
- // NOTE: We never make an empty rope string.
- return false;
- }
- if (has_utf16_string())
- return m_utf16_string->is_empty();
- if (has_utf8_string())
- return m_utf8_string->is_empty();
- if (has_deprecated_string())
- return m_deprecated_string->is_empty();
- VERIFY_NOT_REACHED();
- }
- String PrimitiveString::utf8_string() const
- {
- resolve_rope_if_needed(EncodingPreference::UTF8);
- if (!has_utf8_string()) {
- if (has_deprecated_string())
- m_utf8_string = MUST(String::from_deprecated_string(*m_deprecated_string));
- else if (has_utf16_string())
- m_utf8_string = m_utf16_string->to_utf8();
- else
- VERIFY_NOT_REACHED();
- }
- return *m_utf8_string;
- }
- StringView PrimitiveString::utf8_string_view() const
- {
- (void)utf8_string();
- return m_utf8_string->bytes_as_string_view();
- }
- DeprecatedString PrimitiveString::deprecated_string() const
- {
- resolve_rope_if_needed(EncodingPreference::UTF8);
- if (!has_deprecated_string()) {
- if (has_utf8_string())
- m_deprecated_string = m_utf8_string->to_deprecated_string();
- else if (has_utf16_string())
- m_deprecated_string = m_utf16_string->to_deprecated_string();
- else
- VERIFY_NOT_REACHED();
- }
- return *m_deprecated_string;
- }
- Utf16String PrimitiveString::utf16_string() const
- {
- resolve_rope_if_needed(EncodingPreference::UTF16);
- if (!has_utf16_string()) {
- if (has_utf8_string()) {
- m_utf16_string = Utf16String::create(m_utf8_string->bytes_as_string_view());
- } else {
- VERIFY(has_deprecated_string());
- m_utf16_string = Utf16String::create(*m_deprecated_string);
- }
- }
- return *m_utf16_string;
- }
- Utf16View PrimitiveString::utf16_string_view() const
- {
- (void)utf16_string();
- return m_utf16_string->view();
- }
- ThrowCompletionOr<Optional<Value>> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
- {
- if (property_key.is_symbol())
- return Optional<Value> {};
- if (property_key.is_string()) {
- if (property_key.as_string() == vm.names.length.as_string()) {
- auto length = utf16_string().length_in_code_units();
- return Value(static_cast<double>(length));
- }
- }
- auto index = canonical_numeric_index_string(property_key, CanonicalIndexMode::IgnoreNumericRoundtrip);
- if (!index.is_index())
- return Optional<Value> {};
- auto str = utf16_string_view();
- auto length = str.length_in_code_units();
- if (length <= index.as_index())
- return Optional<Value> {};
- return create(vm, Utf16String::create(str.substring_view(index.as_index(), 1)));
- }
- NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, Utf16String string)
- {
- if (string.is_empty())
- return vm.empty_string();
- if (string.length_in_code_units() == 1) {
- u16 code_unit = string.code_unit_at(0);
- if (is_ascii(code_unit))
- return vm.single_ascii_character_string(static_cast<u8>(code_unit));
- }
- return vm.heap().allocate_without_realm<PrimitiveString>(move(string));
- }
- NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, String string)
- {
- if (string.is_empty())
- return vm.empty_string();
- if (auto bytes = string.bytes_as_string_view(); bytes.length() == 1) {
- auto ch = static_cast<u8>(bytes[0]);
- if (is_ascii(ch))
- return vm.single_ascii_character_string(ch);
- }
- auto& string_cache = vm.string_cache();
- if (auto it = string_cache.find(string); it != string_cache.end())
- return *it->value;
- auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
- string_cache.set(move(string), new_string);
- return *new_string;
- }
- NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, FlyString const& string)
- {
- return create(vm, string.to_string());
- }
- NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, StringView string)
- {
- return create(vm, String::from_utf8(string).release_value());
- }
- NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedString string)
- {
- if (string.is_empty())
- return vm.empty_string();
- if (string.length() == 1) {
- auto ch = static_cast<u8>(string.characters()[0]);
- if (is_ascii(ch))
- return vm.single_ascii_character_string(ch);
- }
- auto& string_cache = vm.deprecated_string_cache();
- auto it = string_cache.find(string);
- if (it == string_cache.end()) {
- auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
- string_cache.set(move(string), new_string);
- return *new_string;
- }
- return *it->value;
- }
- NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedFlyString const& string)
- {
- return create(vm, *string.impl());
- }
- NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
- {
- // We're here to concatenate two strings into a new rope string.
- // However, if any of them are empty, no rope is required.
- bool lhs_empty = lhs.is_empty();
- bool rhs_empty = rhs.is_empty();
- if (lhs_empty && rhs_empty)
- return vm.empty_string();
- if (lhs_empty)
- return rhs;
- if (rhs_empty)
- return lhs;
- return vm.heap().allocate_without_realm<PrimitiveString>(lhs, rhs);
- }
- void PrimitiveString::resolve_rope_if_needed(EncodingPreference preference) const
- {
- if (!m_is_rope)
- return;
- // This vector will hold all the pieces of the rope that need to be assembled
- // into the resolved string.
- Vector<PrimitiveString const*> pieces;
- // NOTE: We traverse the rope tree without using recursion, since we'd run out of
- // stack space quickly when handling a long sequence of unresolved concatenations.
- Vector<PrimitiveString const*> stack;
- stack.append(m_rhs);
- stack.append(m_lhs);
- while (!stack.is_empty()) {
- auto const* current = stack.take_last();
- if (current->m_is_rope) {
- stack.append(current->m_rhs);
- stack.append(current->m_lhs);
- continue;
- }
- pieces.append(current);
- }
- if (preference == EncodingPreference::UTF16) {
- // The caller wants a UTF-16 string, so we can simply concatenate all the pieces
- // into a UTF-16 code unit buffer and create a Utf16String from it.
- Utf16Data code_units;
- for (auto const* current : pieces)
- code_units.extend(current->utf16_string().string());
- m_utf16_string = Utf16String::create(move(code_units));
- m_is_rope = false;
- m_lhs = nullptr;
- m_rhs = nullptr;
- return;
- }
- // Now that we have all the pieces, we can concatenate them using a StringBuilder.
- StringBuilder builder;
- // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
- PrimitiveString const* previous = nullptr;
- for (auto const* current : pieces) {
- if (!previous) {
- // This is the very first piece, just append it and continue.
- builder.append(current->utf8_string());
- previous = current;
- continue;
- }
- // Get the UTF-8 representations for both strings.
- auto current_string_as_utf8 = current->utf8_string_view();
- auto previous_string_as_utf8 = previous->utf8_string_view();
- // NOTE: Now we need to look at the end of the previous string and the start
- // of the current string, to see if they should be combined into a surrogate.
- // Surrogates encoded as UTF-8 are 3 bytes.
- if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
- builder.append(current_string_as_utf8);
- previous = current;
- continue;
- }
- // Might the previous string end with a UTF-8 encoded surrogate?
- if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
- // If not, just append the current string and continue.
- builder.append(current_string_as_utf8);
- previous = current;
- continue;
- }
- // Might the current string begin with a UTF-8 encoded surrogate?
- if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
- // If not, just append the current string and continue.
- builder.append(current_string_as_utf8);
- previous = current;
- continue;
- }
- auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
- auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
- if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
- builder.append(current_string_as_utf8);
- previous = current;
- continue;
- }
- // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
- builder.trim(3);
- builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
- // Append the remaining part of the current string.
- builder.append(current_string_as_utf8.substring_view(3));
- previous = current;
- }
- m_utf8_string = MUST(builder.to_string());
- m_is_rope = false;
- m_lhs = nullptr;
- m_rhs = nullptr;
- }
- }
|