PrimitiveString.cpp 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Utf16View.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibJS/Runtime/AbstractOperations.h>
  11. #include <LibJS/Runtime/GlobalObject.h>
  12. #include <LibJS/Runtime/PrimitiveString.h>
  13. #include <LibJS/Runtime/PropertyKey.h>
  14. #include <LibJS/Runtime/VM.h>
  15. #include <LibJS/Runtime/Value.h>
  16. namespace JS {
  17. PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
  18. : m_is_rope(true)
  19. , m_lhs(&lhs)
  20. , m_rhs(&rhs)
  21. {
  22. }
  23. PrimitiveString::PrimitiveString(DeprecatedString string)
  24. : m_utf8_string(move(string))
  25. {
  26. }
  27. PrimitiveString::PrimitiveString(Utf16String string)
  28. : m_utf16_string(move(string))
  29. {
  30. }
  31. PrimitiveString::~PrimitiveString()
  32. {
  33. if (has_utf8_string())
  34. vm().string_cache().remove(*m_utf8_string);
  35. }
  36. void PrimitiveString::visit_edges(Cell::Visitor& visitor)
  37. {
  38. Cell::visit_edges(visitor);
  39. if (m_is_rope) {
  40. visitor.visit(m_lhs);
  41. visitor.visit(m_rhs);
  42. }
  43. }
  44. bool PrimitiveString::is_empty() const
  45. {
  46. if (m_is_rope) {
  47. // NOTE: We never make an empty rope string.
  48. return false;
  49. }
  50. if (has_utf16_string())
  51. return m_utf16_string->is_empty();
  52. if (has_utf8_string())
  53. return m_utf8_string->is_empty();
  54. VERIFY_NOT_REACHED();
  55. }
  56. ThrowCompletionOr<DeprecatedString const&> PrimitiveString::deprecated_string() const
  57. {
  58. TRY(resolve_rope_if_needed());
  59. if (!has_utf8_string()) {
  60. VERIFY(has_utf16_string());
  61. m_utf8_string = TRY(m_utf16_string->to_utf8(vm()));
  62. }
  63. return *m_utf8_string;
  64. }
  65. ThrowCompletionOr<Utf16String const&> PrimitiveString::utf16_string() const
  66. {
  67. TRY(resolve_rope_if_needed());
  68. if (!has_utf16_string()) {
  69. VERIFY(has_utf8_string());
  70. m_utf16_string = TRY(Utf16String::create(vm(), *m_utf8_string));
  71. }
  72. return *m_utf16_string;
  73. }
  74. ThrowCompletionOr<Utf16View> PrimitiveString::utf16_string_view() const
  75. {
  76. return TRY(utf16_string()).view();
  77. }
  78. ThrowCompletionOr<Optional<Value>> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
  79. {
  80. if (property_key.is_symbol())
  81. return Optional<Value> {};
  82. if (property_key.is_string()) {
  83. if (property_key.as_string() == vm.names.length.as_string()) {
  84. auto length = TRY(utf16_string()).length_in_code_units();
  85. return Value(static_cast<double>(length));
  86. }
  87. }
  88. auto index = canonical_numeric_index_string(property_key, CanonicalIndexMode::IgnoreNumericRoundtrip);
  89. if (!index.is_index())
  90. return Optional<Value> {};
  91. auto str = TRY(utf16_string_view());
  92. auto length = str.length_in_code_units();
  93. if (length <= index.as_index())
  94. return Optional<Value> {};
  95. return create(vm, TRY(Utf16String::create(vm, str.substring_view(index.as_index(), 1))));
  96. }
  97. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, Utf16String string)
  98. {
  99. if (string.is_empty())
  100. return vm.empty_string();
  101. if (string.length_in_code_units() == 1) {
  102. u16 code_unit = string.code_unit_at(0);
  103. if (is_ascii(code_unit))
  104. return vm.single_ascii_character_string(static_cast<u8>(code_unit));
  105. }
  106. return vm.heap().allocate_without_realm<PrimitiveString>(move(string));
  107. }
  108. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedString string)
  109. {
  110. if (string.is_empty())
  111. return vm.empty_string();
  112. if (string.length() == 1) {
  113. auto ch = static_cast<u8>(string.characters()[0]);
  114. if (is_ascii(ch))
  115. return vm.single_ascii_character_string(ch);
  116. }
  117. auto& string_cache = vm.string_cache();
  118. auto it = string_cache.find(string);
  119. if (it == string_cache.end()) {
  120. auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
  121. string_cache.set(move(string), new_string);
  122. return *new_string;
  123. }
  124. return *it->value;
  125. }
  126. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
  127. {
  128. // We're here to concatenate two strings into a new rope string.
  129. // However, if any of them are empty, no rope is required.
  130. bool lhs_empty = lhs.is_empty();
  131. bool rhs_empty = rhs.is_empty();
  132. if (lhs_empty && rhs_empty)
  133. return vm.empty_string();
  134. if (lhs_empty)
  135. return rhs;
  136. if (rhs_empty)
  137. return lhs;
  138. return vm.heap().allocate_without_realm<PrimitiveString>(lhs, rhs);
  139. }
  140. ThrowCompletionOr<void> PrimitiveString::resolve_rope_if_needed() const
  141. {
  142. if (!m_is_rope)
  143. return {};
  144. auto& vm = this->vm();
  145. // NOTE: Special case for two concatenated UTF-16 strings.
  146. // This is here as an optimization, although I'm unsure how valuable it is.
  147. if (m_lhs->has_utf16_string() && m_rhs->has_utf16_string()) {
  148. auto const& lhs_string = TRY(m_lhs->utf16_string());
  149. auto const& rhs_string = TRY(m_rhs->utf16_string());
  150. Utf16Data combined;
  151. combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units());
  152. combined.extend(lhs_string.string());
  153. combined.extend(rhs_string.string());
  154. m_utf16_string = TRY(Utf16String::create(vm, move(combined)));
  155. m_is_rope = false;
  156. m_lhs = nullptr;
  157. m_rhs = nullptr;
  158. return {};
  159. }
  160. // This vector will hold all the pieces of the rope that need to be assembled
  161. // into the resolved string.
  162. Vector<PrimitiveString const*> pieces;
  163. // NOTE: We traverse the rope tree without using recursion, since we'd run out of
  164. // stack space quickly when handling a long sequence of unresolved concatenations.
  165. Vector<PrimitiveString const*> stack;
  166. stack.append(m_rhs);
  167. stack.append(m_lhs);
  168. while (!stack.is_empty()) {
  169. auto* current = stack.take_last();
  170. if (current->m_is_rope) {
  171. stack.append(current->m_rhs);
  172. stack.append(current->m_lhs);
  173. continue;
  174. }
  175. pieces.append(current);
  176. }
  177. // Now that we have all the pieces, we can concatenate them using a StringBuilder.
  178. StringBuilder builder;
  179. // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
  180. PrimitiveString const* previous = nullptr;
  181. for (auto const* current : pieces) {
  182. if (!previous) {
  183. // This is the very first piece, just append it and continue.
  184. builder.append(TRY(current->deprecated_string()));
  185. previous = current;
  186. continue;
  187. }
  188. // Get the UTF-8 representations for both strings.
  189. auto const& previous_string_as_utf8 = TRY(previous->deprecated_string());
  190. auto const& current_string_as_utf8 = TRY(current->deprecated_string());
  191. // NOTE: Now we need to look at the end of the previous string and the start
  192. // of the current string, to see if they should be combined into a surrogate.
  193. // Surrogates encoded as UTF-8 are 3 bytes.
  194. if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
  195. builder.append(TRY(current->deprecated_string()));
  196. previous = current;
  197. continue;
  198. }
  199. // Might the previous string end with a UTF-8 encoded surrogate?
  200. if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
  201. // If not, just append the current string and continue.
  202. builder.append(TRY(current->deprecated_string()));
  203. previous = current;
  204. continue;
  205. }
  206. // Might the current string begin with a UTF-8 encoded surrogate?
  207. if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
  208. // If not, just append the current string and continue.
  209. builder.append(TRY(current->deprecated_string()));
  210. previous = current;
  211. continue;
  212. }
  213. auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
  214. auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
  215. if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
  216. builder.append(TRY(current->deprecated_string()));
  217. previous = current;
  218. continue;
  219. }
  220. // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
  221. builder.trim(3);
  222. builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
  223. // Append the remaining part of the current string.
  224. builder.append(current_string_as_utf8.substring_view(3));
  225. previous = current;
  226. }
  227. m_utf8_string = builder.to_deprecated_string();
  228. m_is_rope = false;
  229. m_lhs = nullptr;
  230. m_rhs = nullptr;
  231. return {};
  232. }
  233. }