PrimitiveString.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Utf16View.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibJS/Runtime/AbstractOperations.h>
  11. #include <LibJS/Runtime/GlobalObject.h>
  12. #include <LibJS/Runtime/PrimitiveString.h>
  13. #include <LibJS/Runtime/PropertyKey.h>
  14. #include <LibJS/Runtime/ThrowableStringBuilder.h>
  15. #include <LibJS/Runtime/VM.h>
  16. #include <LibJS/Runtime/Value.h>
  17. namespace JS {
  18. PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
  19. : m_is_rope(true)
  20. , m_lhs(&lhs)
  21. , m_rhs(&rhs)
  22. {
  23. }
  24. PrimitiveString::PrimitiveString(String string)
  25. : m_utf8_string(move(string))
  26. {
  27. }
  28. PrimitiveString::PrimitiveString(DeprecatedString string)
  29. : m_deprecated_string(move(string))
  30. {
  31. }
  32. PrimitiveString::PrimitiveString(Utf16String string)
  33. : m_utf16_string(move(string))
  34. {
  35. }
  36. PrimitiveString::~PrimitiveString()
  37. {
  38. if (has_utf8_string())
  39. vm().string_cache().remove(*m_utf8_string);
  40. if (has_deprecated_string())
  41. vm().deprecated_string_cache().remove(*m_deprecated_string);
  42. }
  43. void PrimitiveString::visit_edges(Cell::Visitor& visitor)
  44. {
  45. Cell::visit_edges(visitor);
  46. if (m_is_rope) {
  47. visitor.visit(m_lhs);
  48. visitor.visit(m_rhs);
  49. }
  50. }
  51. bool PrimitiveString::is_empty() const
  52. {
  53. if (m_is_rope) {
  54. // NOTE: We never make an empty rope string.
  55. return false;
  56. }
  57. if (has_utf16_string())
  58. return m_utf16_string->is_empty();
  59. if (has_utf8_string())
  60. return m_utf8_string->is_empty();
  61. if (has_deprecated_string())
  62. return m_deprecated_string->is_empty();
  63. VERIFY_NOT_REACHED();
  64. }
  65. ThrowCompletionOr<String> PrimitiveString::utf8_string() const
  66. {
  67. auto& vm = this->vm();
  68. TRY(resolve_rope_if_needed());
  69. if (!has_utf8_string()) {
  70. if (has_deprecated_string())
  71. m_utf8_string = TRY_OR_THROW_OOM(vm, String::from_utf8(*m_deprecated_string));
  72. else if (has_utf16_string())
  73. m_utf8_string = TRY(m_utf16_string->to_utf8(vm));
  74. else
  75. VERIFY_NOT_REACHED();
  76. }
  77. return *m_utf8_string;
  78. }
  79. ThrowCompletionOr<DeprecatedString> PrimitiveString::deprecated_string() const
  80. {
  81. TRY(resolve_rope_if_needed());
  82. if (!has_deprecated_string()) {
  83. if (has_utf8_string())
  84. m_deprecated_string = m_utf8_string->to_deprecated_string();
  85. else if (has_utf16_string())
  86. m_deprecated_string = TRY(m_utf16_string->to_deprecated_string(vm()));
  87. else
  88. VERIFY_NOT_REACHED();
  89. }
  90. return *m_deprecated_string;
  91. }
  92. ThrowCompletionOr<Utf16String> PrimitiveString::utf16_string() const
  93. {
  94. TRY(resolve_rope_if_needed());
  95. if (!has_utf16_string()) {
  96. if (has_utf8_string()) {
  97. m_utf16_string = TRY(Utf16String::create(vm(), m_utf8_string->bytes_as_string_view()));
  98. } else {
  99. VERIFY(has_deprecated_string());
  100. m_utf16_string = TRY(Utf16String::create(vm(), *m_deprecated_string));
  101. }
  102. }
  103. return *m_utf16_string;
  104. }
  105. ThrowCompletionOr<Utf16View> PrimitiveString::utf16_string_view() const
  106. {
  107. (void)TRY(utf16_string());
  108. return m_utf16_string->view();
  109. }
  110. ThrowCompletionOr<Optional<Value>> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
  111. {
  112. if (property_key.is_symbol())
  113. return Optional<Value> {};
  114. if (property_key.is_string()) {
  115. if (property_key.as_string() == vm.names.length.as_string()) {
  116. auto length = TRY(utf16_string()).length_in_code_units();
  117. return Value(static_cast<double>(length));
  118. }
  119. }
  120. auto index = canonical_numeric_index_string(property_key, CanonicalIndexMode::IgnoreNumericRoundtrip);
  121. if (!index.is_index())
  122. return Optional<Value> {};
  123. auto str = TRY(utf16_string_view());
  124. auto length = str.length_in_code_units();
  125. if (length <= index.as_index())
  126. return Optional<Value> {};
  127. return create(vm, TRY(Utf16String::create(vm, str.substring_view(index.as_index(), 1))));
  128. }
  129. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, Utf16String string)
  130. {
  131. if (string.is_empty())
  132. return vm.empty_string();
  133. if (string.length_in_code_units() == 1) {
  134. u16 code_unit = string.code_unit_at(0);
  135. if (is_ascii(code_unit))
  136. return vm.single_ascii_character_string(static_cast<u8>(code_unit));
  137. }
  138. return vm.heap().allocate_without_realm<PrimitiveString>(move(string));
  139. }
  140. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, String string)
  141. {
  142. if (string.is_empty())
  143. return vm.empty_string();
  144. if (auto bytes = string.bytes_as_string_view(); bytes.length() == 1) {
  145. auto ch = static_cast<u8>(bytes[0]);
  146. if (is_ascii(ch))
  147. return vm.single_ascii_character_string(ch);
  148. }
  149. auto& string_cache = vm.string_cache();
  150. if (auto it = string_cache.find(string); it != string_cache.end())
  151. return *it->value;
  152. auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
  153. string_cache.set(move(string), new_string);
  154. return *new_string;
  155. }
  156. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedString string)
  157. {
  158. if (string.is_empty())
  159. return vm.empty_string();
  160. if (string.length() == 1) {
  161. auto ch = static_cast<u8>(string.characters()[0]);
  162. if (is_ascii(ch))
  163. return vm.single_ascii_character_string(ch);
  164. }
  165. auto& string_cache = vm.deprecated_string_cache();
  166. auto it = string_cache.find(string);
  167. if (it == string_cache.end()) {
  168. auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
  169. string_cache.set(move(string), new_string);
  170. return *new_string;
  171. }
  172. return *it->value;
  173. }
  174. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
  175. {
  176. // We're here to concatenate two strings into a new rope string.
  177. // However, if any of them are empty, no rope is required.
  178. bool lhs_empty = lhs.is_empty();
  179. bool rhs_empty = rhs.is_empty();
  180. if (lhs_empty && rhs_empty)
  181. return vm.empty_string();
  182. if (lhs_empty)
  183. return rhs;
  184. if (rhs_empty)
  185. return lhs;
  186. return vm.heap().allocate_without_realm<PrimitiveString>(lhs, rhs);
  187. }
  188. ThrowCompletionOr<void> PrimitiveString::resolve_rope_if_needed() const
  189. {
  190. if (!m_is_rope)
  191. return {};
  192. auto& vm = this->vm();
  193. // NOTE: Special case for two concatenated UTF-16 strings.
  194. // This is here as an optimization, although I'm unsure how valuable it is.
  195. if (m_lhs->has_utf16_string() && m_rhs->has_utf16_string()) {
  196. auto const& lhs_string = m_lhs->m_utf16_string.value();
  197. auto const& rhs_string = m_rhs->m_utf16_string.value();
  198. Utf16Data combined;
  199. TRY_OR_THROW_OOM(vm, combined.try_ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units()));
  200. combined.extend(lhs_string.string());
  201. combined.extend(rhs_string.string());
  202. m_utf16_string = TRY(Utf16String::create(vm, move(combined)));
  203. m_is_rope = false;
  204. m_lhs = nullptr;
  205. m_rhs = nullptr;
  206. return {};
  207. }
  208. // This vector will hold all the pieces of the rope that need to be assembled
  209. // into the resolved string.
  210. Vector<PrimitiveString const*> pieces;
  211. // NOTE: We traverse the rope tree without using recursion, since we'd run out of
  212. // stack space quickly when handling a long sequence of unresolved concatenations.
  213. Vector<PrimitiveString const*> stack;
  214. TRY_OR_THROW_OOM(vm, stack.try_append(m_rhs));
  215. TRY_OR_THROW_OOM(vm, stack.try_append(m_lhs));
  216. while (!stack.is_empty()) {
  217. auto const* current = stack.take_last();
  218. if (current->m_is_rope) {
  219. TRY_OR_THROW_OOM(vm, stack.try_append(current->m_rhs));
  220. TRY_OR_THROW_OOM(vm, stack.try_append(current->m_lhs));
  221. continue;
  222. }
  223. TRY_OR_THROW_OOM(vm, pieces.try_append(current));
  224. }
  225. // Now that we have all the pieces, we can concatenate them using a StringBuilder.
  226. ThrowableStringBuilder builder(vm);
  227. // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
  228. PrimitiveString const* previous = nullptr;
  229. for (auto const* current : pieces) {
  230. if (!previous) {
  231. // This is the very first piece, just append it and continue.
  232. TRY(builder.append(TRY(current->utf8_string())));
  233. previous = current;
  234. continue;
  235. }
  236. // Get the UTF-8 representations for both strings.
  237. auto current_string_as_utf8_string = TRY(current->utf8_string());
  238. auto current_string_as_utf8 = current_string_as_utf8_string.bytes_as_string_view();
  239. auto previous_string_as_utf8_string = TRY(previous->utf8_string());
  240. auto previous_string_as_utf8 = previous_string_as_utf8_string.bytes_as_string_view();
  241. // NOTE: Now we need to look at the end of the previous string and the start
  242. // of the current string, to see if they should be combined into a surrogate.
  243. // Surrogates encoded as UTF-8 are 3 bytes.
  244. if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
  245. TRY(builder.append(current_string_as_utf8));
  246. previous = current;
  247. continue;
  248. }
  249. // Might the previous string end with a UTF-8 encoded surrogate?
  250. if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
  251. // If not, just append the current string and continue.
  252. TRY(builder.append(current_string_as_utf8));
  253. previous = current;
  254. continue;
  255. }
  256. // Might the current string begin with a UTF-8 encoded surrogate?
  257. if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
  258. // If not, just append the current string and continue.
  259. TRY(builder.append(current_string_as_utf8));
  260. previous = current;
  261. continue;
  262. }
  263. auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
  264. auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
  265. if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
  266. TRY(builder.append(current_string_as_utf8));
  267. previous = current;
  268. continue;
  269. }
  270. // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
  271. builder.trim(3);
  272. TRY(builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate)));
  273. // Append the remaining part of the current string.
  274. TRY(builder.append(current_string_as_utf8.substring_view(3)));
  275. previous = current;
  276. }
  277. m_utf8_string = TRY(builder.to_string());
  278. m_is_rope = false;
  279. m_lhs = nullptr;
  280. m_rhs = nullptr;
  281. return {};
  282. }
  283. }