PrimitiveString.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/FlyString.h>
  9. #include <AK/Utf16View.h>
  10. #include <AK/Utf8View.h>
  11. #include <LibJS/Runtime/AbstractOperations.h>
  12. #include <LibJS/Runtime/GlobalObject.h>
  13. #include <LibJS/Runtime/PrimitiveString.h>
  14. #include <LibJS/Runtime/PropertyKey.h>
  15. #include <LibJS/Runtime/ThrowableStringBuilder.h>
  16. #include <LibJS/Runtime/VM.h>
  17. #include <LibJS/Runtime/Value.h>
  18. namespace JS {
  19. PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
  20. : m_is_rope(true)
  21. , m_lhs(&lhs)
  22. , m_rhs(&rhs)
  23. {
  24. }
  25. PrimitiveString::PrimitiveString(String string)
  26. : m_utf8_string(move(string))
  27. {
  28. }
  29. PrimitiveString::PrimitiveString(DeprecatedString string)
  30. : m_deprecated_string(move(string))
  31. {
  32. }
  33. PrimitiveString::PrimitiveString(Utf16String string)
  34. : m_utf16_string(move(string))
  35. {
  36. }
  37. PrimitiveString::~PrimitiveString()
  38. {
  39. if (has_utf8_string())
  40. vm().string_cache().remove(*m_utf8_string);
  41. if (has_deprecated_string())
  42. vm().deprecated_string_cache().remove(*m_deprecated_string);
  43. }
  44. void PrimitiveString::visit_edges(Cell::Visitor& visitor)
  45. {
  46. Base::visit_edges(visitor);
  47. if (m_is_rope) {
  48. visitor.visit(m_lhs);
  49. visitor.visit(m_rhs);
  50. }
  51. }
  52. bool PrimitiveString::is_empty() const
  53. {
  54. if (m_is_rope) {
  55. // NOTE: We never make an empty rope string.
  56. return false;
  57. }
  58. if (has_utf16_string())
  59. return m_utf16_string->is_empty();
  60. if (has_utf8_string())
  61. return m_utf8_string->is_empty();
  62. if (has_deprecated_string())
  63. return m_deprecated_string->is_empty();
  64. VERIFY_NOT_REACHED();
  65. }
  66. ThrowCompletionOr<String> PrimitiveString::utf8_string() const
  67. {
  68. auto& vm = this->vm();
  69. resolve_rope_if_needed(EncodingPreference::UTF8);
  70. if (!has_utf8_string()) {
  71. if (has_deprecated_string())
  72. m_utf8_string = TRY_OR_THROW_OOM(vm, String::from_deprecated_string(*m_deprecated_string));
  73. else if (has_utf16_string())
  74. m_utf8_string = TRY(m_utf16_string->to_utf8(vm));
  75. else
  76. VERIFY_NOT_REACHED();
  77. }
  78. return *m_utf8_string;
  79. }
  80. ThrowCompletionOr<StringView> PrimitiveString::utf8_string_view() const
  81. {
  82. (void)TRY(utf8_string());
  83. return m_utf8_string->bytes_as_string_view();
  84. }
  85. ThrowCompletionOr<DeprecatedString> PrimitiveString::deprecated_string() const
  86. {
  87. resolve_rope_if_needed(EncodingPreference::UTF8);
  88. if (!has_deprecated_string()) {
  89. if (has_utf8_string())
  90. m_deprecated_string = m_utf8_string->to_deprecated_string();
  91. else if (has_utf16_string())
  92. m_deprecated_string = TRY(m_utf16_string->to_deprecated_string(vm()));
  93. else
  94. VERIFY_NOT_REACHED();
  95. }
  96. return *m_deprecated_string;
  97. }
  98. ThrowCompletionOr<Utf16String> PrimitiveString::utf16_string() const
  99. {
  100. resolve_rope_if_needed(EncodingPreference::UTF16);
  101. if (!has_utf16_string()) {
  102. if (has_utf8_string()) {
  103. m_utf16_string = TRY(Utf16String::create(vm(), m_utf8_string->bytes_as_string_view()));
  104. } else {
  105. VERIFY(has_deprecated_string());
  106. m_utf16_string = TRY(Utf16String::create(vm(), *m_deprecated_string));
  107. }
  108. }
  109. return *m_utf16_string;
  110. }
  111. ThrowCompletionOr<Utf16View> PrimitiveString::utf16_string_view() const
  112. {
  113. (void)TRY(utf16_string());
  114. return m_utf16_string->view();
  115. }
  116. ThrowCompletionOr<Optional<Value>> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
  117. {
  118. if (property_key.is_symbol())
  119. return Optional<Value> {};
  120. if (property_key.is_string()) {
  121. if (property_key.as_string() == vm.names.length.as_string()) {
  122. auto length = TRY(utf16_string()).length_in_code_units();
  123. return Value(static_cast<double>(length));
  124. }
  125. }
  126. auto index = MUST_OR_THROW_OOM(canonical_numeric_index_string(vm, property_key, CanonicalIndexMode::IgnoreNumericRoundtrip));
  127. if (!index.is_index())
  128. return Optional<Value> {};
  129. auto str = TRY(utf16_string_view());
  130. auto length = str.length_in_code_units();
  131. if (length <= index.as_index())
  132. return Optional<Value> {};
  133. return create(vm, TRY(Utf16String::create(vm, str.substring_view(index.as_index(), 1))));
  134. }
  135. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, Utf16String string)
  136. {
  137. if (string.is_empty())
  138. return vm.empty_string();
  139. if (string.length_in_code_units() == 1) {
  140. u16 code_unit = string.code_unit_at(0);
  141. if (is_ascii(code_unit))
  142. return vm.single_ascii_character_string(static_cast<u8>(code_unit));
  143. }
  144. return vm.heap().allocate_without_realm<PrimitiveString>(move(string));
  145. }
  146. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, String string)
  147. {
  148. if (string.is_empty())
  149. return vm.empty_string();
  150. if (auto bytes = string.bytes_as_string_view(); bytes.length() == 1) {
  151. auto ch = static_cast<u8>(bytes[0]);
  152. if (is_ascii(ch))
  153. return vm.single_ascii_character_string(ch);
  154. }
  155. auto& string_cache = vm.string_cache();
  156. if (auto it = string_cache.find(string); it != string_cache.end())
  157. return *it->value;
  158. auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
  159. string_cache.set(move(string), new_string);
  160. return *new_string;
  161. }
  162. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, FlyString const& string)
  163. {
  164. return create(vm, string.to_string());
  165. }
  166. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, StringView string)
  167. {
  168. return create(vm, String::from_utf8(string).release_value());
  169. }
  170. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedString string)
  171. {
  172. if (string.is_empty())
  173. return vm.empty_string();
  174. if (string.length() == 1) {
  175. auto ch = static_cast<u8>(string.characters()[0]);
  176. if (is_ascii(ch))
  177. return vm.single_ascii_character_string(ch);
  178. }
  179. auto& string_cache = vm.deprecated_string_cache();
  180. auto it = string_cache.find(string);
  181. if (it == string_cache.end()) {
  182. auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
  183. string_cache.set(move(string), new_string);
  184. return *new_string;
  185. }
  186. return *it->value;
  187. }
  188. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedFlyString const& string)
  189. {
  190. return create(vm, string.impl());
  191. }
  192. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
  193. {
  194. // We're here to concatenate two strings into a new rope string.
  195. // However, if any of them are empty, no rope is required.
  196. bool lhs_empty = lhs.is_empty();
  197. bool rhs_empty = rhs.is_empty();
  198. if (lhs_empty && rhs_empty)
  199. return vm.empty_string();
  200. if (lhs_empty)
  201. return rhs;
  202. if (rhs_empty)
  203. return lhs;
  204. return vm.heap().allocate_without_realm<PrimitiveString>(lhs, rhs);
  205. }
  206. void PrimitiveString::resolve_rope_if_needed(EncodingPreference preference) const
  207. {
  208. if (!m_is_rope)
  209. return;
  210. auto& vm = this->vm();
  211. // This vector will hold all the pieces of the rope that need to be assembled
  212. // into the resolved string.
  213. Vector<PrimitiveString const*> pieces;
  214. // NOTE: We traverse the rope tree without using recursion, since we'd run out of
  215. // stack space quickly when handling a long sequence of unresolved concatenations.
  216. Vector<PrimitiveString const*> stack;
  217. stack.append(m_rhs);
  218. stack.append(m_lhs);
  219. while (!stack.is_empty()) {
  220. auto const* current = stack.take_last();
  221. if (current->m_is_rope) {
  222. stack.append(current->m_rhs);
  223. stack.append(current->m_lhs);
  224. continue;
  225. }
  226. pieces.append(current);
  227. }
  228. if (preference == EncodingPreference::UTF16) {
  229. // The caller wants a UTF-16 string, so we can simply concatenate all the pieces
  230. // into a UTF-16 code unit buffer and create a Utf16String from it.
  231. Utf16Data code_units;
  232. for (auto const* current : pieces)
  233. code_units.extend(MUST(current->utf16_string()).string());
  234. m_utf16_string = MUST(Utf16String::create(vm, move(code_units)));
  235. m_is_rope = false;
  236. m_lhs = nullptr;
  237. m_rhs = nullptr;
  238. return;
  239. }
  240. // Now that we have all the pieces, we can concatenate them using a StringBuilder.
  241. StringBuilder builder;
  242. // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
  243. PrimitiveString const* previous = nullptr;
  244. for (auto const* current : pieces) {
  245. if (!previous) {
  246. // This is the very first piece, just append it and continue.
  247. builder.append(MUST(current->utf8_string()));
  248. previous = current;
  249. continue;
  250. }
  251. // Get the UTF-8 representations for both strings.
  252. auto current_string_as_utf8 = MUST(current->utf8_string_view());
  253. auto previous_string_as_utf8 = MUST(previous->utf8_string_view());
  254. // NOTE: Now we need to look at the end of the previous string and the start
  255. // of the current string, to see if they should be combined into a surrogate.
  256. // Surrogates encoded as UTF-8 are 3 bytes.
  257. if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
  258. builder.append(current_string_as_utf8);
  259. previous = current;
  260. continue;
  261. }
  262. // Might the previous string end with a UTF-8 encoded surrogate?
  263. if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
  264. // If not, just append the current string and continue.
  265. builder.append(current_string_as_utf8);
  266. previous = current;
  267. continue;
  268. }
  269. // Might the current string begin with a UTF-8 encoded surrogate?
  270. if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
  271. // If not, just append the current string and continue.
  272. builder.append(current_string_as_utf8);
  273. previous = current;
  274. continue;
  275. }
  276. auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
  277. auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
  278. if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
  279. builder.append(current_string_as_utf8);
  280. previous = current;
  281. continue;
  282. }
  283. // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
  284. builder.trim(3);
  285. builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
  286. // Append the remaining part of the current string.
  287. builder.append(current_string_as_utf8.substring_view(3));
  288. previous = current;
  289. }
  290. m_utf8_string = MUST(builder.to_string());
  291. m_is_rope = false;
  292. m_lhs = nullptr;
  293. m_rhs = nullptr;
  294. }
  295. }