PrimitiveString.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
  3. * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/FlyString.h>
  9. #include <AK/StringBuilder.h>
  10. #include <AK/Utf16View.h>
  11. #include <AK/Utf8View.h>
  12. #include <LibJS/Runtime/AbstractOperations.h>
  13. #include <LibJS/Runtime/GlobalObject.h>
  14. #include <LibJS/Runtime/PrimitiveString.h>
  15. #include <LibJS/Runtime/PropertyKey.h>
  16. #include <LibJS/Runtime/VM.h>
  17. #include <LibJS/Runtime/Value.h>
  18. namespace JS {
  // Expands the JS_DEFINE_ALLOCATOR macro for PrimitiveString.
  // NOTE(review): the macro body lives outside this file; presumably it emits the
  //               out-of-line definition of the type's cell allocator - confirm.
  JS_DEFINE_ALLOCATOR(PrimitiveString);
  20. PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
  21. : m_is_rope(true)
  22. , m_lhs(&lhs)
  23. , m_rhs(&rhs)
  24. {
  25. }
  26. PrimitiveString::PrimitiveString(String string)
  27. : m_utf8_string(move(string))
  28. {
  29. }
  30. PrimitiveString::PrimitiveString(ByteString string)
  31. : m_byte_string(move(string))
  32. {
  33. }
  34. PrimitiveString::PrimitiveString(Utf16String string)
  35. : m_utf16_string(move(string))
  36. {
  37. }
  38. PrimitiveString::~PrimitiveString()
  39. {
  40. if (has_utf8_string())
  41. vm().string_cache().remove(*m_utf8_string);
  42. if (has_utf16_string())
  43. vm().utf16_string_cache().remove(*m_utf16_string);
  44. if (has_byte_string())
  45. vm().byte_string_cache().remove(*m_byte_string);
  46. }
  47. void PrimitiveString::visit_edges(Cell::Visitor& visitor)
  48. {
  49. Base::visit_edges(visitor);
  50. if (m_is_rope) {
  51. visitor.visit(m_lhs);
  52. visitor.visit(m_rhs);
  53. }
  54. }
  55. bool PrimitiveString::is_empty() const
  56. {
  57. if (m_is_rope) {
  58. // NOTE: We never make an empty rope string.
  59. return false;
  60. }
  61. if (has_utf16_string())
  62. return m_utf16_string->is_empty();
  63. if (has_utf8_string())
  64. return m_utf8_string->is_empty();
  65. if (has_byte_string())
  66. return m_byte_string->is_empty();
  67. VERIFY_NOT_REACHED();
  68. }
  69. String PrimitiveString::utf8_string() const
  70. {
  71. resolve_rope_if_needed(EncodingPreference::UTF8);
  72. if (!has_utf8_string()) {
  73. if (has_byte_string())
  74. m_utf8_string = MUST(String::from_byte_string(*m_byte_string));
  75. else if (has_utf16_string())
  76. m_utf8_string = m_utf16_string->to_utf8();
  77. else
  78. VERIFY_NOT_REACHED();
  79. }
  80. return *m_utf8_string;
  81. }
  82. StringView PrimitiveString::utf8_string_view() const
  83. {
  84. (void)utf8_string();
  85. return m_utf8_string->bytes_as_string_view();
  86. }
  87. ByteString PrimitiveString::byte_string() const
  88. {
  89. resolve_rope_if_needed(EncodingPreference::UTF8);
  90. if (!has_byte_string()) {
  91. if (has_utf8_string())
  92. m_byte_string = m_utf8_string->to_byte_string();
  93. else if (has_utf16_string())
  94. m_byte_string = m_utf16_string->to_byte_string();
  95. else
  96. VERIFY_NOT_REACHED();
  97. }
  98. return *m_byte_string;
  99. }
  100. Utf16String PrimitiveString::utf16_string() const
  101. {
  102. resolve_rope_if_needed(EncodingPreference::UTF16);
  103. if (!has_utf16_string()) {
  104. if (has_utf8_string()) {
  105. m_utf16_string = Utf16String::create(m_utf8_string->bytes_as_string_view());
  106. } else {
  107. VERIFY(has_byte_string());
  108. m_utf16_string = Utf16String::create(*m_byte_string);
  109. }
  110. }
  111. return *m_utf16_string;
  112. }
  113. Utf16View PrimitiveString::utf16_string_view() const
  114. {
  115. (void)utf16_string();
  116. return m_utf16_string->view();
  117. }
  118. ThrowCompletionOr<Optional<Value>> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
  119. {
  120. if (property_key.is_symbol())
  121. return Optional<Value> {};
  122. if (property_key.is_string()) {
  123. if (property_key.as_string() == vm.names.length.as_string()) {
  124. auto length = utf16_string().length_in_code_units();
  125. return Value(static_cast<double>(length));
  126. }
  127. }
  128. auto index = canonical_numeric_index_string(property_key, CanonicalIndexMode::IgnoreNumericRoundtrip);
  129. if (!index.is_index())
  130. return Optional<Value> {};
  131. auto str = utf16_string_view();
  132. auto length = str.length_in_code_units();
  133. if (length <= index.as_index())
  134. return Optional<Value> {};
  135. return create(vm, Utf16String::create(str.substring_view(index.as_index(), 1)));
  136. }
  137. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, Utf16String string)
  138. {
  139. if (string.is_empty())
  140. return vm.empty_string();
  141. if (string.length_in_code_units() == 1) {
  142. u16 code_unit = string.code_unit_at(0);
  143. if (is_ascii(code_unit))
  144. return vm.single_ascii_character_string(static_cast<u8>(code_unit));
  145. }
  146. auto& string_cache = vm.utf16_string_cache();
  147. if (auto it = string_cache.find(string); it != string_cache.end())
  148. return *it->value;
  149. auto new_string = vm.heap().allocate<PrimitiveString>(string);
  150. string_cache.set(move(string), new_string);
  151. return *new_string;
  152. }
  153. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, String string)
  154. {
  155. if (string.is_empty())
  156. return vm.empty_string();
  157. if (auto bytes = string.bytes_as_string_view(); bytes.length() == 1) {
  158. auto ch = static_cast<u8>(bytes[0]);
  159. if (is_ascii(ch))
  160. return vm.single_ascii_character_string(ch);
  161. }
  162. auto& string_cache = vm.string_cache();
  163. if (auto it = string_cache.find(string); it != string_cache.end())
  164. return *it->value;
  165. auto new_string = vm.heap().allocate<PrimitiveString>(string);
  166. string_cache.set(move(string), new_string);
  167. return *new_string;
  168. }
  169. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, FlyString const& string)
  170. {
  171. return create(vm, string.to_string());
  172. }
  173. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, StringView string)
  174. {
  175. return create(vm, String::from_utf8(string).release_value());
  176. }
  177. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, ByteString string)
  178. {
  179. if (string.is_empty())
  180. return vm.empty_string();
  181. if (string.length() == 1) {
  182. auto ch = static_cast<u8>(string.characters()[0]);
  183. if (is_ascii(ch))
  184. return vm.single_ascii_character_string(ch);
  185. }
  186. auto& string_cache = vm.byte_string_cache();
  187. auto it = string_cache.find(string);
  188. if (it == string_cache.end()) {
  189. auto new_string = vm.heap().allocate<PrimitiveString>(string);
  190. string_cache.set(move(string), new_string);
  191. return *new_string;
  192. }
  193. return *it->value;
  194. }
  195. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedFlyString const& string)
  196. {
  197. return create(vm, ByteString { string });
  198. }
  199. NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
  200. {
  201. // We're here to concatenate two strings into a new rope string.
  202. // However, if any of them are empty, no rope is required.
  203. bool lhs_empty = lhs.is_empty();
  204. bool rhs_empty = rhs.is_empty();
  205. if (lhs_empty && rhs_empty)
  206. return vm.empty_string();
  207. if (lhs_empty)
  208. return rhs;
  209. if (rhs_empty)
  210. return lhs;
  211. return vm.heap().allocate<PrimitiveString>(lhs, rhs);
  212. }
  213. void PrimitiveString::resolve_rope_if_needed(EncodingPreference preference) const
  214. {
  215. if (!m_is_rope)
  216. return;
  217. // This vector will hold all the pieces of the rope that need to be assembled
  218. // into the resolved string.
  219. Vector<PrimitiveString const*> pieces;
  220. size_t approximate_length = 0;
  221. // NOTE: We traverse the rope tree without using recursion, since we'd run out of
  222. // stack space quickly when handling a long sequence of unresolved concatenations.
  223. Vector<PrimitiveString const*> stack;
  224. stack.append(m_rhs);
  225. stack.append(m_lhs);
  226. while (!stack.is_empty()) {
  227. auto const* current = stack.take_last();
  228. if (current->m_is_rope) {
  229. stack.append(current->m_rhs);
  230. stack.append(current->m_lhs);
  231. continue;
  232. }
  233. if (current->has_utf8_string())
  234. approximate_length += current->utf8_string_view().length();
  235. pieces.append(current);
  236. }
  237. if (preference == EncodingPreference::UTF16) {
  238. // The caller wants a UTF-16 string, so we can simply concatenate all the pieces
  239. // into a UTF-16 code unit buffer and create a Utf16String from it.
  240. Utf16Data code_units;
  241. for (auto const* current : pieces)
  242. code_units.extend(current->utf16_string().string());
  243. m_utf16_string = Utf16String::create(move(code_units));
  244. m_is_rope = false;
  245. m_lhs = nullptr;
  246. m_rhs = nullptr;
  247. return;
  248. }
  249. // Now that we have all the pieces, we can concatenate them using a StringBuilder.
  250. StringBuilder builder(approximate_length);
  251. // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
  252. PrimitiveString const* previous = nullptr;
  253. for (auto const* current : pieces) {
  254. if (!previous) {
  255. // This is the very first piece, just append it and continue.
  256. builder.append(current->utf8_string());
  257. previous = current;
  258. continue;
  259. }
  260. // Get the UTF-8 representations for both strings.
  261. auto current_string_as_utf8 = current->utf8_string_view();
  262. auto previous_string_as_utf8 = previous->utf8_string_view();
  263. // NOTE: Now we need to look at the end of the previous string and the start
  264. // of the current string, to see if they should be combined into a surrogate.
  265. // Surrogates encoded as UTF-8 are 3 bytes.
  266. if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
  267. builder.append(current_string_as_utf8);
  268. previous = current;
  269. continue;
  270. }
  271. // Might the previous string end with a UTF-8 encoded surrogate?
  272. if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
  273. // If not, just append the current string and continue.
  274. builder.append(current_string_as_utf8);
  275. previous = current;
  276. continue;
  277. }
  278. // Might the current string begin with a UTF-8 encoded surrogate?
  279. if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
  280. // If not, just append the current string and continue.
  281. builder.append(current_string_as_utf8);
  282. previous = current;
  283. continue;
  284. }
  285. auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
  286. auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
  287. if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
  288. builder.append(current_string_as_utf8);
  289. previous = current;
  290. continue;
  291. }
  292. // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
  293. builder.trim(3);
  294. builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
  295. // Append the remaining part of the current string.
  296. builder.append(current_string_as_utf8.substring_view(3));
  297. previous = current;
  298. }
  299. // NOTE: We've already produced valid UTF-8 above, so there's no need for additional validation.
  300. m_utf8_string = builder.to_string_without_validation();
  301. m_is_rope = false;
  302. m_lhs = nullptr;
  303. m_rhs = nullptr;
  304. }
  305. }