PrimitiveString.cpp 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Utf16View.h>
  8. #include <LibJS/Runtime/AbstractOperations.h>
  9. #include <LibJS/Runtime/GlobalObject.h>
  10. #include <LibJS/Runtime/PrimitiveString.h>
  11. #include <LibJS/Runtime/PropertyKey.h>
  12. #include <LibJS/Runtime/VM.h>
  13. #include <LibJS/Runtime/Value.h>
  14. namespace JS {
  15. PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
  16. : m_is_rope(true)
  17. , m_lhs(&lhs)
  18. , m_rhs(&rhs)
  19. {
  20. }
  21. PrimitiveString::PrimitiveString(String string)
  22. : m_has_utf8_string(true)
  23. , m_utf8_string(move(string))
  24. {
  25. }
  26. PrimitiveString::PrimitiveString(Utf16String string)
  27. : m_has_utf16_string(true)
  28. , m_utf16_string(move(string))
  29. {
  30. }
  31. PrimitiveString::~PrimitiveString()
  32. {
  33. vm().string_cache().remove(m_utf8_string);
  34. }
  35. void PrimitiveString::visit_edges(Cell::Visitor& visitor)
  36. {
  37. Cell::visit_edges(visitor);
  38. if (m_is_rope) {
  39. visitor.visit(m_lhs);
  40. visitor.visit(m_rhs);
  41. }
  42. }
  43. bool PrimitiveString::is_empty() const
  44. {
  45. if (m_is_rope) {
  46. // NOTE: We never make an empty rope string.
  47. return false;
  48. }
  49. if (m_has_utf16_string)
  50. return m_utf16_string.is_empty();
  51. if (m_has_utf8_string)
  52. return m_utf8_string.is_empty();
  53. VERIFY_NOT_REACHED();
  54. }
  55. String const& PrimitiveString::string() const
  56. {
  57. resolve_rope_if_needed();
  58. if (!m_has_utf8_string) {
  59. m_utf8_string = m_utf16_string.to_utf8();
  60. m_has_utf8_string = true;
  61. }
  62. return m_utf8_string;
  63. }
  64. Utf16String const& PrimitiveString::utf16_string() const
  65. {
  66. resolve_rope_if_needed();
  67. if (!m_has_utf16_string) {
  68. m_utf16_string = Utf16String(m_utf8_string);
  69. m_has_utf16_string = true;
  70. }
  71. return m_utf16_string;
  72. }
  73. Utf16View PrimitiveString::utf16_string_view() const
  74. {
  75. return utf16_string().view();
  76. }
  77. Optional<Value> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
  78. {
  79. if (property_key.is_symbol())
  80. return {};
  81. if (property_key.is_string()) {
  82. if (property_key.as_string() == vm.names.length.as_string()) {
  83. auto length = utf16_string().length_in_code_units();
  84. return Value(static_cast<double>(length));
  85. }
  86. }
  87. auto index = canonical_numeric_index_string(property_key, CanonicalIndexMode::IgnoreNumericRoundtrip);
  88. if (!index.is_index())
  89. return {};
  90. auto str = utf16_string_view();
  91. auto length = str.length_in_code_units();
  92. if (length <= index.as_index())
  93. return {};
  94. return js_string(vm, str.substring_view(index.as_index(), 1));
  95. }
  96. PrimitiveString* js_string(Heap& heap, Utf16View const& view)
  97. {
  98. return js_string(heap, Utf16String(view));
  99. }
  100. PrimitiveString* js_string(VM& vm, Utf16View const& view)
  101. {
  102. return js_string(vm.heap(), view);
  103. }
  104. PrimitiveString* js_string(Heap& heap, Utf16String string)
  105. {
  106. if (string.is_empty())
  107. return &heap.vm().empty_string();
  108. if (string.length_in_code_units() == 1) {
  109. u16 code_unit = string.code_unit_at(0);
  110. if (is_ascii(code_unit))
  111. return &heap.vm().single_ascii_character_string(static_cast<u8>(code_unit));
  112. }
  113. return heap.allocate_without_realm<PrimitiveString>(move(string));
  114. }
  115. PrimitiveString* js_string(VM& vm, Utf16String string)
  116. {
  117. return js_string(vm.heap(), move(string));
  118. }
  119. PrimitiveString* js_string(Heap& heap, String string)
  120. {
  121. if (string.is_empty())
  122. return &heap.vm().empty_string();
  123. if (string.length() == 1) {
  124. auto ch = static_cast<u8>(string.characters()[0]);
  125. if (is_ascii(ch))
  126. return &heap.vm().single_ascii_character_string(ch);
  127. }
  128. auto& string_cache = heap.vm().string_cache();
  129. auto it = string_cache.find(string);
  130. if (it == string_cache.end()) {
  131. auto* new_string = heap.allocate_without_realm<PrimitiveString>(string);
  132. string_cache.set(move(string), new_string);
  133. return new_string;
  134. }
  135. return it->value;
  136. }
  137. PrimitiveString* js_string(VM& vm, String string)
  138. {
  139. return js_string(vm.heap(), move(string));
  140. }
  141. PrimitiveString* js_rope_string(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
  142. {
  143. // We're here to concatenate two strings into a new rope string.
  144. // However, if any of them are empty, no rope is required.
  145. bool lhs_empty = lhs.is_empty();
  146. bool rhs_empty = rhs.is_empty();
  147. if (lhs_empty && rhs_empty)
  148. return &vm.empty_string();
  149. if (lhs_empty)
  150. return &rhs;
  151. if (rhs_empty)
  152. return &lhs;
  153. return vm.heap().allocate_without_realm<PrimitiveString>(lhs, rhs);
  154. }
  155. void PrimitiveString::resolve_rope_if_needed() const
  156. {
  157. if (!m_is_rope)
  158. return;
  159. // NOTE: Special case for two concatenated UTF-16 strings.
  160. // This is here as an optimization, although I'm unsure how valuable it is.
  161. if (m_lhs->has_utf16_string() && m_rhs->has_utf16_string()) {
  162. auto const& lhs_string = m_lhs->utf16_string();
  163. auto const& rhs_string = m_rhs->utf16_string();
  164. Vector<u16, 1> combined;
  165. combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units());
  166. combined.extend(lhs_string.string());
  167. combined.extend(rhs_string.string());
  168. m_utf16_string = Utf16String(move(combined));
  169. m_has_utf16_string = true;
  170. m_is_rope = false;
  171. m_lhs = nullptr;
  172. m_rhs = nullptr;
  173. return;
  174. }
  175. // This vector will hold all the pieces of the rope that need to be assembled
  176. // into the resolved string.
  177. Vector<PrimitiveString const*> pieces;
  178. // NOTE: We traverse the rope tree without using recursion, since we'd run out of
  179. // stack space quickly when handling a long sequence of unresolved concatenations.
  180. Vector<PrimitiveString const*> stack;
  181. stack.append(m_rhs);
  182. stack.append(m_lhs);
  183. while (!stack.is_empty()) {
  184. auto* current = stack.take_last();
  185. if (current->m_is_rope) {
  186. stack.append(current->m_rhs);
  187. stack.append(current->m_lhs);
  188. continue;
  189. }
  190. pieces.append(current);
  191. }
  192. // Now that we have all the pieces, we can concatenate them using a StringBuilder.
  193. StringBuilder builder;
  194. // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
  195. PrimitiveString const* previous = nullptr;
  196. for (auto const* current : pieces) {
  197. if (!previous) {
  198. // This is the very first piece, just append it and continue.
  199. builder.append(current->string());
  200. previous = current;
  201. continue;
  202. }
  203. // Get the UTF-8 representations for both strings.
  204. auto const& previous_string_as_utf8 = previous->string();
  205. auto const& current_string_as_utf8 = current->string();
  206. // NOTE: Now we need to look at the end of the previous string and the start
  207. // of the current string, to see if they should be combined into a surrogate.
  208. // Surrogates encoded as UTF-8 are 3 bytes.
  209. if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
  210. builder.append(current->string());
  211. previous = current;
  212. continue;
  213. }
  214. // Might the previous string end with a UTF-8 encoded surrogate?
  215. if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
  216. // If not, just append the current string and continue.
  217. builder.append(current->string());
  218. previous = current;
  219. continue;
  220. }
  221. // Might the current string begin with a UTF-8 encoded surrogate?
  222. if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
  223. // If not, just append the current string and continue.
  224. builder.append(current->string());
  225. previous = current;
  226. continue;
  227. }
  228. auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
  229. auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
  230. if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
  231. builder.append(current->string());
  232. previous = current;
  233. continue;
  234. }
  235. // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
  236. builder.trim(3);
  237. builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
  238. // Append the remaining part of the current string.
  239. builder.append(current_string_as_utf8.substring_view(3));
  240. previous = current;
  241. }
  242. m_utf8_string = builder.to_string();
  243. m_has_utf8_string = true;
  244. m_is_rope = false;
  245. m_lhs = nullptr;
  246. m_rhs = nullptr;
  247. }
  248. }