String.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. /*
  2. * Copyright (c) 2018-2022, Andreas Kling <andreas@ladybird.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Array.h>
  7. #include <AK/Checked.h>
  8. #include <AK/Endian.h>
  9. #include <AK/FlyString.h>
  10. #include <AK/Format.h>
  11. #include <AK/MemMem.h>
  12. #include <AK/Stream.h>
  13. #include <AK/String.h>
  14. #include <AK/Utf16View.h>
  15. #include <AK/Vector.h>
  16. #include <stdlib.h>
  17. #include <simdutf.h>
  18. namespace AK {
  19. String String::from_utf8_with_replacement_character(StringView view, WithBOMHandling with_bom_handling)
  20. {
  21. if (auto bytes = view.bytes(); with_bom_handling == WithBOMHandling::Yes && bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF)
  22. view = view.substring_view(3);
  23. if (Utf8View(view).validate())
  24. return String::from_utf8_without_validation(view.bytes());
  25. StringBuilder builder;
  26. for (auto c : Utf8View { view })
  27. builder.append_code_point(c);
  28. return builder.to_string_without_validation();
  29. }
  30. String String::from_utf8_without_validation(ReadonlyBytes bytes)
  31. {
  32. String result;
  33. MUST(result.replace_with_new_string(bytes.size(), [&](Bytes buffer) {
  34. bytes.copy_to(buffer);
  35. return ErrorOr<void> {};
  36. }));
  37. return result;
  38. }
  39. ErrorOr<String> String::from_utf8(StringView view)
  40. {
  41. if (!Utf8View { view }.validate())
  42. return Error::from_string_literal("String::from_utf8: Input was not valid UTF-8");
  43. String result;
  44. TRY(result.replace_with_new_string(view.length(), [&](Bytes buffer) {
  45. view.bytes().copy_to(buffer);
  46. return ErrorOr<void> {};
  47. }));
  48. return result;
  49. }
  50. ErrorOr<String> String::from_utf16(Utf16View const& utf16)
  51. {
  52. if (!utf16.validate())
  53. return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16");
  54. if (utf16.is_empty())
  55. return String {};
  56. String result;
  57. auto utf8_length = [&]() {
  58. switch (utf16.endianness()) {
  59. case Endianness::Host:
  60. return simdutf::utf8_length_from_utf16(utf16.char_data(), utf16.length_in_code_units());
  61. case Endianness::Big:
  62. return simdutf::utf8_length_from_utf16be(utf16.char_data(), utf16.length_in_code_units());
  63. case Endianness::Little:
  64. return simdutf::utf8_length_from_utf16le(utf16.char_data(), utf16.length_in_code_units());
  65. }
  66. VERIFY_NOT_REACHED();
  67. }();
  68. TRY(result.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
  69. [[maybe_unused]] auto result = [&]() {
  70. switch (utf16.endianness()) {
  71. case Endianness::Host:
  72. return simdutf::convert_utf16_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
  73. case Endianness::Big:
  74. return simdutf::convert_utf16be_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
  75. case Endianness::Little:
  76. return simdutf::convert_utf16le_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
  77. }
  78. VERIFY_NOT_REACHED();
  79. }();
  80. ASSERT(result == buffer.size());
  81. return {};
  82. }));
  83. return result;
  84. }
  85. ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
  86. {
  87. String result;
  88. TRY(result.replace_with_new_string(byte_count, [&](Bytes buffer) -> ErrorOr<void> {
  89. TRY(stream.read_until_filled(buffer));
  90. if (!Utf8View { StringView { buffer } }.validate())
  91. return Error::from_string_literal("String::from_stream: Input was not valid UTF-8");
  92. return {};
  93. }));
  94. return result;
  95. }
  96. ErrorOr<String> String::from_string_builder(Badge<StringBuilder>, StringBuilder& builder)
  97. {
  98. if (!Utf8View { builder.string_view() }.validate())
  99. return Error::from_string_literal("String::from_string_builder: Input was not valid UTF-8");
  100. String result;
  101. result.replace_with_string_builder(builder);
  102. return result;
  103. }
  104. String String::from_string_builder_without_validation(Badge<StringBuilder>, StringBuilder& builder)
  105. {
  106. String result;
  107. result.replace_with_string_builder(builder);
  108. return result;
  109. }
  110. ErrorOr<String> String::repeated(u32 code_point, size_t count)
  111. {
  112. VERIFY(is_unicode(code_point));
  113. Array<u8, 4> code_point_as_utf8;
  114. size_t i = 0;
  115. size_t code_point_byte_length = UnicodeUtils::code_point_to_utf8(code_point, [&](auto byte) {
  116. code_point_as_utf8[i++] = static_cast<u8>(byte);
  117. });
  118. auto total_byte_count = code_point_byte_length * count;
  119. String result;
  120. TRY(result.replace_with_new_string(total_byte_count, [&](Bytes buffer) {
  121. if (code_point_byte_length == 1) {
  122. buffer.fill(code_point_as_utf8[0]);
  123. } else {
  124. for (i = 0; i < count; ++i)
  125. memcpy(buffer.data() + (i * code_point_byte_length), code_point_as_utf8.data(), code_point_byte_length);
  126. }
  127. return ErrorOr<void> {};
  128. }));
  129. return result;
  130. }
  131. StringView String::bytes_as_string_view() const&
  132. {
  133. return StringView(bytes());
  134. }
  135. bool String::is_empty() const
  136. {
  137. return bytes().size() == 0;
  138. }
  139. ErrorOr<String> String::vformatted(StringView fmtstr, TypeErasedFormatParams& params)
  140. {
  141. StringBuilder builder;
  142. TRY(vformat(builder, fmtstr, params));
  143. return builder.to_string();
  144. }
  145. ErrorOr<Vector<String>> String::split(u32 separator, SplitBehavior split_behavior) const
  146. {
  147. return split_limit(separator, 0, split_behavior);
  148. }
  149. ErrorOr<Vector<String>> String::split_limit(u32 separator, size_t limit, SplitBehavior split_behavior) const
  150. {
  151. Vector<String> result;
  152. if (is_empty())
  153. return result;
  154. bool keep_empty = has_flag(split_behavior, SplitBehavior::KeepEmpty);
  155. size_t substring_start = 0;
  156. for (auto it = code_points().begin(); it != code_points().end() && (result.size() + 1) != limit; ++it) {
  157. u32 code_point = *it;
  158. if (code_point == separator) {
  159. size_t substring_length = code_points().iterator_offset(it) - substring_start;
  160. if (substring_length != 0 || keep_empty)
  161. TRY(result.try_append(TRY(substring_from_byte_offset_with_shared_superstring(substring_start, substring_length))));
  162. substring_start = code_points().iterator_offset(it) + it.underlying_code_point_length_in_bytes();
  163. }
  164. }
  165. size_t tail_length = code_points().byte_length() - substring_start;
  166. if (tail_length != 0 || keep_empty)
  167. TRY(result.try_append(TRY(substring_from_byte_offset_with_shared_superstring(substring_start, tail_length))));
  168. return result;
  169. }
  170. Optional<size_t> String::find_byte_offset(u32 code_point, size_t from_byte_offset) const
  171. {
  172. auto code_points = this->code_points();
  173. if (from_byte_offset >= code_points.byte_length())
  174. return {};
  175. for (auto it = code_points.iterator_at_byte_offset(from_byte_offset); it != code_points.end(); ++it) {
  176. if (*it == code_point)
  177. return code_points.byte_offset_of(it);
  178. }
  179. return {};
  180. }
  181. Optional<size_t> String::find_byte_offset(StringView substring, size_t from_byte_offset) const
  182. {
  183. auto view = bytes_as_string_view();
  184. if (from_byte_offset >= view.length())
  185. return {};
  186. auto index = memmem_optional(
  187. view.characters_without_null_termination() + from_byte_offset, view.length() - from_byte_offset,
  188. substring.characters_without_null_termination(), substring.length());
  189. if (index.has_value())
  190. return *index + from_byte_offset;
  191. return {};
  192. }
  193. bool String::operator==(FlyString const& other) const
  194. {
  195. return static_cast<StringBase const&>(*this) == other.data({});
  196. }
  197. bool String::operator==(StringView other) const
  198. {
  199. return bytes_as_string_view() == other;
  200. }
  201. ErrorOr<String> String::substring_from_byte_offset(size_t start, size_t byte_count) const
  202. {
  203. if (!byte_count)
  204. return String {};
  205. return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
  206. }
  207. ErrorOr<String> String::substring_from_byte_offset(size_t start) const
  208. {
  209. VERIFY(start <= bytes_as_string_view().length());
  210. return substring_from_byte_offset(start, bytes_as_string_view().length() - start);
  211. }
  212. ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const
  213. {
  214. return String { TRY(StringBase::substring_from_byte_offset_with_shared_superstring(start, byte_count)) };
  215. }
  216. ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start) const
  217. {
  218. VERIFY(start <= bytes_as_string_view().length());
  219. return substring_from_byte_offset_with_shared_superstring(start, bytes_as_string_view().length() - start);
  220. }
  221. bool String::operator==(char const* c_string) const
  222. {
  223. return bytes_as_string_view() == c_string;
  224. }
  225. u32 String::ascii_case_insensitive_hash() const
  226. {
  227. return case_insensitive_string_hash(reinterpret_cast<char const*>(bytes().data()), bytes().size());
  228. }
  229. Utf8View String::code_points() const&
  230. {
  231. return Utf8View(bytes_as_string_view());
  232. }
  233. ErrorOr<void> Formatter<String>::format(FormatBuilder& builder, String const& utf8_string)
  234. {
  235. return Formatter<StringView>::format(builder, utf8_string.bytes_as_string_view());
  236. }
  237. ErrorOr<String> String::replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const
  238. {
  239. return StringUtils::replace(*this, needle, replacement, replace_mode);
  240. }
  241. ErrorOr<String> String::reverse() const
  242. {
  243. // FIXME: This handles multi-byte code points, but not e.g. grapheme clusters.
  244. // FIXME: We could avoid allocating a temporary vector if Utf8View supports reverse iteration.
  245. auto code_point_length = code_points().length();
  246. Vector<u32> code_points;
  247. TRY(code_points.try_ensure_capacity(code_point_length));
  248. for (auto code_point : this->code_points())
  249. code_points.unchecked_append(code_point);
  250. auto builder = TRY(StringBuilder::create(code_point_length * sizeof(u32)));
  251. while (!code_points.is_empty())
  252. TRY(builder.try_append_code_point(code_points.take_last()));
  253. return builder.to_string();
  254. }
  255. ErrorOr<String> String::trim(Utf8View const& code_points_to_trim, TrimMode mode) const
  256. {
  257. auto trimmed = code_points().trim(code_points_to_trim, mode);
  258. return String::from_utf8(trimmed.as_string());
  259. }
  260. ErrorOr<String> String::trim(StringView code_points_to_trim, TrimMode mode) const
  261. {
  262. return trim(Utf8View { code_points_to_trim }, mode);
  263. }
  264. ErrorOr<String> String::trim_ascii_whitespace(TrimMode mode) const
  265. {
  266. return trim(" \n\t\v\f\r"sv, mode);
  267. }
  268. bool String::contains(StringView needle, CaseSensitivity case_sensitivity) const
  269. {
  270. return StringUtils::contains(bytes_as_string_view(), needle, case_sensitivity);
  271. }
  272. bool String::contains(u32 needle, CaseSensitivity case_sensitivity) const
  273. {
  274. auto needle_as_string = String::from_code_point(needle);
  275. return contains(needle_as_string.bytes_as_string_view(), case_sensitivity);
  276. }
  277. bool String::starts_with(u32 code_point) const
  278. {
  279. if (is_empty())
  280. return false;
  281. return *code_points().begin() == code_point;
  282. }
  283. bool String::starts_with_bytes(StringView bytes, CaseSensitivity case_sensitivity) const
  284. {
  285. return bytes_as_string_view().starts_with(bytes, case_sensitivity);
  286. }
  287. bool String::ends_with(u32 code_point) const
  288. {
  289. if (is_empty())
  290. return false;
  291. u32 last_code_point = 0;
  292. for (auto it = code_points().begin(); it != code_points().end(); ++it)
  293. last_code_point = *it;
  294. return last_code_point == code_point;
  295. }
  296. bool String::ends_with_bytes(StringView bytes, CaseSensitivity case_sensitivity) const
  297. {
  298. return bytes_as_string_view().ends_with(bytes, case_sensitivity);
  299. }
  300. unsigned Traits<String>::hash(String const& string)
  301. {
  302. return string.hash();
  303. }
  304. ByteString String::to_byte_string() const
  305. {
  306. return ByteString(bytes_as_string_view());
  307. }
  308. ErrorOr<String> String::from_byte_string(ByteString const& byte_string)
  309. {
  310. return String::from_utf8(byte_string.view());
  311. }
  312. bool String::equals_ignoring_ascii_case(StringView other) const
  313. {
  314. return StringUtils::equals_ignoring_ascii_case(bytes_as_string_view(), other);
  315. }
  316. ErrorOr<String> String::repeated(String const& input, size_t count)
  317. {
  318. if (Checked<u32>::multiplication_would_overflow(count, input.bytes().size()))
  319. return Error::from_errno(EOVERFLOW);
  320. String result;
  321. size_t input_size = input.bytes().size();
  322. TRY(result.replace_with_new_string(count * input_size, [&](Bytes buffer) {
  323. if (input_size == 1) {
  324. buffer.fill(input.bytes().first());
  325. } else {
  326. for (size_t i = 0; i < count; ++i)
  327. input.bytes().copy_to(buffer.slice(i * input_size, input_size));
  328. }
  329. return ErrorOr<void> {};
  330. }));
  331. return result;
  332. }
  333. }