String.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. /*
  2. * Copyright (c) 2018-2022, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Array.h>
  7. #include <AK/Checked.h>
  8. #include <AK/FlyString.h>
  9. #include <AK/Format.h>
  10. #include <AK/MemMem.h>
  11. #include <AK/String.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/Utf8View.h>
  14. #include <AK/Vector.h>
  15. #include <stdlib.h>
  16. namespace AK {
  17. namespace Detail {
  18. class StringData final : public RefCounted<StringData> {
  19. public:
  20. static ErrorOr<NonnullRefPtr<StringData>> create_uninitialized(size_t, u8*& buffer);
  21. static ErrorOr<NonnullRefPtr<StringData>> create_substring(StringData const& superstring, size_t start, size_t byte_count);
  22. static ErrorOr<NonnullRefPtr<StringData>> from_utf8(char const* utf8_bytes, size_t);
  23. struct SubstringData {
  24. StringData const* superstring { nullptr };
  25. u32 start_offset { 0 };
  26. };
  27. void operator delete(void* ptr);
  28. ~StringData();
  29. SubstringData const& substring_data() const
  30. {
  31. return *reinterpret_cast<SubstringData const*>(m_bytes_or_substring_data);
  32. }
  33. // NOTE: There is no guarantee about null-termination.
  34. ReadonlyBytes bytes() const
  35. {
  36. if (m_substring) {
  37. auto const& data = substring_data();
  38. return data.superstring->bytes().slice(data.start_offset, m_byte_count);
  39. }
  40. return { &m_bytes_or_substring_data[0], m_byte_count };
  41. }
  42. StringView bytes_as_string_view() const { return { bytes() }; }
  43. bool operator==(StringData const& other) const
  44. {
  45. return bytes_as_string_view() == other.bytes_as_string_view();
  46. }
  47. unsigned hash() const
  48. {
  49. if (!m_has_hash)
  50. compute_hash();
  51. return m_hash;
  52. }
  53. bool is_fly_string() const { return m_is_fly_string; }
  54. void set_fly_string(bool is_fly_string) { m_is_fly_string = is_fly_string; }
  55. private:
  56. explicit StringData(size_t byte_count);
  57. StringData(StringData const& superstring, size_t start, size_t byte_count);
  58. void compute_hash() const;
  59. u32 m_byte_count { 0 };
  60. mutable unsigned m_hash { 0 };
  61. mutable bool m_has_hash { false };
  62. bool m_substring { false };
  63. bool m_is_fly_string { false };
  64. u8 m_bytes_or_substring_data[0];
  65. };
  66. void StringData::operator delete(void* ptr)
  67. {
  68. free(ptr);
  69. }
  70. StringData::StringData(size_t byte_count)
  71. : m_byte_count(byte_count)
  72. {
  73. }
  74. StringData::StringData(StringData const& superstring, size_t start, size_t byte_count)
  75. : m_byte_count(byte_count)
  76. , m_substring(true)
  77. {
  78. auto& data = const_cast<SubstringData&>(substring_data());
  79. data.start_offset = start;
  80. data.superstring = &superstring;
  81. superstring.ref();
  82. }
  83. StringData::~StringData()
  84. {
  85. if (m_is_fly_string)
  86. FlyString::did_destroy_fly_string_data({}, bytes_as_string_view());
  87. if (m_substring)
  88. substring_data().superstring->unref();
  89. }
  90. constexpr size_t allocation_size_for_string_data(size_t length)
  91. {
  92. return sizeof(StringData) + (sizeof(char) * length);
  93. }
  94. ErrorOr<NonnullRefPtr<StringData>> StringData::create_uninitialized(size_t byte_count, u8*& buffer)
  95. {
  96. VERIFY(byte_count);
  97. void* slot = malloc(allocation_size_for_string_data(byte_count));
  98. if (!slot) {
  99. return Error::from_errno(ENOMEM);
  100. }
  101. auto new_string_data = adopt_ref(*new (slot) StringData(byte_count));
  102. buffer = const_cast<u8*>(new_string_data->bytes().data());
  103. return new_string_data;
  104. }
  105. ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data, size_t byte_count)
  106. {
  107. // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
  108. VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
  109. Utf8View view(StringView(utf8_data, byte_count));
  110. if (!view.validate())
  111. return Error::from_string_literal("StringData::from_utf8: Input was not valid UTF-8");
  112. VERIFY(utf8_data);
  113. u8* buffer = nullptr;
  114. auto new_string_data = TRY(create_uninitialized(byte_count, buffer));
  115. memcpy(buffer, utf8_data, byte_count * sizeof(char));
  116. return new_string_data;
  117. }
  118. ErrorOr<NonnullRefPtr<StringData>> StringData::create_substring(StringData const& superstring, size_t start, size_t byte_count)
  119. {
  120. // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
  121. VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
  122. void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData));
  123. if (!slot) {
  124. return Error::from_errno(ENOMEM);
  125. }
  126. return adopt_ref(*new (slot) StringData(superstring, start, byte_count));
  127. }
  128. void StringData::compute_hash() const
  129. {
  130. auto bytes = this->bytes();
  131. if (bytes.size() == 0)
  132. m_hash = 0;
  133. else
  134. m_hash = string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
  135. m_has_hash = true;
  136. }
  137. }
  138. String::String(NonnullRefPtr<Detail::StringData> data)
  139. : m_data(&data.leak_ref())
  140. {
  141. }
  142. String::String(String const& other)
  143. : m_data(other.m_data)
  144. {
  145. if (!is_short_string())
  146. m_data->ref();
  147. }
  148. String::String(String&& other)
  149. : m_data(exchange(other.m_data, nullptr))
  150. {
  151. other.m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG;
  152. }
  153. String& String::operator=(String&& other)
  154. {
  155. if (!is_short_string())
  156. m_data->unref();
  157. m_data = exchange(other.m_data, nullptr);
  158. other.m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG;
  159. return *this;
  160. }
  161. String& String::operator=(String const& other)
  162. {
  163. if (&other != this) {
  164. m_data = other.m_data;
  165. if (!is_short_string())
  166. m_data->ref();
  167. }
  168. return *this;
  169. }
  170. void String::destroy_string()
  171. {
  172. if (!is_short_string())
  173. m_data->unref();
  174. }
  175. ErrorOr<String> String::from_utf8(StringView view)
  176. {
  177. if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) {
  178. ShortString short_string;
  179. if (!view.is_empty())
  180. memcpy(short_string.storage, view.characters_without_null_termination(), view.length());
  181. short_string.byte_count_and_short_string_flag = (view.length() << 1) | SHORT_STRING_FLAG;
  182. return String { short_string };
  183. }
  184. auto data = TRY(Detail::StringData::from_utf8(view.characters_without_null_termination(), view.length()));
  185. return String { move(data) };
  186. }
  187. ErrorOr<String> String::repeated(u32 code_point, size_t count)
  188. {
  189. VERIFY(is_unicode(code_point));
  190. Array<u8, 4> code_point_as_utf8;
  191. size_t i = 0;
  192. size_t code_point_byte_length = UnicodeUtils::code_point_to_utf8(code_point, [&](auto byte) {
  193. code_point_as_utf8[i++] = static_cast<u8>(byte);
  194. });
  195. auto copy_to_buffer = [&](u8* buffer) {
  196. if (code_point_byte_length == 1) {
  197. memset(buffer, code_point_as_utf8[0], count);
  198. return;
  199. }
  200. for (i = 0; i < count; ++i)
  201. memcpy(buffer + (i * code_point_byte_length), code_point_as_utf8.data(), code_point_byte_length);
  202. };
  203. auto total_byte_count = code_point_byte_length * count;
  204. if (total_byte_count <= MAX_SHORT_STRING_BYTE_COUNT) {
  205. ShortString short_string;
  206. copy_to_buffer(short_string.storage);
  207. short_string.byte_count_and_short_string_flag = (total_byte_count << 1) | SHORT_STRING_FLAG;
  208. return String { short_string };
  209. }
  210. u8* buffer = nullptr;
  211. auto new_string_data = TRY(Detail::StringData::create_uninitialized(total_byte_count, buffer));
  212. copy_to_buffer(buffer);
  213. return String { move(new_string_data) };
  214. }
  215. StringView String::bytes_as_string_view() const
  216. {
  217. return StringView(bytes());
  218. }
  219. ReadonlyBytes String::bytes() const
  220. {
  221. if (is_short_string())
  222. return m_short_string.bytes();
  223. return m_data->bytes();
  224. }
  225. bool String::is_empty() const
  226. {
  227. return bytes().size() == 0;
  228. }
  229. ErrorOr<String> String::vformatted(StringView fmtstr, TypeErasedFormatParams& params)
  230. {
  231. StringBuilder builder;
  232. TRY(vformat(builder, fmtstr, params));
  233. return builder.to_string();
  234. }
  235. ErrorOr<Vector<String>> String::split(u32 separator, SplitBehavior split_behavior) const
  236. {
  237. return split_limit(separator, 0, split_behavior);
  238. }
  239. ErrorOr<Vector<String>> String::split_limit(u32 separator, size_t limit, SplitBehavior split_behavior) const
  240. {
  241. Vector<String> result;
  242. if (is_empty())
  243. return result;
  244. bool keep_empty = has_flag(split_behavior, SplitBehavior::KeepEmpty);
  245. size_t substring_start = 0;
  246. for (auto it = code_points().begin(); it != code_points().end() && (result.size() + 1) != limit; ++it) {
  247. u32 code_point = *it;
  248. if (code_point == separator) {
  249. size_t substring_length = code_points().iterator_offset(it) - substring_start;
  250. if (substring_length != 0 || keep_empty)
  251. TRY(result.try_append(TRY(substring_from_byte_offset_with_shared_superstring(substring_start, substring_length))));
  252. substring_start = code_points().iterator_offset(it) + it.underlying_code_point_length_in_bytes();
  253. }
  254. }
  255. size_t tail_length = code_points().byte_length() - substring_start;
  256. if (tail_length != 0 || keep_empty)
  257. TRY(result.try_append(TRY(substring_from_byte_offset_with_shared_superstring(substring_start, tail_length))));
  258. return result;
  259. }
  260. Optional<size_t> String::find_byte_offset(u32 code_point, size_t from_byte_offset) const
  261. {
  262. auto code_points = this->code_points();
  263. if (from_byte_offset >= code_points.byte_length())
  264. return {};
  265. for (auto it = code_points.iterator_at_byte_offset(from_byte_offset); it != code_points.end(); ++it) {
  266. if (*it == code_point)
  267. return code_points.byte_offset_of(it);
  268. }
  269. return {};
  270. }
  271. Optional<size_t> String::find_byte_offset(StringView substring, size_t from_byte_offset) const
  272. {
  273. auto view = bytes_as_string_view();
  274. if (from_byte_offset >= view.length())
  275. return {};
  276. auto index = memmem_optional(
  277. view.characters_without_null_termination() + from_byte_offset, view.length() - from_byte_offset,
  278. substring.characters_without_null_termination(), substring.length());
  279. if (index.has_value())
  280. return *index + from_byte_offset;
  281. return {};
  282. }
  283. bool String::operator==(String const& other) const
  284. {
  285. if (is_short_string())
  286. return m_data == other.m_data;
  287. return bytes_as_string_view() == other.bytes_as_string_view();
  288. }
  289. bool String::operator==(FlyString const& other) const
  290. {
  291. if (reinterpret_cast<uintptr_t>(m_data) == other.data({}))
  292. return true;
  293. return bytes_as_string_view() == other.bytes_as_string_view();
  294. }
  295. bool String::operator==(StringView other) const
  296. {
  297. return bytes_as_string_view() == other;
  298. }
  299. ErrorOr<String> String::substring_from_byte_offset(size_t start, size_t byte_count) const
  300. {
  301. if (!byte_count)
  302. return String {};
  303. return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
  304. }
  305. ErrorOr<String> String::substring_from_byte_offset(size_t start) const
  306. {
  307. VERIFY(start <= bytes_as_string_view().length());
  308. return substring_from_byte_offset(start, bytes_as_string_view().length() - start);
  309. }
  310. ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const
  311. {
  312. if (!byte_count)
  313. return String {};
  314. if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT)
  315. return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
  316. return String { TRY(Detail::StringData::create_substring(*m_data, start, byte_count)) };
  317. }
  318. ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start) const
  319. {
  320. VERIFY(start <= bytes_as_string_view().length());
  321. return substring_from_byte_offset_with_shared_superstring(start, bytes_as_string_view().length() - start);
  322. }
  323. bool String::operator==(char const* c_string) const
  324. {
  325. return bytes_as_string_view() == c_string;
  326. }
  327. u32 String::hash() const
  328. {
  329. if (is_short_string()) {
  330. auto bytes = this->bytes();
  331. return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
  332. }
  333. return m_data->hash();
  334. }
  335. Utf8View String::code_points() const
  336. {
  337. return Utf8View(bytes_as_string_view());
  338. }
  339. ErrorOr<void> Formatter<String>::format(FormatBuilder& builder, String const& utf8_string)
  340. {
  341. return Formatter<StringView>::format(builder, utf8_string.bytes_as_string_view());
  342. }
  343. ErrorOr<String> String::replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const
  344. {
  345. return StringUtils::replace(*this, needle, replacement, replace_mode);
  346. }
  347. ErrorOr<String> String::reverse() const
  348. {
  349. // FIXME: This handles multi-byte code points, but not e.g. grapheme clusters.
  350. // FIXME: We could avoid allocating a temporary vector if Utf8View supports reverse iteration.
  351. auto code_point_length = code_points().length();
  352. Vector<u32> code_points;
  353. TRY(code_points.try_ensure_capacity(code_point_length));
  354. for (auto code_point : this->code_points())
  355. code_points.unchecked_append(code_point);
  356. auto builder = TRY(StringBuilder::create(code_point_length * sizeof(u32)));
  357. while (!code_points.is_empty())
  358. TRY(builder.try_append_code_point(code_points.take_last()));
  359. return builder.to_string();
  360. }
  361. bool String::contains(StringView needle, CaseSensitivity case_sensitivity) const
  362. {
  363. return StringUtils::contains(bytes_as_string_view(), needle, case_sensitivity);
  364. }
  365. bool String::contains(char needle, CaseSensitivity case_sensitivity) const
  366. {
  367. return contains(StringView { &needle, 1 }, case_sensitivity);
  368. }
  369. bool String::is_short_string() const
  370. {
  371. return has_short_string_bit(reinterpret_cast<uintptr_t>(m_data));
  372. }
  373. ReadonlyBytes String::ShortString::bytes() const
  374. {
  375. return { storage, byte_count() };
  376. }
  377. size_t String::ShortString::byte_count() const
  378. {
  379. return byte_count_and_short_string_flag >> 1;
  380. }
  381. unsigned Traits<String>::hash(String const& string)
  382. {
  383. return string.hash();
  384. }
  385. String String::fly_string_data_to_string(Badge<FlyString>, uintptr_t const& data)
  386. {
  387. if (has_short_string_bit(data))
  388. return String { *reinterpret_cast<ShortString const*>(&data) };
  389. auto const* string_data = reinterpret_cast<Detail::StringData const*>(data);
  390. return String { NonnullRefPtr<Detail::StringData>(*string_data) };
  391. }
  392. StringView String::fly_string_data_to_string_view(Badge<FlyString>, uintptr_t const& data)
  393. {
  394. if (has_short_string_bit(data)) {
  395. auto const* short_string = reinterpret_cast<ShortString const*>(&data);
  396. return short_string->bytes();
  397. }
  398. auto const* string_data = reinterpret_cast<Detail::StringData const*>(data);
  399. return string_data->bytes_as_string_view();
  400. }
  401. uintptr_t String::to_fly_string_data(Badge<FlyString>) const
  402. {
  403. return reinterpret_cast<uintptr_t>(m_data);
  404. }
  405. void String::ref_fly_string_data(Badge<FlyString>, uintptr_t data)
  406. {
  407. if (has_short_string_bit(data))
  408. return;
  409. auto const* string_data = reinterpret_cast<Detail::StringData const*>(data);
  410. string_data->ref();
  411. }
  412. void String::unref_fly_string_data(Badge<FlyString>, uintptr_t data)
  413. {
  414. if (has_short_string_bit(data))
  415. return;
  416. auto const* string_data = reinterpret_cast<Detail::StringData const*>(data);
  417. string_data->unref();
  418. }
  419. void String::did_create_fly_string(Badge<FlyString>) const
  420. {
  421. VERIFY(!is_short_string());
  422. m_data->set_fly_string(true);
  423. }
  424. DeprecatedString String::to_deprecated_string() const
  425. {
  426. return DeprecatedString(bytes_as_string_view());
  427. }
  428. ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string)
  429. {
  430. Utf8View view(deprecated_string);
  431. if (!view.validate())
  432. return Error::from_string_literal("String::from_deprecated_string: Input was not valid UTF-8");
  433. return String::from_utf8(deprecated_string.view());
  434. }
  435. }