StringUtils.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598
  1. /*
  2. * Copyright (c) 2018-2022, Andreas Kling <awesomekling@gmail.com>
  3. * Copyright (c) 2020, Fei Wu <f.eiwu@yahoo.com>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/MemMem.h>
  9. #include <AK/Optional.h>
  10. #include <AK/String.h>
  11. #include <AK/StringBuilder.h>
  12. #include <AK/StringUtils.h>
  13. #include <AK/StringView.h>
  14. #include <AK/Vector.h>
  15. #ifdef KERNEL
  16. # include <Kernel/StdLib.h>
  17. #else
  18. # include <AK/DeprecatedString.h>
  19. # include <AK/FloatingPointStringConversions.h>
  20. # include <string.h>
  21. #endif
  22. namespace AK {
  23. namespace StringUtils {
  24. bool matches(StringView str, StringView mask, CaseSensitivity case_sensitivity, Vector<MaskSpan>* match_spans)
  25. {
  26. auto record_span = [&match_spans](size_t start, size_t length) {
  27. if (match_spans)
  28. match_spans->append({ start, length });
  29. };
  30. if (str.is_null() || mask.is_null())
  31. return str.is_null() && mask.is_null();
  32. if (mask == "*"sv) {
  33. record_span(0, str.length());
  34. return true;
  35. }
  36. char const* string_ptr = str.characters_without_null_termination();
  37. char const* string_start = str.characters_without_null_termination();
  38. char const* string_end = string_ptr + str.length();
  39. char const* mask_ptr = mask.characters_without_null_termination();
  40. char const* mask_end = mask_ptr + mask.length();
  41. while (string_ptr < string_end && mask_ptr < mask_end) {
  42. auto string_start_ptr = string_ptr;
  43. switch (*mask_ptr) {
  44. case '*':
  45. if (mask_ptr == mask_end - 1) {
  46. record_span(string_ptr - string_start, string_end - string_ptr);
  47. return true;
  48. }
  49. while (string_ptr < string_end && !matches({ string_ptr, static_cast<size_t>(string_end - string_ptr) }, { mask_ptr + 1, static_cast<size_t>(mask_end - mask_ptr - 1) }, case_sensitivity))
  50. ++string_ptr;
  51. record_span(string_start_ptr - string_start, string_ptr - string_start_ptr);
  52. --string_ptr;
  53. break;
  54. case '?':
  55. record_span(string_ptr - string_start, 1);
  56. break;
  57. case '\\':
  58. // if backslash is last character in mask, just treat it as an exact match
  59. // otherwise use it as escape for next character
  60. if (mask_ptr + 1 < mask_end)
  61. ++mask_ptr;
  62. [[fallthrough]];
  63. default:
  64. auto p = *mask_ptr;
  65. auto ch = *string_ptr;
  66. if (case_sensitivity == CaseSensitivity::CaseSensitive ? p != ch : to_ascii_lowercase(p) != to_ascii_lowercase(ch))
  67. return false;
  68. break;
  69. }
  70. ++string_ptr;
  71. ++mask_ptr;
  72. }
  73. if (string_ptr == string_end) {
  74. // Allow ending '*' to contain nothing.
  75. while (mask_ptr != mask_end && *mask_ptr == '*') {
  76. record_span(string_ptr - string_start, 0);
  77. ++mask_ptr;
  78. }
  79. }
  80. return string_ptr == string_end && mask_ptr == mask_end;
  81. }
  82. template<typename T>
  83. Optional<T> convert_to_int(StringView str, TrimWhitespace trim_whitespace)
  84. {
  85. auto string = trim_whitespace == TrimWhitespace::Yes
  86. ? str.trim_whitespace()
  87. : str;
  88. if (string.is_empty())
  89. return {};
  90. T sign = 1;
  91. size_t i = 0;
  92. auto const characters = string.characters_without_null_termination();
  93. if (characters[0] == '-' || characters[0] == '+') {
  94. if (string.length() == 1)
  95. return {};
  96. i++;
  97. if (characters[0] == '-')
  98. sign = -1;
  99. }
  100. T value = 0;
  101. for (; i < string.length(); i++) {
  102. if (characters[i] < '0' || characters[i] > '9')
  103. return {};
  104. if (__builtin_mul_overflow(value, 10, &value))
  105. return {};
  106. if (__builtin_add_overflow(value, sign * (characters[i] - '0'), &value))
  107. return {};
  108. }
  109. return value;
  110. }
  111. template Optional<i8> convert_to_int(StringView str, TrimWhitespace);
  112. template Optional<i16> convert_to_int(StringView str, TrimWhitespace);
  113. template Optional<i32> convert_to_int(StringView str, TrimWhitespace);
  114. template Optional<long> convert_to_int(StringView str, TrimWhitespace);
  115. template Optional<long long> convert_to_int(StringView str, TrimWhitespace);
  116. template<typename T>
  117. Optional<T> convert_to_uint(StringView str, TrimWhitespace trim_whitespace)
  118. {
  119. auto string = trim_whitespace == TrimWhitespace::Yes
  120. ? str.trim_whitespace()
  121. : str;
  122. if (string.is_empty())
  123. return {};
  124. T value = 0;
  125. auto const characters = string.characters_without_null_termination();
  126. for (size_t i = 0; i < string.length(); i++) {
  127. if (characters[i] < '0' || characters[i] > '9')
  128. return {};
  129. if (__builtin_mul_overflow(value, 10, &value))
  130. return {};
  131. if (__builtin_add_overflow(value, characters[i] - '0', &value))
  132. return {};
  133. }
  134. return value;
  135. }
  136. template Optional<u8> convert_to_uint(StringView str, TrimWhitespace);
  137. template Optional<u16> convert_to_uint(StringView str, TrimWhitespace);
  138. template Optional<u32> convert_to_uint(StringView str, TrimWhitespace);
  139. template Optional<unsigned long> convert_to_uint(StringView str, TrimWhitespace);
  140. template Optional<unsigned long long> convert_to_uint(StringView str, TrimWhitespace);
  141. template<typename T>
  142. Optional<T> convert_to_uint_from_hex(StringView str, TrimWhitespace trim_whitespace)
  143. {
  144. auto string = trim_whitespace == TrimWhitespace::Yes
  145. ? str.trim_whitespace()
  146. : str;
  147. if (string.is_empty())
  148. return {};
  149. T value = 0;
  150. auto const count = string.length();
  151. const T upper_bound = NumericLimits<T>::max();
  152. for (size_t i = 0; i < count; i++) {
  153. char digit = string[i];
  154. u8 digit_val;
  155. if (value > (upper_bound >> 4))
  156. return {};
  157. if (digit >= '0' && digit <= '9') {
  158. digit_val = digit - '0';
  159. } else if (digit >= 'a' && digit <= 'f') {
  160. digit_val = 10 + (digit - 'a');
  161. } else if (digit >= 'A' && digit <= 'F') {
  162. digit_val = 10 + (digit - 'A');
  163. } else {
  164. return {};
  165. }
  166. value = (value << 4) + digit_val;
  167. }
  168. return value;
  169. }
  170. template Optional<u8> convert_to_uint_from_hex(StringView str, TrimWhitespace);
  171. template Optional<u16> convert_to_uint_from_hex(StringView str, TrimWhitespace);
  172. template Optional<u32> convert_to_uint_from_hex(StringView str, TrimWhitespace);
  173. template Optional<u64> convert_to_uint_from_hex(StringView str, TrimWhitespace);
  174. template<typename T>
  175. Optional<T> convert_to_uint_from_octal(StringView str, TrimWhitespace trim_whitespace)
  176. {
  177. auto string = trim_whitespace == TrimWhitespace::Yes
  178. ? str.trim_whitespace()
  179. : str;
  180. if (string.is_empty())
  181. return {};
  182. T value = 0;
  183. auto const count = string.length();
  184. const T upper_bound = NumericLimits<T>::max();
  185. for (size_t i = 0; i < count; i++) {
  186. char digit = string[i];
  187. u8 digit_val;
  188. if (value > (upper_bound >> 3))
  189. return {};
  190. if (digit >= '0' && digit <= '7') {
  191. digit_val = digit - '0';
  192. } else {
  193. return {};
  194. }
  195. value = (value << 3) + digit_val;
  196. }
  197. return value;
  198. }
  199. template Optional<u8> convert_to_uint_from_octal(StringView str, TrimWhitespace);
  200. template Optional<u16> convert_to_uint_from_octal(StringView str, TrimWhitespace);
  201. template Optional<u32> convert_to_uint_from_octal(StringView str, TrimWhitespace);
  202. template Optional<u64> convert_to_uint_from_octal(StringView str, TrimWhitespace);
  203. #ifndef KERNEL
  204. template<typename T>
  205. Optional<T> convert_to_floating_point(StringView str, TrimWhitespace trim_whitespace)
  206. {
  207. static_assert(IsSame<T, double> || IsSame<T, float>);
  208. auto string = trim_whitespace == TrimWhitespace::Yes
  209. ? str.trim_whitespace()
  210. : str;
  211. char const* start = string.characters_without_null_termination();
  212. return parse_floating_point_completely<T>(start, start + str.length());
  213. }
  214. template Optional<double> convert_to_floating_point(StringView str, TrimWhitespace);
  215. template Optional<float> convert_to_floating_point(StringView str, TrimWhitespace);
  216. #endif
  217. bool equals_ignoring_ascii_case(StringView a, StringView b)
  218. {
  219. if (a.length() != b.length())
  220. return false;
  221. for (size_t i = 0; i < a.length(); ++i) {
  222. if (to_ascii_lowercase(a.characters_without_null_termination()[i]) != to_ascii_lowercase(b.characters_without_null_termination()[i]))
  223. return false;
  224. }
  225. return true;
  226. }
  227. bool ends_with(StringView str, StringView end, CaseSensitivity case_sensitivity)
  228. {
  229. if (end.is_empty())
  230. return true;
  231. if (str.is_empty())
  232. return false;
  233. if (end.length() > str.length())
  234. return false;
  235. if (case_sensitivity == CaseSensitivity::CaseSensitive)
  236. return !memcmp(str.characters_without_null_termination() + (str.length() - end.length()), end.characters_without_null_termination(), end.length());
  237. auto str_chars = str.characters_without_null_termination();
  238. auto end_chars = end.characters_without_null_termination();
  239. size_t si = str.length() - end.length();
  240. for (size_t ei = 0; ei < end.length(); ++si, ++ei) {
  241. if (to_ascii_lowercase(str_chars[si]) != to_ascii_lowercase(end_chars[ei]))
  242. return false;
  243. }
  244. return true;
  245. }
  246. bool starts_with(StringView str, StringView start, CaseSensitivity case_sensitivity)
  247. {
  248. if (start.is_empty())
  249. return true;
  250. if (str.is_empty())
  251. return false;
  252. if (start.length() > str.length())
  253. return false;
  254. if (str.characters_without_null_termination() == start.characters_without_null_termination())
  255. return true;
  256. if (case_sensitivity == CaseSensitivity::CaseSensitive)
  257. return !memcmp(str.characters_without_null_termination(), start.characters_without_null_termination(), start.length());
  258. auto str_chars = str.characters_without_null_termination();
  259. auto start_chars = start.characters_without_null_termination();
  260. size_t si = 0;
  261. for (size_t starti = 0; starti < start.length(); ++si, ++starti) {
  262. if (to_ascii_lowercase(str_chars[si]) != to_ascii_lowercase(start_chars[starti]))
  263. return false;
  264. }
  265. return true;
  266. }
  267. bool contains(StringView str, StringView needle, CaseSensitivity case_sensitivity)
  268. {
  269. if (str.is_null() || needle.is_null() || str.is_empty() || needle.length() > str.length())
  270. return false;
  271. if (needle.is_empty())
  272. return true;
  273. auto str_chars = str.characters_without_null_termination();
  274. auto needle_chars = needle.characters_without_null_termination();
  275. if (case_sensitivity == CaseSensitivity::CaseSensitive)
  276. return memmem(str_chars, str.length(), needle_chars, needle.length()) != nullptr;
  277. auto needle_first = to_ascii_lowercase(needle_chars[0]);
  278. for (size_t si = 0; si < str.length(); si++) {
  279. if (to_ascii_lowercase(str_chars[si]) != needle_first)
  280. continue;
  281. for (size_t ni = 0; si + ni < str.length(); ni++) {
  282. if (to_ascii_lowercase(str_chars[si + ni]) != to_ascii_lowercase(needle_chars[ni])) {
  283. if (ni > 0)
  284. si += ni - 1;
  285. break;
  286. }
  287. if (ni + 1 == needle.length())
  288. return true;
  289. }
  290. }
  291. return false;
  292. }
  293. bool is_whitespace(StringView str)
  294. {
  295. return all_of(str, is_ascii_space);
  296. }
  297. StringView trim(StringView str, StringView characters, TrimMode mode)
  298. {
  299. size_t substring_start = 0;
  300. size_t substring_length = str.length();
  301. if (mode == TrimMode::Left || mode == TrimMode::Both) {
  302. for (size_t i = 0; i < str.length(); ++i) {
  303. if (substring_length == 0)
  304. return ""sv;
  305. if (!characters.contains(str[i]))
  306. break;
  307. ++substring_start;
  308. --substring_length;
  309. }
  310. }
  311. if (mode == TrimMode::Right || mode == TrimMode::Both) {
  312. for (size_t i = str.length(); i > 0; --i) {
  313. if (substring_length == 0)
  314. return ""sv;
  315. if (!characters.contains(str[i - 1]))
  316. break;
  317. --substring_length;
  318. }
  319. }
  320. return str.substring_view(substring_start, substring_length);
  321. }
  322. StringView trim_whitespace(StringView str, TrimMode mode)
  323. {
  324. return trim(str, " \n\t\v\f\r"sv, mode);
  325. }
  326. Optional<size_t> find(StringView haystack, char needle, size_t start)
  327. {
  328. if (start >= haystack.length())
  329. return {};
  330. for (size_t i = start; i < haystack.length(); ++i) {
  331. if (haystack[i] == needle)
  332. return i;
  333. }
  334. return {};
  335. }
  336. Optional<size_t> find(StringView haystack, StringView needle, size_t start)
  337. {
  338. if (start > haystack.length())
  339. return {};
  340. auto index = AK::memmem_optional(
  341. haystack.characters_without_null_termination() + start, haystack.length() - start,
  342. needle.characters_without_null_termination(), needle.length());
  343. return index.has_value() ? (*index + start) : index;
  344. }
  345. Optional<size_t> find_last(StringView haystack, char needle)
  346. {
  347. for (size_t i = haystack.length(); i > 0; --i) {
  348. if (haystack[i - 1] == needle)
  349. return i - 1;
  350. }
  351. return {};
  352. }
  353. Optional<size_t> find_last(StringView haystack, StringView needle)
  354. {
  355. for (size_t i = haystack.length(); i > 0; --i) {
  356. auto value = StringUtils::find(haystack, needle, i - 1);
  357. if (value.has_value())
  358. return value;
  359. }
  360. return {};
  361. }
  362. Optional<size_t> find_last_not(StringView haystack, char needle)
  363. {
  364. for (size_t i = haystack.length(); i > 0; --i) {
  365. if (haystack[i - 1] != needle)
  366. return i - 1;
  367. }
  368. return {};
  369. }
  370. Vector<size_t> find_all(StringView haystack, StringView needle)
  371. {
  372. Vector<size_t> positions;
  373. size_t current_position = 0;
  374. while (current_position <= haystack.length()) {
  375. auto maybe_position = AK::memmem_optional(
  376. haystack.characters_without_null_termination() + current_position, haystack.length() - current_position,
  377. needle.characters_without_null_termination(), needle.length());
  378. if (!maybe_position.has_value())
  379. break;
  380. positions.append(current_position + *maybe_position);
  381. current_position += *maybe_position + 1;
  382. }
  383. return positions;
  384. }
  385. Optional<size_t> find_any_of(StringView haystack, StringView needles, SearchDirection direction)
  386. {
  387. if (haystack.is_empty() || needles.is_empty())
  388. return {};
  389. if (direction == SearchDirection::Forward) {
  390. for (size_t i = 0; i < haystack.length(); ++i) {
  391. if (needles.contains(haystack[i]))
  392. return i;
  393. }
  394. } else if (direction == SearchDirection::Backward) {
  395. for (size_t i = haystack.length(); i > 0; --i) {
  396. if (needles.contains(haystack[i - 1]))
  397. return i - 1;
  398. }
  399. }
  400. return {};
  401. }
  402. #ifndef KERNEL
  403. DeprecatedString to_snakecase(StringView str)
  404. {
  405. auto should_insert_underscore = [&](auto i, auto current_char) {
  406. if (i == 0)
  407. return false;
  408. auto previous_ch = str[i - 1];
  409. if (is_ascii_lower_alpha(previous_ch) && is_ascii_upper_alpha(current_char))
  410. return true;
  411. if (i >= str.length() - 1)
  412. return false;
  413. auto next_ch = str[i + 1];
  414. if (is_ascii_upper_alpha(current_char) && is_ascii_lower_alpha(next_ch))
  415. return true;
  416. return false;
  417. };
  418. StringBuilder builder;
  419. for (size_t i = 0; i < str.length(); ++i) {
  420. auto ch = str[i];
  421. if (should_insert_underscore(i, ch))
  422. builder.append('_');
  423. builder.append_as_lowercase(ch);
  424. }
  425. return builder.to_deprecated_string();
  426. }
  427. DeprecatedString to_titlecase(StringView str)
  428. {
  429. StringBuilder builder;
  430. bool next_is_upper = true;
  431. for (auto ch : str) {
  432. if (next_is_upper)
  433. builder.append(to_ascii_uppercase(ch));
  434. else
  435. builder.append(to_ascii_lowercase(ch));
  436. next_is_upper = ch == ' ';
  437. }
  438. return builder.to_deprecated_string();
  439. }
  440. DeprecatedString invert_case(StringView str)
  441. {
  442. StringBuilder builder(str.length());
  443. for (auto ch : str) {
  444. if (is_ascii_lower_alpha(ch))
  445. builder.append(to_ascii_uppercase(ch));
  446. else
  447. builder.append(to_ascii_lowercase(ch));
  448. }
  449. return builder.to_deprecated_string();
  450. }
  451. DeprecatedString replace(StringView str, StringView needle, StringView replacement, ReplaceMode replace_mode)
  452. {
  453. if (str.is_empty())
  454. return str;
  455. Vector<size_t> positions;
  456. if (replace_mode == ReplaceMode::All) {
  457. positions = str.find_all(needle);
  458. if (!positions.size())
  459. return str;
  460. } else {
  461. auto pos = str.find(needle);
  462. if (!pos.has_value())
  463. return str;
  464. positions.append(pos.value());
  465. }
  466. StringBuilder replaced_string;
  467. size_t last_position = 0;
  468. for (auto& position : positions) {
  469. replaced_string.append(str.substring_view(last_position, position - last_position));
  470. replaced_string.append(replacement);
  471. last_position = position + needle.length();
  472. }
  473. replaced_string.append(str.substring_view(last_position, str.length() - last_position));
  474. return replaced_string.to_deprecated_string();
  475. }
  476. ErrorOr<String> replace(String const& haystack, StringView needle, StringView replacement, ReplaceMode replace_mode)
  477. {
  478. if (haystack.is_empty())
  479. return haystack;
  480. // FIXME: Propagate Vector allocation failures (or do this without putting positions in a vector)
  481. Vector<size_t> positions;
  482. if (replace_mode == ReplaceMode::All) {
  483. positions = haystack.bytes_as_string_view().find_all(needle);
  484. if (!positions.size())
  485. return haystack;
  486. } else {
  487. auto pos = haystack.bytes_as_string_view().find(needle);
  488. if (!pos.has_value())
  489. return haystack;
  490. positions.append(pos.value());
  491. }
  492. StringBuilder replaced_string;
  493. size_t last_position = 0;
  494. for (auto& position : positions) {
  495. replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, position - last_position));
  496. replaced_string.append(replacement);
  497. last_position = position + needle.length();
  498. }
  499. replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, haystack.bytes_as_string_view().length() - last_position));
  500. return replaced_string.to_string();
  501. }
  502. #endif
  503. // TODO: Benchmark against KMP (AK/MemMem.h) and switch over if it's faster for short strings too
  504. size_t count(StringView str, StringView needle)
  505. {
  506. if (needle.is_empty())
  507. return str.length();
  508. size_t count = 0;
  509. for (size_t i = 0; i < str.length() - needle.length() + 1; ++i) {
  510. if (str.substring_view(i).starts_with(needle))
  511. count++;
  512. }
  513. return count;
  514. }
  515. }
  516. }