StringUtils.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <awesomekling@gmail.com>
  3. * Copyright (c) 2020, Fei Wu <f.eiwu@yahoo.com>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/MemMem.h>
  9. #include <AK/Memory.h>
  10. #include <AK/Optional.h>
  11. #include <AK/String.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/StringUtils.h>
  14. #include <AK/StringView.h>
  15. #include <AK/Vector.h>
  16. namespace AK {
  17. namespace StringUtils {
  18. bool matches(StringView str, StringView mask, CaseSensitivity case_sensitivity, Vector<MaskSpan>* match_spans)
  19. {
  20. auto record_span = [&match_spans](size_t start, size_t length) {
  21. if (match_spans)
  22. match_spans->append({ start, length });
  23. };
  24. if (str.is_null() || mask.is_null())
  25. return str.is_null() && mask.is_null();
  26. if (mask == "*"sv) {
  27. record_span(0, str.length());
  28. return true;
  29. }
  30. const char* string_ptr = str.characters_without_null_termination();
  31. const char* string_start = str.characters_without_null_termination();
  32. const char* string_end = string_ptr + str.length();
  33. const char* mask_ptr = mask.characters_without_null_termination();
  34. const char* mask_end = mask_ptr + mask.length();
  35. while (string_ptr < string_end && mask_ptr < mask_end) {
  36. auto string_start_ptr = string_ptr;
  37. switch (*mask_ptr) {
  38. case '*':
  39. if (mask_ptr == mask_end - 1) {
  40. record_span(string_ptr - string_start, string_end - string_ptr);
  41. return true;
  42. }
  43. while (string_ptr < string_end && !matches({ string_ptr, static_cast<size_t>(string_end - string_ptr) }, { mask_ptr + 1, static_cast<size_t>(mask_end - mask_ptr - 1) }, case_sensitivity))
  44. ++string_ptr;
  45. record_span(string_start_ptr - string_start, string_ptr - string_start_ptr);
  46. --string_ptr;
  47. break;
  48. case '?':
  49. record_span(string_ptr - string_start, 1);
  50. break;
  51. default:
  52. auto p = *mask_ptr;
  53. auto ch = *string_ptr;
  54. if (case_sensitivity == CaseSensitivity::CaseSensitive ? p != ch : to_ascii_lowercase(p) != to_ascii_lowercase(ch))
  55. return false;
  56. break;
  57. }
  58. ++string_ptr;
  59. ++mask_ptr;
  60. }
  61. if (string_ptr == string_end) {
  62. // Allow ending '*' to contain nothing.
  63. while (mask_ptr != mask_end && *mask_ptr == '*') {
  64. record_span(string_ptr - string_start, 0);
  65. ++mask_ptr;
  66. }
  67. }
  68. return string_ptr == string_end && mask_ptr == mask_end;
  69. }
  70. template<typename T>
  71. Optional<T> convert_to_int(StringView str, TrimWhitespace trim_whitespace)
  72. {
  73. auto string = trim_whitespace == TrimWhitespace::Yes
  74. ? str.trim_whitespace()
  75. : str;
  76. if (string.is_empty())
  77. return {};
  78. T sign = 1;
  79. size_t i = 0;
  80. const auto characters = string.characters_without_null_termination();
  81. if (characters[0] == '-' || characters[0] == '+') {
  82. if (string.length() == 1)
  83. return {};
  84. i++;
  85. if (characters[0] == '-')
  86. sign = -1;
  87. }
  88. T value = 0;
  89. for (; i < string.length(); i++) {
  90. if (characters[i] < '0' || characters[i] > '9')
  91. return {};
  92. if (__builtin_mul_overflow(value, 10, &value))
  93. return {};
  94. if (__builtin_add_overflow(value, sign * (characters[i] - '0'), &value))
  95. return {};
  96. }
  97. return value;
  98. }
  99. template Optional<i8> convert_to_int(StringView str, TrimWhitespace);
  100. template Optional<i16> convert_to_int(StringView str, TrimWhitespace);
  101. template Optional<i32> convert_to_int(StringView str, TrimWhitespace);
  102. template Optional<long> convert_to_int(StringView str, TrimWhitespace);
  103. template Optional<long long> convert_to_int(StringView str, TrimWhitespace);
  104. template<typename T>
  105. Optional<T> convert_to_uint(StringView str, TrimWhitespace trim_whitespace)
  106. {
  107. auto string = trim_whitespace == TrimWhitespace::Yes
  108. ? str.trim_whitespace()
  109. : str;
  110. if (string.is_empty())
  111. return {};
  112. T value = 0;
  113. const auto characters = string.characters_without_null_termination();
  114. for (size_t i = 0; i < string.length(); i++) {
  115. if (characters[i] < '0' || characters[i] > '9')
  116. return {};
  117. if (__builtin_mul_overflow(value, 10, &value))
  118. return {};
  119. if (__builtin_add_overflow(value, characters[i] - '0', &value))
  120. return {};
  121. }
  122. return value;
  123. }
  124. template Optional<u8> convert_to_uint(StringView str, TrimWhitespace);
  125. template Optional<u16> convert_to_uint(StringView str, TrimWhitespace);
  126. template Optional<u32> convert_to_uint(StringView str, TrimWhitespace);
  127. template Optional<unsigned long> convert_to_uint(StringView str, TrimWhitespace);
  128. template Optional<unsigned long long> convert_to_uint(StringView str, TrimWhitespace);
  129. template Optional<long> convert_to_uint(StringView str, TrimWhitespace);
  130. template Optional<long long> convert_to_uint(StringView str, TrimWhitespace);
  131. template<typename T>
  132. Optional<T> convert_to_uint_from_hex(StringView str, TrimWhitespace trim_whitespace)
  133. {
  134. auto string = trim_whitespace == TrimWhitespace::Yes
  135. ? str.trim_whitespace()
  136. : str;
  137. if (string.is_empty())
  138. return {};
  139. T value = 0;
  140. const auto count = string.length();
  141. const T upper_bound = NumericLimits<T>::max();
  142. for (size_t i = 0; i < count; i++) {
  143. char digit = string[i];
  144. u8 digit_val;
  145. if (value > (upper_bound >> 4))
  146. return {};
  147. if (digit >= '0' && digit <= '9') {
  148. digit_val = digit - '0';
  149. } else if (digit >= 'a' && digit <= 'f') {
  150. digit_val = 10 + (digit - 'a');
  151. } else if (digit >= 'A' && digit <= 'F') {
  152. digit_val = 10 + (digit - 'A');
  153. } else {
  154. return {};
  155. }
  156. value = (value << 4) + digit_val;
  157. }
  158. return value;
  159. }
  160. template Optional<u8> convert_to_uint_from_hex(StringView str, TrimWhitespace);
  161. template Optional<u16> convert_to_uint_from_hex(StringView str, TrimWhitespace);
  162. template Optional<u32> convert_to_uint_from_hex(StringView str, TrimWhitespace);
  163. template Optional<u64> convert_to_uint_from_hex(StringView str, TrimWhitespace);
  164. template<typename T>
  165. Optional<T> convert_to_uint_from_octal(StringView str, TrimWhitespace trim_whitespace)
  166. {
  167. auto string = trim_whitespace == TrimWhitespace::Yes
  168. ? str.trim_whitespace()
  169. : str;
  170. if (string.is_empty())
  171. return {};
  172. T value = 0;
  173. const auto count = string.length();
  174. const T upper_bound = NumericLimits<T>::max();
  175. for (size_t i = 0; i < count; i++) {
  176. char digit = string[i];
  177. u8 digit_val;
  178. if (value > (upper_bound >> 3))
  179. return {};
  180. if (digit >= '0' && digit <= '7') {
  181. digit_val = digit - '0';
  182. } else {
  183. return {};
  184. }
  185. value = (value << 3) + digit_val;
  186. }
  187. return value;
  188. }
  189. template Optional<u8> convert_to_uint_from_octal(StringView str, TrimWhitespace);
  190. template Optional<u16> convert_to_uint_from_octal(StringView str, TrimWhitespace);
  191. template Optional<u32> convert_to_uint_from_octal(StringView str, TrimWhitespace);
  192. template Optional<u64> convert_to_uint_from_octal(StringView str, TrimWhitespace);
  193. bool equals_ignoring_case(StringView a, StringView b)
  194. {
  195. if (a.length() != b.length())
  196. return false;
  197. for (size_t i = 0; i < a.length(); ++i) {
  198. if (to_ascii_lowercase(a.characters_without_null_termination()[i]) != to_ascii_lowercase(b.characters_without_null_termination()[i]))
  199. return false;
  200. }
  201. return true;
  202. }
  203. bool ends_with(StringView str, StringView end, CaseSensitivity case_sensitivity)
  204. {
  205. if (end.is_empty())
  206. return true;
  207. if (str.is_empty())
  208. return false;
  209. if (end.length() > str.length())
  210. return false;
  211. if (case_sensitivity == CaseSensitivity::CaseSensitive)
  212. return !memcmp(str.characters_without_null_termination() + (str.length() - end.length()), end.characters_without_null_termination(), end.length());
  213. auto str_chars = str.characters_without_null_termination();
  214. auto end_chars = end.characters_without_null_termination();
  215. size_t si = str.length() - end.length();
  216. for (size_t ei = 0; ei < end.length(); ++si, ++ei) {
  217. if (to_ascii_lowercase(str_chars[si]) != to_ascii_lowercase(end_chars[ei]))
  218. return false;
  219. }
  220. return true;
  221. }
  222. bool starts_with(StringView str, StringView start, CaseSensitivity case_sensitivity)
  223. {
  224. if (start.is_empty())
  225. return true;
  226. if (str.is_empty())
  227. return false;
  228. if (start.length() > str.length())
  229. return false;
  230. if (str.characters_without_null_termination() == start.characters_without_null_termination())
  231. return true;
  232. if (case_sensitivity == CaseSensitivity::CaseSensitive)
  233. return !memcmp(str.characters_without_null_termination(), start.characters_without_null_termination(), start.length());
  234. auto str_chars = str.characters_without_null_termination();
  235. auto start_chars = start.characters_without_null_termination();
  236. size_t si = 0;
  237. for (size_t starti = 0; starti < start.length(); ++si, ++starti) {
  238. if (to_ascii_lowercase(str_chars[si]) != to_ascii_lowercase(start_chars[starti]))
  239. return false;
  240. }
  241. return true;
  242. }
  243. bool contains(StringView str, StringView needle, CaseSensitivity case_sensitivity)
  244. {
  245. if (str.is_null() || needle.is_null() || str.is_empty() || needle.length() > str.length())
  246. return false;
  247. if (needle.is_empty())
  248. return true;
  249. auto str_chars = str.characters_without_null_termination();
  250. auto needle_chars = needle.characters_without_null_termination();
  251. if (case_sensitivity == CaseSensitivity::CaseSensitive)
  252. return memmem(str_chars, str.length(), needle_chars, needle.length()) != nullptr;
  253. auto needle_first = to_ascii_lowercase(needle_chars[0]);
  254. for (size_t si = 0; si < str.length(); si++) {
  255. if (to_ascii_lowercase(str_chars[si]) != needle_first)
  256. continue;
  257. for (size_t ni = 0; si + ni < str.length(); ni++) {
  258. if (to_ascii_lowercase(str_chars[si + ni]) != to_ascii_lowercase(needle_chars[ni])) {
  259. si += ni;
  260. break;
  261. }
  262. if (ni + 1 == needle.length())
  263. return true;
  264. }
  265. }
  266. return false;
  267. }
  268. bool is_whitespace(StringView str)
  269. {
  270. return all_of(str, is_ascii_space);
  271. }
  272. StringView trim(StringView str, StringView characters, TrimMode mode)
  273. {
  274. size_t substring_start = 0;
  275. size_t substring_length = str.length();
  276. if (mode == TrimMode::Left || mode == TrimMode::Both) {
  277. for (size_t i = 0; i < str.length(); ++i) {
  278. if (substring_length == 0)
  279. return "";
  280. if (!characters.contains(str[i]))
  281. break;
  282. ++substring_start;
  283. --substring_length;
  284. }
  285. }
  286. if (mode == TrimMode::Right || mode == TrimMode::Both) {
  287. for (size_t i = str.length() - 1; i > 0; --i) {
  288. if (substring_length == 0)
  289. return "";
  290. if (!characters.contains(str[i]))
  291. break;
  292. --substring_length;
  293. }
  294. }
  295. return str.substring_view(substring_start, substring_length);
  296. }
  297. StringView trim_whitespace(StringView str, TrimMode mode)
  298. {
  299. return trim(str, " \n\t\v\f\r", mode);
  300. }
  301. Optional<size_t> find(StringView haystack, char needle, size_t start)
  302. {
  303. if (start >= haystack.length())
  304. return {};
  305. for (size_t i = start; i < haystack.length(); ++i) {
  306. if (haystack[i] == needle)
  307. return i;
  308. }
  309. return {};
  310. }
  311. Optional<size_t> find(StringView haystack, StringView needle, size_t start)
  312. {
  313. if (start > haystack.length())
  314. return {};
  315. auto index = AK::memmem_optional(
  316. haystack.characters_without_null_termination() + start, haystack.length() - start,
  317. needle.characters_without_null_termination(), needle.length());
  318. return index.has_value() ? (*index + start) : index;
  319. }
  320. Optional<size_t> find_last(StringView haystack, char needle)
  321. {
  322. for (size_t i = haystack.length(); i > 0; --i) {
  323. if (haystack[i - 1] == needle)
  324. return i - 1;
  325. }
  326. return {};
  327. }
  328. Vector<size_t> find_all(StringView haystack, StringView needle)
  329. {
  330. Vector<size_t> positions;
  331. size_t current_position = 0;
  332. while (current_position <= haystack.length()) {
  333. auto maybe_position = AK::memmem_optional(
  334. haystack.characters_without_null_termination() + current_position, haystack.length() - current_position,
  335. needle.characters_without_null_termination(), needle.length());
  336. if (!maybe_position.has_value())
  337. break;
  338. positions.append(current_position + *maybe_position);
  339. current_position += *maybe_position + 1;
  340. }
  341. return positions;
  342. }
  343. Optional<size_t> find_any_of(StringView haystack, StringView needles, SearchDirection direction)
  344. {
  345. if (haystack.is_empty() || needles.is_empty())
  346. return {};
  347. if (direction == SearchDirection::Forward) {
  348. for (size_t i = 0; i < haystack.length(); ++i) {
  349. if (needles.contains(haystack[i]))
  350. return i;
  351. }
  352. } else if (direction == SearchDirection::Backward) {
  353. for (size_t i = haystack.length(); i > 0; --i) {
  354. if (needles.contains(haystack[i - 1]))
  355. return i - 1;
  356. }
  357. }
  358. return {};
  359. }
  360. String to_snakecase(StringView str)
  361. {
  362. auto should_insert_underscore = [&](auto i, auto current_char) {
  363. if (i == 0)
  364. return false;
  365. auto previous_ch = str[i - 1];
  366. if (is_ascii_lower_alpha(previous_ch) && is_ascii_upper_alpha(current_char))
  367. return true;
  368. if (i >= str.length() - 1)
  369. return false;
  370. auto next_ch = str[i + 1];
  371. if (is_ascii_upper_alpha(current_char) && is_ascii_lower_alpha(next_ch))
  372. return true;
  373. return false;
  374. };
  375. StringBuilder builder;
  376. for (size_t i = 0; i < str.length(); ++i) {
  377. auto ch = str[i];
  378. if (should_insert_underscore(i, ch))
  379. builder.append('_');
  380. builder.append_as_lowercase(ch);
  381. }
  382. return builder.to_string();
  383. }
  384. String to_titlecase(StringView str)
  385. {
  386. StringBuilder builder;
  387. bool next_is_upper = true;
  388. for (auto ch : str) {
  389. if (next_is_upper)
  390. builder.append_code_point(to_ascii_uppercase(ch));
  391. else
  392. builder.append_code_point(to_ascii_lowercase(ch));
  393. next_is_upper = ch == ' ';
  394. }
  395. return builder.to_string();
  396. }
  397. String replace(StringView str, StringView needle, StringView replacement, bool all_occurrences)
  398. {
  399. if (str.is_empty())
  400. return str;
  401. Vector<size_t> positions;
  402. if (all_occurrences) {
  403. positions = str.find_all(needle);
  404. if (!positions.size())
  405. return str;
  406. } else {
  407. auto pos = str.find(needle);
  408. if (!pos.has_value())
  409. return str;
  410. positions.append(pos.value());
  411. }
  412. StringBuilder replaced_string;
  413. size_t last_position = 0;
  414. for (auto& position : positions) {
  415. replaced_string.append(str.substring_view(last_position, position - last_position));
  416. replaced_string.append(replacement);
  417. last_position = position + needle.length();
  418. }
  419. replaced_string.append(str.substring_view(last_position, str.length() - last_position));
  420. return replaced_string.build();
  421. }
  422. // TODO: Benchmark against KMP (AK/MemMem.h) and switch over if it's faster for short strings too
  423. size_t count(StringView str, StringView needle)
  424. {
  425. if (needle.is_empty())
  426. return str.length();
  427. size_t count = 0;
  428. for (size_t i = 0; i < str.length() - needle.length() + 1; ++i) {
  429. if (str.substring_view(i).starts_with(needle))
  430. count++;
  431. }
  432. return count;
  433. }
  434. }
  435. }