Strings.cpp 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. /*
  2. * Copyright (c) 2022-2023, Linus Groh <linusg@serenityos.org>
  3. * Copyright (c) 2022, networkException <networkexception@serenityos.org>
  4. * Copyright (c) 2023, Kenneth Myhra <kennethmyhra@serenityos.org>
  5. * Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org>
  6. * Copyright (c) 2024, Andreas Kling <andreas@ladybird.org>
  7. *
  8. * SPDX-License-Identifier: BSD-2-Clause
  9. */
  10. #include <AK/CharacterTypes.h>
  11. #include <AK/FlyString.h>
  12. #include <AK/GenericLexer.h>
  13. #include <AK/String.h>
  14. #include <AK/Utf16View.h>
  15. #include <AK/Utf8View.h>
  16. #include <LibWeb/Infra/CharacterTypes.h>
  17. #include <LibWeb/Infra/Strings.h>
  18. namespace Web::Infra {
  19. // https://infra.spec.whatwg.org/#ascii-case-insensitive
  20. bool is_ascii_case_insensitive_match(StringView a, StringView b)
  21. {
  22. // A string A is an ASCII case-insensitive match for a string B,
  23. // if the ASCII lowercase of A is the ASCII lowercase of B.
  24. return AK::StringUtils::equals_ignoring_ascii_case(a, b);
  25. }
  26. // https://infra.spec.whatwg.org/#normalize-newlines
  27. String normalize_newlines(String const& string)
  28. {
  29. // To normalize newlines in a string, replace every U+000D CR U+000A LF code point pair with a single U+000A LF
  30. // code point, and then replace every remaining U+000D CR code point with a U+000A LF code point.
  31. if (!string.contains('\r'))
  32. return string;
  33. StringBuilder builder;
  34. GenericLexer lexer { string };
  35. while (!lexer.is_eof()) {
  36. builder.append(lexer.consume_until('\r'));
  37. if (lexer.peek() == '\r') {
  38. lexer.ignore(1 + static_cast<size_t>(lexer.peek(1) == '\n'));
  39. builder.append('\n');
  40. }
  41. }
  42. return MUST(builder.to_string());
  43. }
  44. // https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace
  45. ErrorOr<String> strip_and_collapse_whitespace(StringView string)
  46. {
  47. // Replace any sequence of one or more consecutive code points that are ASCII whitespace in the string with a single U+0020 SPACE code point.
  48. StringBuilder builder;
  49. for (auto code_point : Utf8View { string }) {
  50. if (Infra::is_ascii_whitespace(code_point)) {
  51. if (!builder.string_view().ends_with(' '))
  52. builder.append(' ');
  53. continue;
  54. }
  55. TRY(builder.try_append_code_point(code_point));
  56. }
  57. // ...and then remove any leading and trailing ASCII whitespace from that string.
  58. return String::from_utf8(builder.string_view().trim(Infra::ASCII_WHITESPACE));
  59. }
  60. // https://infra.spec.whatwg.org/#code-unit-prefix
  61. bool is_code_unit_prefix(StringView potential_prefix, StringView input)
  62. {
  63. auto potential_prefix_utf16 = utf8_to_utf16(potential_prefix).release_value_but_fixme_should_propagate_errors();
  64. auto input_utf16 = utf8_to_utf16(input).release_value_but_fixme_should_propagate_errors();
  65. // 1. Let i be 0.
  66. size_t i = 0;
  67. // 2. While true:
  68. while (true) {
  69. // 1. If i is greater than or equal to potentialPrefix’s length, then return true.
  70. if (i >= potential_prefix.length())
  71. return true;
  72. // 2. If i is greater than or equal to input’s length, then return false.
  73. if (i >= input.length())
  74. return false;
  75. // 3. Let potentialPrefixCodeUnit be the ith code unit of potentialPrefix.
  76. auto potential_prefix_code_unit = Utf16View(potential_prefix_utf16).code_unit_at(i);
  77. // 4. Let inputCodeUnit be the ith code unit of input.
  78. auto input_code_unit = Utf16View(input_utf16).code_unit_at(i);
  79. // 5. Return false if potentialPrefixCodeUnit is not inputCodeUnit.
  80. if (potential_prefix_code_unit != input_code_unit)
  81. return false;
  82. // 6. Set i to i + 1.
  83. ++i;
  84. }
  85. }
  86. // https://infra.spec.whatwg.org/#scalar-value-string
  87. ErrorOr<String> convert_to_scalar_value_string(StringView string)
  88. {
  89. // To convert a string into a scalar value string, replace any surrogates with U+FFFD.
  90. StringBuilder scalar_value_builder;
  91. auto utf8_view = Utf8View { string };
  92. for (u32 code_point : utf8_view) {
  93. if (is_unicode_surrogate(code_point))
  94. code_point = 0xFFFD;
  95. scalar_value_builder.append_code_point(code_point);
  96. }
  97. return scalar_value_builder.to_string();
  98. }
  99. // https://infra.spec.whatwg.org/#ascii-lowercase
  100. ErrorOr<String> to_ascii_lowercase(StringView string)
  101. {
  102. // To ASCII lowercase a string, replace all ASCII upper alphas in the string with their
  103. // corresponding code point in ASCII lower alpha.
  104. StringBuilder string_builder;
  105. auto utf8_view = Utf8View { string };
  106. for (u32 code_point : utf8_view) {
  107. code_point = AK::to_ascii_lowercase(code_point);
  108. string_builder.append_code_point(code_point);
  109. }
  110. return string_builder.to_string();
  111. }
  112. // https://infra.spec.whatwg.org/#ascii-uppercase
  113. ErrorOr<String> to_ascii_uppercase(StringView string)
  114. {
  115. // To ASCII uppercase a string, replace all ASCII lower alphas in the string with their
  116. // corresponding code point in ASCII upper alpha.
  117. StringBuilder string_builder;
  118. auto utf8_view = Utf8View { string };
  119. for (u32 code_point : utf8_view) {
  120. code_point = AK::to_ascii_uppercase(code_point);
  121. string_builder.append_code_point(code_point);
  122. }
  123. return string_builder.to_string();
  124. }
  125. }