UnicodeUtils.cpp 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. /*
  2. * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Platform.h>
  7. #include <AK/String.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #include <LibUnicode/UnicodeUtils.h>
  12. #if ENABLE_UNICODE_DATA
  13. # include <LibUnicode/UnicodeData.h>
  14. #endif
  15. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  16. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  17. namespace Unicode::Detail {
  18. #if ENABLE_UNICODE_DATA
  19. static bool is_after_uppercase_i(Utf8View const& string, size_t index)
  20. {
  21. // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
  22. auto preceding_view = string.substring_view(0, index);
  23. bool found_uppercase_i = false;
  24. // FIXME: Would be better if Utf8View supported reverse iteration.
  25. for (auto code_point : preceding_view) {
  26. if (code_point == 'I') {
  27. found_uppercase_i = true;
  28. continue;
  29. }
  30. auto combining_class = canonical_combining_class(code_point);
  31. if (combining_class == 0 || combining_class == 230)
  32. found_uppercase_i = false;
  33. }
  34. return found_uppercase_i;
  35. }
  36. static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
  37. {
  38. // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
  39. auto preceding_view = string.substring_view(0, index);
  40. bool found_soft_dotted_code_point = false;
  41. // FIXME: Would be better if Utf8View supported reverse iteration.
  42. for (auto code_point : preceding_view) {
  43. if (code_point_has_property(code_point, Property::Soft_Dotted)) {
  44. found_soft_dotted_code_point = true;
  45. continue;
  46. }
  47. auto combining_class = canonical_combining_class(code_point);
  48. if (combining_class == 0 || combining_class == 230)
  49. found_soft_dotted_code_point = false;
  50. }
  51. return found_soft_dotted_code_point;
  52. }
  53. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  54. {
  55. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  56. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  57. // characters and then a cased letter.
  58. auto preceding_view = string.substring_view(0, index);
  59. auto following_view = ((index + byte_length) < string.byte_length())
  60. ? string.substring_view(index + byte_length)
  61. : Utf8View {};
  62. size_t cased_letter_count = 0;
  63. for (auto code_point : preceding_view) {
  64. bool is_cased = code_point_has_property(code_point, Property::Cased);
  65. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  66. if (is_cased && !is_case_ignorable)
  67. ++cased_letter_count;
  68. else if (!is_case_ignorable)
  69. cased_letter_count = 0;
  70. }
  71. if (cased_letter_count == 0)
  72. return false;
  73. for (auto code_point : following_view) {
  74. bool is_cased = code_point_has_property(code_point, Property::Cased);
  75. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  76. if (is_case_ignorable)
  77. continue;
  78. if (is_cased)
  79. return false;
  80. break;
  81. }
  82. return true;
  83. }
  84. static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
  85. {
  86. // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
  87. auto following_view = ((index + byte_length) < string.byte_length())
  88. ? string.substring_view(index + byte_length)
  89. : Utf8View {};
  90. for (auto code_point : following_view) {
  91. u32 combining_class = canonical_combining_class(code_point);
  92. if (combining_class == 0)
  93. return false;
  94. if (combining_class == 230)
  95. return true;
  96. }
  97. return false;
  98. }
  99. static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
  100. {
  101. // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
  102. // intervene between the current character and the combining dot above.
  103. auto following_view = ((index + byte_length) < string.byte_length())
  104. ? string.substring_view(index + byte_length)
  105. : Utf8View {};
  106. for (auto code_point : following_view) {
  107. if (code_point == 0x307)
  108. return true;
  109. u32 combining_class = canonical_combining_class(code_point);
  110. if (combining_class == 0)
  111. return false;
  112. if (combining_class == 230)
  113. return false;
  114. }
  115. return false;
  116. }
  117. static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
  118. {
  119. auto requested_locale = Locale::None;
  120. if (locale.has_value()) {
  121. if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
  122. requested_locale = *maybe_locale;
  123. }
  124. auto special_casings = special_case_mapping(code_point);
  125. for (auto const* special_casing : special_casings) {
  126. if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
  127. continue;
  128. switch (special_casing->condition) {
  129. case Condition::None:
  130. return special_casing;
  131. case Condition::AfterI:
  132. if (is_after_uppercase_i(string, index))
  133. return special_casing;
  134. break;
  135. case Condition::AfterSoftDotted:
  136. if (is_after_soft_dotted_code_point(string, index))
  137. return special_casing;
  138. break;
  139. case Condition::FinalSigma:
  140. if (is_final_code_point(string, index, byte_length))
  141. return special_casing;
  142. break;
  143. case Condition::MoreAbove:
  144. if (is_followed_by_combining_class_above(string, index, byte_length))
  145. return special_casing;
  146. break;
  147. case Condition::NotBeforeDot:
  148. if (!is_followed_by_combining_dot_above(string, index, byte_length))
  149. return special_casing;
  150. break;
  151. }
  152. }
  153. return nullptr;
  154. }
  155. #endif
  156. ErrorOr<void> build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
  157. {
  158. #if ENABLE_UNICODE_DATA
  159. size_t index = 0;
  160. size_t byte_length = 0;
  161. for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
  162. u32 code_point = *it;
  163. byte_length = it.underlying_code_point_length_in_bytes();
  164. auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
  165. if (!special_casing) {
  166. TRY(builder.try_append_code_point(to_unicode_lowercase(code_point)));
  167. continue;
  168. }
  169. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  170. TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i]));
  171. }
  172. return {};
  173. #else
  174. return Error::from_string_literal("Unicode data has been disabled");
  175. #endif
  176. }
  177. ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
  178. {
  179. #if ENABLE_UNICODE_DATA
  180. size_t index = 0;
  181. size_t byte_length = 0;
  182. for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
  183. u32 code_point = *it;
  184. byte_length = it.underlying_code_point_length_in_bytes();
  185. auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
  186. if (!special_casing) {
  187. TRY(builder.try_append_code_point(to_unicode_uppercase(code_point)));
  188. continue;
  189. }
  190. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  191. TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i]));
  192. }
  193. return {};
  194. #else
  195. return Error::from_string_literal("Unicode data has been disabled");
  196. #endif
  197. }
  198. }