UnicodeUtils.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /*
  2. * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Platform.h>
  7. #include <AK/String.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #include <LibUnicode/Segmentation.h>
  12. #include <LibUnicode/UnicodeUtils.h>
  13. #if ENABLE_UNICODE_DATA
  14. # include <LibUnicode/UnicodeData.h>
  15. #endif
  16. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  17. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
  18. namespace Unicode::Detail {
  19. #if ENABLE_UNICODE_DATA
  20. static bool is_after_uppercase_i(Utf8View const& string, size_t index)
  21. {
  22. // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
  23. auto preceding_view = string.substring_view(0, index);
  24. bool found_uppercase_i = false;
  25. // FIXME: Would be better if Utf8View supported reverse iteration.
  26. for (auto code_point : preceding_view) {
  27. if (code_point == 'I') {
  28. found_uppercase_i = true;
  29. continue;
  30. }
  31. auto combining_class = canonical_combining_class(code_point);
  32. if (combining_class == 0 || combining_class == 230)
  33. found_uppercase_i = false;
  34. }
  35. return found_uppercase_i;
  36. }
  37. static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
  38. {
  39. // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
  40. auto preceding_view = string.substring_view(0, index);
  41. bool found_soft_dotted_code_point = false;
  42. // FIXME: Would be better if Utf8View supported reverse iteration.
  43. for (auto code_point : preceding_view) {
  44. if (code_point_has_property(code_point, Property::Soft_Dotted)) {
  45. found_soft_dotted_code_point = true;
  46. continue;
  47. }
  48. auto combining_class = canonical_combining_class(code_point);
  49. if (combining_class == 0 || combining_class == 230)
  50. found_soft_dotted_code_point = false;
  51. }
  52. return found_soft_dotted_code_point;
  53. }
  54. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  55. {
  56. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  57. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  58. // characters and then a cased letter.
  59. auto preceding_view = string.substring_view(0, index);
  60. auto following_view = ((index + byte_length) < string.byte_length())
  61. ? string.substring_view(index + byte_length)
  62. : Utf8View {};
  63. size_t cased_letter_count = 0;
  64. for (auto code_point : preceding_view) {
  65. bool is_cased = code_point_has_property(code_point, Property::Cased);
  66. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  67. if (is_cased && !is_case_ignorable)
  68. ++cased_letter_count;
  69. else if (!is_case_ignorable)
  70. cased_letter_count = 0;
  71. }
  72. if (cased_letter_count == 0)
  73. return false;
  74. for (auto code_point : following_view) {
  75. bool is_cased = code_point_has_property(code_point, Property::Cased);
  76. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  77. if (is_case_ignorable)
  78. continue;
  79. if (is_cased)
  80. return false;
  81. break;
  82. }
  83. return true;
  84. }
  85. static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
  86. {
  87. // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
  88. auto following_view = ((index + byte_length) < string.byte_length())
  89. ? string.substring_view(index + byte_length)
  90. : Utf8View {};
  91. for (auto code_point : following_view) {
  92. u32 combining_class = canonical_combining_class(code_point);
  93. if (combining_class == 0)
  94. return false;
  95. if (combining_class == 230)
  96. return true;
  97. }
  98. return false;
  99. }
  100. static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
  101. {
  102. // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
  103. // intervene between the current character and the combining dot above.
  104. auto following_view = ((index + byte_length) < string.byte_length())
  105. ? string.substring_view(index + byte_length)
  106. : Utf8View {};
  107. for (auto code_point : following_view) {
  108. if (code_point == 0x307)
  109. return true;
  110. u32 combining_class = canonical_combining_class(code_point);
  111. if (combining_class == 0)
  112. return false;
  113. if (combining_class == 230)
  114. return false;
  115. }
  116. return false;
  117. }
  118. static Optional<SpecialCasing const&> find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
  119. {
  120. auto requested_locale = Locale::None;
  121. if (locale.has_value()) {
  122. if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
  123. requested_locale = *maybe_locale;
  124. }
  125. auto special_casings = special_case_mapping(code_point);
  126. for (auto const& special_casing : special_casings) {
  127. if (special_casing.locale != Locale::None && special_casing.locale != requested_locale)
  128. continue;
  129. switch (special_casing.condition) {
  130. case Condition::None:
  131. return special_casing;
  132. case Condition::AfterI:
  133. if (is_after_uppercase_i(string, index))
  134. return special_casing;
  135. break;
  136. case Condition::AfterSoftDotted:
  137. if (is_after_soft_dotted_code_point(string, index))
  138. return special_casing;
  139. break;
  140. case Condition::FinalSigma:
  141. if (is_final_code_point(string, index, byte_length))
  142. return special_casing;
  143. break;
  144. case Condition::MoreAbove:
  145. if (is_followed_by_combining_class_above(string, index, byte_length))
  146. return special_casing;
  147. break;
  148. case Condition::NotBeforeDot:
  149. if (!is_followed_by_combining_dot_above(string, index, byte_length))
  150. return special_casing;
  151. break;
  152. }
  153. }
  154. return {};
  155. }
  156. template<CaseFoldingStatus... StatusFilter>
  157. static Optional<CaseFolding const&> find_matching_case_folding(u32 code_point)
  158. {
  159. auto case_foldings = case_folding_mapping(code_point);
  160. for (auto const& case_folding : case_foldings) {
  161. if (((case_folding.status == StatusFilter) || ...))
  162. return case_folding;
  163. }
  164. return {};
  165. }
  166. #endif
  167. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
  168. ErrorOr<void> build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
  169. {
  170. #if ENABLE_UNICODE_DATA
  171. size_t index = 0;
  172. size_t byte_length = 0;
  173. for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
  174. u32 code_point = *it;
  175. byte_length = it.underlying_code_point_length_in_bytes();
  176. auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
  177. if (!special_casing.has_value()) {
  178. TRY(builder.try_append_code_point(to_unicode_lowercase(code_point)));
  179. continue;
  180. }
  181. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  182. TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i]));
  183. }
  184. return {};
  185. #else
  186. return Error::from_string_literal("Unicode data has been disabled");
  187. #endif
  188. }
  189. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
  190. ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
  191. {
  192. #if ENABLE_UNICODE_DATA
  193. size_t index = 0;
  194. size_t byte_length = 0;
  195. for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
  196. u32 code_point = *it;
  197. byte_length = it.underlying_code_point_length_in_bytes();
  198. auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
  199. if (!special_casing.has_value()) {
  200. TRY(builder.try_append_code_point(to_unicode_uppercase(code_point)));
  201. continue;
  202. }
  203. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  204. TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i]));
  205. }
  206. return {};
  207. #else
  208. return Error::from_string_literal("Unicode data has been disabled");
  209. #endif
  210. }
  211. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
  212. ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale, [[maybe_unused]] TrailingCodePointTransformation trailing_code_point_transformation)
  213. {
  214. #if ENABLE_UNICODE_DATA
  215. // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29,
  216. // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following
  217. // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between
  218. // F and the following word boundary to Lowercase_Mapping(C).
  219. auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> {
  220. auto it = code_points.iterator_at_byte_offset_without_validation(boundary);
  221. auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary);
  222. for (; it != end; ++it) {
  223. if (code_point_has_property(*it, Property::Cased))
  224. return it;
  225. }
  226. return {};
  227. };
  228. auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> {
  229. auto special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length);
  230. if (!special_casing.has_value()) {
  231. TRY(builder.try_append_code_point(to_unicode_titlecase(code_point)));
  232. return {};
  233. }
  234. for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i)
  235. TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i]));
  236. return {};
  237. };
  238. size_t boundary = 0;
  239. while (true) {
  240. auto next_boundary = next_word_segmentation_boundary(code_points, boundary);
  241. if (!next_boundary.has_value())
  242. break;
  243. if (auto it = first_cased_code_point_after_boundary(boundary, *next_boundary); it.has_value()) {
  244. auto code_point = *it.value();
  245. auto code_point_offset = code_points.byte_offset_of(*it);
  246. auto code_point_length = it->underlying_code_point_length_in_bytes();
  247. auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary);
  248. TRY(builder.try_append(caseless_code_points.as_string()));
  249. TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length));
  250. boundary = code_point_offset + code_point_length;
  251. }
  252. auto remaining_code_points = code_points.substring_view(boundary, *next_boundary - boundary);
  253. switch (trailing_code_point_transformation) {
  254. case TrailingCodePointTransformation::Lowercase:
  255. TRY(build_lowercase_string(remaining_code_points, builder, locale));
  256. break;
  257. case TrailingCodePointTransformation::PreserveExisting:
  258. TRY(builder.try_append(remaining_code_points.as_string()));
  259. break;
  260. }
  261. boundary = *next_boundary;
  262. }
  263. return {};
  264. #else
  265. return Error::from_string_literal("Unicode data has been disabled");
  266. #endif
  267. }
  268. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
  269. ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder)
  270. {
  271. // toCasefold(X): Map each character C in X to Case_Folding(C).
  272. for (auto code_point : code_points) {
  273. auto case_folding = casefold_code_point(code_point);
  274. TRY(builder.try_append(case_folding));
  275. }
  276. return {};
  277. }
  278. // https://www.unicode.org/reports/tr44/#CaseFolding.txt
  279. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
  280. Utf32View casefold_code_point(u32 const& code_point)
  281. {
  282. #if ENABLE_UNICODE_DATA
  283. // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file
  284. // CaseFolding.txt in the Unicode Character Database.
  285. using enum CaseFoldingStatus;
  286. if (auto case_folding = find_matching_case_folding<Common, Full>(code_point); case_folding.has_value())
  287. return Utf32View { case_folding->mapping, case_folding->mapping_size };
  288. #endif
  289. // The case foldings are omitted in the data file if they are the same as the code point itself.
  290. return Utf32View { &code_point, 1 };
  291. }
  292. }