CharacterTypes.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf16View.h>
  11. #include <AK/Utf8View.h>
  12. #include <LibUnicode/CharacterTypes.h>
  13. #include <LibUnicode/Locale.h>
  14. #if ENABLE_UNICODE_DATA
  15. # include <LibUnicode/UnicodeData.h>
  16. #endif
  17. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  18. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  19. namespace Unicode {
  20. Optional<String> __attribute__((weak)) code_point_display_name(u32) { return {}; }
  21. Optional<StringView> __attribute__((weak)) code_point_abbreviation(u32) { return {}; }
  22. u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
  23. Span<SpecialCasing const* const> __attribute__((weak)) special_case_mapping(u32) { return {}; }
  24. #if ENABLE_UNICODE_DATA
  25. static bool is_after_uppercase_i(Utf8View const& string, size_t index)
  26. {
  27. // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
  28. auto preceding_view = string.substring_view(0, index);
  29. bool found_uppercase_i = false;
  30. // FIXME: Would be better if Utf8View supported reverse iteration.
  31. for (auto code_point : preceding_view) {
  32. if (code_point == 'I') {
  33. found_uppercase_i = true;
  34. continue;
  35. }
  36. u32 combining_class = canonical_combining_class(code_point);
  37. if (combining_class == 0)
  38. found_uppercase_i = false;
  39. else if (combining_class == 230)
  40. found_uppercase_i = false;
  41. }
  42. return found_uppercase_i;
  43. }
  44. static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
  45. {
  46. // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
  47. auto preceding_view = string.substring_view(0, index);
  48. bool found_soft_dotted_code_point = false;
  49. // FIXME: Would be better if Utf8View supported reverse iteration.
  50. for (auto code_point : preceding_view) {
  51. if (code_point_has_property(code_point, Property::Soft_Dotted)) {
  52. found_soft_dotted_code_point = true;
  53. continue;
  54. }
  55. u32 combining_class = canonical_combining_class(code_point);
  56. if (combining_class == 0)
  57. found_soft_dotted_code_point = false;
  58. else if (combining_class == 230)
  59. found_soft_dotted_code_point = false;
  60. }
  61. return found_soft_dotted_code_point;
  62. }
  63. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  64. {
  65. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  66. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  67. // characters and then a cased letter.
  68. auto preceding_view = string.substring_view(0, index);
  69. auto following_view = ((index + byte_length) < string.byte_length())
  70. ? string.substring_view(index + byte_length)
  71. : Utf8View {};
  72. size_t cased_letter_count = 0;
  73. for (auto code_point : preceding_view) {
  74. bool is_cased = code_point_has_property(code_point, Property::Cased);
  75. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  76. if (is_cased && !is_case_ignorable)
  77. ++cased_letter_count;
  78. else if (!is_case_ignorable)
  79. cased_letter_count = 0;
  80. }
  81. if (cased_letter_count == 0)
  82. return false;
  83. for (auto code_point : following_view) {
  84. bool is_cased = code_point_has_property(code_point, Property::Cased);
  85. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  86. if (is_case_ignorable)
  87. continue;
  88. if (is_cased)
  89. return false;
  90. break;
  91. }
  92. return true;
  93. }
  94. static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
  95. {
  96. // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
  97. auto following_view = ((index + byte_length) < string.byte_length())
  98. ? string.substring_view(index + byte_length)
  99. : Utf8View {};
  100. for (auto code_point : following_view) {
  101. u32 combining_class = canonical_combining_class(code_point);
  102. if (combining_class == 0)
  103. return false;
  104. if (combining_class == 230)
  105. return true;
  106. }
  107. return false;
  108. }
  109. static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
  110. {
  111. // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
  112. // intervene between the current character and the combining dot above.
  113. auto following_view = ((index + byte_length) < string.byte_length())
  114. ? string.substring_view(index + byte_length)
  115. : Utf8View {};
  116. for (auto code_point : following_view) {
  117. if (code_point == 0x307)
  118. return true;
  119. u32 combining_class = canonical_combining_class(code_point);
  120. if (combining_class == 0)
  121. return false;
  122. if (combining_class == 230)
  123. return false;
  124. }
  125. return false;
  126. }
  127. static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
  128. {
  129. auto requested_locale = Locale::None;
  130. if (locale.has_value()) {
  131. if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
  132. requested_locale = *maybe_locale;
  133. }
  134. auto special_casings = special_case_mapping(code_point);
  135. for (auto const* special_casing : special_casings) {
  136. if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
  137. continue;
  138. switch (special_casing->condition) {
  139. case Condition::None:
  140. return special_casing;
  141. case Condition::AfterI:
  142. if (is_after_uppercase_i(string, index))
  143. return special_casing;
  144. break;
  145. case Condition::AfterSoftDotted:
  146. if (is_after_soft_dotted_code_point(string, index))
  147. return special_casing;
  148. break;
  149. case Condition::FinalSigma:
  150. if (is_final_code_point(string, index, byte_length))
  151. return special_casing;
  152. break;
  153. case Condition::MoreAbove:
  154. if (is_followed_by_combining_class_above(string, index, byte_length))
  155. return special_casing;
  156. break;
  157. case Condition::NotBeforeDot:
  158. if (!is_followed_by_combining_dot_above(string, index, byte_length))
  159. return special_casing;
  160. break;
  161. }
  162. }
  163. return nullptr;
  164. }
  165. #endif
  166. u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point)
  167. {
  168. return to_ascii_lowercase(code_point);
  169. }
  170. u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point)
  171. {
  172. return to_ascii_uppercase(code_point);
  173. }
  174. String to_unicode_lowercase_full(StringView string, [[maybe_unused]] Optional<StringView> locale)
  175. {
  176. #if ENABLE_UNICODE_DATA
  177. Utf8View view { string };
  178. StringBuilder builder;
  179. size_t index = 0;
  180. size_t byte_length = 0;
  181. for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
  182. u32 code_point = *it;
  183. byte_length = it.underlying_code_point_length_in_bytes();
  184. auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length);
  185. if (!special_casing) {
  186. builder.append_code_point(to_unicode_lowercase(code_point));
  187. continue;
  188. }
  189. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  190. builder.append_code_point(special_casing->lowercase_mapping[i]);
  191. }
  192. return builder.build();
  193. #else
  194. return string.to_lowercase_string();
  195. #endif
  196. }
  197. String to_unicode_uppercase_full(StringView string, [[maybe_unused]] Optional<StringView> locale)
  198. {
  199. #if ENABLE_UNICODE_DATA
  200. Utf8View view { string };
  201. StringBuilder builder;
  202. size_t index = 0;
  203. size_t byte_length = 0;
  204. for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
  205. u32 code_point = *it;
  206. byte_length = it.underlying_code_point_length_in_bytes();
  207. auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length);
  208. if (!special_casing) {
  209. builder.append_code_point(to_unicode_uppercase(code_point));
  210. continue;
  211. }
  212. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  213. builder.append_code_point(special_casing->uppercase_mapping[i]);
  214. }
  215. return builder.build();
  216. #else
  217. return string.to_uppercase_string();
  218. #endif
  219. }
  220. Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
  221. bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
  222. Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
  223. bool __attribute__((weak)) code_point_has_property(u32, Property) { return {}; }
  224. bool is_ecma262_property([[maybe_unused]] Property property)
  225. {
  226. #if ENABLE_UNICODE_DATA
  227. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  228. switch (property) {
  229. case Unicode::Property::ASCII:
  230. case Unicode::Property::ASCII_Hex_Digit:
  231. case Unicode::Property::Alphabetic:
  232. case Unicode::Property::Any:
  233. case Unicode::Property::Assigned:
  234. case Unicode::Property::Bidi_Control:
  235. case Unicode::Property::Bidi_Mirrored:
  236. case Unicode::Property::Case_Ignorable:
  237. case Unicode::Property::Cased:
  238. case Unicode::Property::Changes_When_Casefolded:
  239. case Unicode::Property::Changes_When_Casemapped:
  240. case Unicode::Property::Changes_When_Lowercased:
  241. case Unicode::Property::Changes_When_NFKC_Casefolded:
  242. case Unicode::Property::Changes_When_Titlecased:
  243. case Unicode::Property::Changes_When_Uppercased:
  244. case Unicode::Property::Dash:
  245. case Unicode::Property::Default_Ignorable_Code_Point:
  246. case Unicode::Property::Deprecated:
  247. case Unicode::Property::Diacritic:
  248. case Unicode::Property::Emoji:
  249. case Unicode::Property::Emoji_Component:
  250. case Unicode::Property::Emoji_Modifier:
  251. case Unicode::Property::Emoji_Modifier_Base:
  252. case Unicode::Property::Emoji_Presentation:
  253. case Unicode::Property::Extended_Pictographic:
  254. case Unicode::Property::Extender:
  255. case Unicode::Property::Grapheme_Base:
  256. case Unicode::Property::Grapheme_Extend:
  257. case Unicode::Property::Hex_Digit:
  258. case Unicode::Property::IDS_Binary_Operator:
  259. case Unicode::Property::IDS_Trinary_Operator:
  260. case Unicode::Property::ID_Continue:
  261. case Unicode::Property::ID_Start:
  262. case Unicode::Property::Ideographic:
  263. case Unicode::Property::Join_Control:
  264. case Unicode::Property::Logical_Order_Exception:
  265. case Unicode::Property::Lowercase:
  266. case Unicode::Property::Math:
  267. case Unicode::Property::Noncharacter_Code_Point:
  268. case Unicode::Property::Pattern_Syntax:
  269. case Unicode::Property::Pattern_White_Space:
  270. case Unicode::Property::Quotation_Mark:
  271. case Unicode::Property::Radical:
  272. case Unicode::Property::Regional_Indicator:
  273. case Unicode::Property::Sentence_Terminal:
  274. case Unicode::Property::Soft_Dotted:
  275. case Unicode::Property::Terminal_Punctuation:
  276. case Unicode::Property::Unified_Ideograph:
  277. case Unicode::Property::Uppercase:
  278. case Unicode::Property::Variation_Selector:
  279. case Unicode::Property::White_Space:
  280. case Unicode::Property::XID_Continue:
  281. case Unicode::Property::XID_Start:
  282. return true;
  283. default:
  284. return false;
  285. }
  286. #else
  287. return false;
  288. #endif
  289. }
  290. Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {}; }
  291. bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
  292. bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
  293. bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; }
  294. bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
  295. bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
  296. Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  297. {
  298. #if ENABLE_UNICODE_DATA
  299. using GBP = GraphemeBreakProperty;
  300. Vector<size_t> boundaries;
  301. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  302. if (view.length_in_code_points() == 0)
  303. return boundaries;
  304. auto has_any_gbp = [](u32 code_point, auto&&... properties) {
  305. return (code_point_has_grapheme_break_property(code_point, properties) || ...);
  306. };
  307. // GB1
  308. boundaries.append(0);
  309. if (view.length_in_code_points() > 1) {
  310. auto it = view.begin();
  311. auto code_point = *it;
  312. u32 next_code_point;
  313. auto current_ri_chain = 0;
  314. auto in_emoji_sequence = false;
  315. for (++it; it != view.end(); ++it, code_point = next_code_point) {
  316. next_code_point = *it;
  317. auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
  318. auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
  319. // GB3
  320. if (code_point_is_cr && next_code_point_is_lf)
  321. continue;
  322. // GB4, GB5
  323. if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
  324. boundaries.append(view.code_unit_offset_of(it));
  325. continue;
  326. }
  327. auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
  328. auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
  329. // GB6
  330. if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
  331. continue;
  332. // GB7
  333. if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
  334. continue;
  335. // GB8
  336. if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
  337. continue;
  338. auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
  339. if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic))
  340. in_emoji_sequence = true;
  341. else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj)
  342. in_emoji_sequence = false;
  343. // GB9
  344. if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
  345. continue;
  346. // GB9a
  347. if (has_any_gbp(next_code_point, GBP::SpacingMark))
  348. continue;
  349. // GB9b
  350. if (has_any_gbp(code_point, GBP::Prepend))
  351. continue;
  352. // GB11
  353. if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  354. continue;
  355. auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
  356. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  357. // GB12, GB13
  358. if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  359. continue;
  360. // GB999
  361. boundaries.append(view.code_unit_offset_of(it));
  362. }
  363. }
  364. // GB2
  365. boundaries.append(view.length_in_code_units());
  366. return boundaries;
  367. #else
  368. return {};
  369. #endif
  370. }
  371. }