CharacterTypes.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf8View.h>
  11. #include <LibUnicode/CharacterTypes.h>
  12. #include <LibUnicode/Locale.h>
  13. #if ENABLE_UNICODE_DATA
  14. # include <LibUnicode/UnicodeData.h>
  15. #endif
  16. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  17. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  18. namespace Unicode {
  19. #if ENABLE_UNICODE_DATA
  20. static bool is_after_uppercase_i(Utf8View const& string, size_t index)
  21. {
  22. // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
  23. auto preceding_view = string.substring_view(0, index);
  24. bool found_uppercase_i = false;
  25. // FIXME: Would be better if Utf8View supported reverse iteration.
  26. for (auto code_point : preceding_view) {
  27. if (code_point == 'I') {
  28. found_uppercase_i = true;
  29. continue;
  30. }
  31. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  32. if (!unicode_data.has_value())
  33. return false;
  34. if (unicode_data->canonical_combining_class == 0)
  35. found_uppercase_i = false;
  36. else if (unicode_data->canonical_combining_class == 230)
  37. found_uppercase_i = false;
  38. }
  39. return found_uppercase_i;
  40. }
  41. static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
  42. {
  43. // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
  44. auto preceding_view = string.substring_view(0, index);
  45. bool found_soft_dotted_code_point = false;
  46. // FIXME: Would be better if Utf8View supported reverse iteration.
  47. for (auto code_point : preceding_view) {
  48. if (code_point_has_property(code_point, Property::Soft_Dotted)) {
  49. found_soft_dotted_code_point = true;
  50. continue;
  51. }
  52. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  53. if (!unicode_data.has_value())
  54. return false;
  55. if (unicode_data->canonical_combining_class == 0)
  56. found_soft_dotted_code_point = false;
  57. else if (unicode_data->canonical_combining_class == 230)
  58. found_soft_dotted_code_point = false;
  59. }
  60. return found_soft_dotted_code_point;
  61. }
  62. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  63. {
  64. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  65. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  66. // characters and then a cased letter.
  67. auto preceding_view = string.substring_view(0, index);
  68. auto following_view = ((index + byte_length) < string.byte_length())
  69. ? string.substring_view(index + byte_length)
  70. : Utf8View {};
  71. size_t cased_letter_count = 0;
  72. for (auto code_point : preceding_view) {
  73. bool is_cased = code_point_has_property(code_point, Property::Cased);
  74. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  75. if (is_cased && !is_case_ignorable)
  76. ++cased_letter_count;
  77. else if (!is_case_ignorable)
  78. cased_letter_count = 0;
  79. }
  80. if (cased_letter_count == 0)
  81. return false;
  82. for (auto code_point : following_view) {
  83. bool is_cased = code_point_has_property(code_point, Property::Cased);
  84. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  85. if (is_case_ignorable)
  86. continue;
  87. if (is_cased)
  88. return false;
  89. break;
  90. }
  91. return true;
  92. }
  93. static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
  94. {
  95. // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
  96. auto following_view = ((index + byte_length) < string.byte_length())
  97. ? string.substring_view(index + byte_length)
  98. : Utf8View {};
  99. for (auto code_point : following_view) {
  100. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  101. if (!unicode_data.has_value())
  102. return false;
  103. if (unicode_data->canonical_combining_class == 0)
  104. return false;
  105. if (unicode_data->canonical_combining_class == 230)
  106. return true;
  107. }
  108. return false;
  109. }
  110. static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
  111. {
  112. // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
  113. // intervene between the current character and the combining dot above.
  114. auto following_view = ((index + byte_length) < string.byte_length())
  115. ? string.substring_view(index + byte_length)
  116. : Utf8View {};
  117. for (auto code_point : following_view) {
  118. if (code_point == 0x307)
  119. return true;
  120. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  121. if (!unicode_data.has_value())
  122. return false;
  123. if (unicode_data->canonical_combining_class == 0)
  124. return false;
  125. if (unicode_data->canonical_combining_class == 230)
  126. return false;
  127. }
  128. return false;
  129. }
  130. static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length, UnicodeData const& unicode_data)
  131. {
  132. auto requested_locale = Locale::None;
  133. if (locale.has_value()) {
  134. if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
  135. requested_locale = *maybe_locale;
  136. }
  137. for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
  138. auto const* special_casing = unicode_data.special_casing[i];
  139. if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
  140. continue;
  141. switch (special_casing->condition) {
  142. case Condition::None:
  143. return special_casing;
  144. case Condition::AfterI:
  145. if (is_after_uppercase_i(string, index))
  146. return special_casing;
  147. break;
  148. case Condition::AfterSoftDotted:
  149. if (is_after_soft_dotted_code_point(string, index))
  150. return special_casing;
  151. break;
  152. case Condition::FinalSigma:
  153. if (is_final_code_point(string, index, byte_length))
  154. return special_casing;
  155. break;
  156. case Condition::MoreAbove:
  157. if (is_followed_by_combining_class_above(string, index, byte_length))
  158. return special_casing;
  159. break;
  160. case Condition::NotBeforeDot:
  161. if (!is_followed_by_combining_dot_above(string, index, byte_length))
  162. return special_casing;
  163. break;
  164. }
  165. }
  166. return nullptr;
  167. }
  168. #endif
  169. u32 to_unicode_lowercase(u32 code_point)
  170. {
  171. #if ENABLE_UNICODE_DATA
  172. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  173. if (unicode_data.has_value())
  174. return unicode_data->simple_lowercase_mapping;
  175. return code_point;
  176. #else
  177. return AK::to_ascii_lowercase(code_point);
  178. #endif
  179. }
  180. u32 to_unicode_uppercase(u32 code_point)
  181. {
  182. #if ENABLE_UNICODE_DATA
  183. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  184. if (unicode_data.has_value())
  185. return unicode_data->simple_uppercase_mapping;
  186. return code_point;
  187. #else
  188. return AK::to_ascii_uppercase(code_point);
  189. #endif
  190. }
  191. String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
  192. {
  193. #if ENABLE_UNICODE_DATA
  194. Utf8View view { string };
  195. StringBuilder builder;
  196. size_t index = 0;
  197. size_t byte_length = 0;
  198. for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
  199. u32 code_point = *it;
  200. byte_length = it.underlying_code_point_length_in_bytes();
  201. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  202. if (!unicode_data.has_value()) {
  203. builder.append_code_point(code_point);
  204. continue;
  205. }
  206. auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
  207. if (!special_casing) {
  208. builder.append_code_point(unicode_data->simple_lowercase_mapping);
  209. continue;
  210. }
  211. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  212. builder.append_code_point(special_casing->lowercase_mapping[i]);
  213. }
  214. return builder.build();
  215. #else
  216. return string.to_lowercase_string();
  217. #endif
  218. }
  219. String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
  220. {
  221. #if ENABLE_UNICODE_DATA
  222. Utf8View view { string };
  223. StringBuilder builder;
  224. size_t index = 0;
  225. size_t byte_length = 0;
  226. for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
  227. u32 code_point = *it;
  228. byte_length = it.underlying_code_point_length_in_bytes();
  229. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  230. if (!unicode_data.has_value()) {
  231. builder.append_code_point(code_point);
  232. continue;
  233. }
  234. auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
  235. if (!special_casing) {
  236. builder.append_code_point(unicode_data->simple_uppercase_mapping);
  237. continue;
  238. }
  239. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  240. builder.append_code_point(special_casing->uppercase_mapping[i]);
  241. }
  242. return builder.build();
  243. #else
  244. return string.to_uppercase_string();
  245. #endif
  246. }
  247. Optional<GeneralCategory> general_category_from_string([[maybe_unused]] StringView const& general_category)
  248. {
  249. #if ENABLE_UNICODE_DATA
  250. return Detail::general_category_from_string(general_category);
  251. #else
  252. return {};
  253. #endif
  254. }
  255. bool code_point_has_general_category([[maybe_unused]] u32 code_point, [[maybe_unused]] GeneralCategory general_category)
  256. {
  257. #if ENABLE_UNICODE_DATA
  258. return Detail::code_point_has_general_category(code_point, general_category);
  259. #else
  260. return {};
  261. #endif
  262. }
  263. Optional<Property> property_from_string([[maybe_unused]] StringView const& property)
  264. {
  265. #if ENABLE_UNICODE_DATA
  266. return Detail::property_from_string(property);
  267. #else
  268. return {};
  269. #endif
  270. }
  271. bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
  272. {
  273. #if ENABLE_UNICODE_DATA
  274. return Detail::code_point_has_property(code_point, property);
  275. #else
  276. return false;
  277. #endif
  278. }
  279. bool is_ecma262_property([[maybe_unused]] Property property)
  280. {
  281. #if ENABLE_UNICODE_DATA
  282. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  283. switch (property) {
  284. case Unicode::Property::ASCII:
  285. case Unicode::Property::ASCII_Hex_Digit:
  286. case Unicode::Property::Alphabetic:
  287. case Unicode::Property::Any:
  288. case Unicode::Property::Assigned:
  289. case Unicode::Property::Bidi_Control:
  290. case Unicode::Property::Bidi_Mirrored:
  291. case Unicode::Property::Case_Ignorable:
  292. case Unicode::Property::Cased:
  293. case Unicode::Property::Changes_When_Casefolded:
  294. case Unicode::Property::Changes_When_Casemapped:
  295. case Unicode::Property::Changes_When_Lowercased:
  296. case Unicode::Property::Changes_When_NFKC_Casefolded:
  297. case Unicode::Property::Changes_When_Titlecased:
  298. case Unicode::Property::Changes_When_Uppercased:
  299. case Unicode::Property::Dash:
  300. case Unicode::Property::Default_Ignorable_Code_Point:
  301. case Unicode::Property::Deprecated:
  302. case Unicode::Property::Diacritic:
  303. case Unicode::Property::Emoji:
  304. case Unicode::Property::Emoji_Component:
  305. case Unicode::Property::Emoji_Modifier:
  306. case Unicode::Property::Emoji_Modifier_Base:
  307. case Unicode::Property::Emoji_Presentation:
  308. case Unicode::Property::Extended_Pictographic:
  309. case Unicode::Property::Extender:
  310. case Unicode::Property::Grapheme_Base:
  311. case Unicode::Property::Grapheme_Extend:
  312. case Unicode::Property::Hex_Digit:
  313. case Unicode::Property::IDS_Binary_Operator:
  314. case Unicode::Property::IDS_Trinary_Operator:
  315. case Unicode::Property::ID_Continue:
  316. case Unicode::Property::ID_Start:
  317. case Unicode::Property::Ideographic:
  318. case Unicode::Property::Join_Control:
  319. case Unicode::Property::Logical_Order_Exception:
  320. case Unicode::Property::Lowercase:
  321. case Unicode::Property::Math:
  322. case Unicode::Property::Noncharacter_Code_Point:
  323. case Unicode::Property::Pattern_Syntax:
  324. case Unicode::Property::Pattern_White_Space:
  325. case Unicode::Property::Quotation_Mark:
  326. case Unicode::Property::Radical:
  327. case Unicode::Property::Regional_Indicator:
  328. case Unicode::Property::Sentence_Terminal:
  329. case Unicode::Property::Soft_Dotted:
  330. case Unicode::Property::Terminal_Punctuation:
  331. case Unicode::Property::Unified_Ideograph:
  332. case Unicode::Property::Uppercase:
  333. case Unicode::Property::Variation_Selector:
  334. case Unicode::Property::White_Space:
  335. case Unicode::Property::XID_Continue:
  336. case Unicode::Property::XID_Start:
  337. return true;
  338. default:
  339. return false;
  340. }
  341. #else
  342. return false;
  343. #endif
  344. }
  345. Optional<Script> script_from_string([[maybe_unused]] StringView const& script)
  346. {
  347. #if ENABLE_UNICODE_DATA
  348. return Detail::script_from_string(script);
  349. #else
  350. return {};
  351. #endif
  352. }
  353. bool code_point_has_script([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
  354. {
  355. #if ENABLE_UNICODE_DATA
  356. return Detail::code_point_has_script(code_point, script);
  357. #else
  358. return false;
  359. #endif
  360. }
  361. bool code_point_has_script_extension([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
  362. {
  363. #if ENABLE_UNICODE_DATA
  364. return Detail::code_point_has_script_extension(code_point, script);
  365. #else
  366. return false;
  367. #endif
  368. }
  369. }