CharacterTypes.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf8View.h>
  11. #include <LibUnicode/CharacterTypes.h>
  12. #include <LibUnicode/Locale.h>
  13. #if ENABLE_UNICODE_DATA
  14. # include <LibUnicode/UnicodeData.h>
  15. #endif
  16. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  17. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  18. namespace Unicode {
  19. #if ENABLE_UNICODE_DATA
  20. static bool is_after_uppercase_i(Utf8View const& string, size_t index)
  21. {
  22. // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
  23. auto preceding_view = string.substring_view(0, index);
  24. bool found_uppercase_i = false;
  25. // FIXME: Would be better if Utf8View supported reverse iteration.
  26. for (auto code_point : preceding_view) {
  27. if (code_point == 'I') {
  28. found_uppercase_i = true;
  29. continue;
  30. }
  31. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  32. if (!unicode_data.has_value())
  33. return false;
  34. if (unicode_data->canonical_combining_class == 0)
  35. found_uppercase_i = false;
  36. else if (unicode_data->canonical_combining_class == 230)
  37. found_uppercase_i = false;
  38. }
  39. return found_uppercase_i;
  40. }
  41. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  42. {
  43. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  44. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  45. // characters and then a cased letter.
  46. auto preceding_view = string.substring_view(0, index);
  47. auto following_view = ((index + byte_length) < string.byte_length())
  48. ? string.substring_view(index + byte_length)
  49. : Utf8View {};
  50. size_t cased_letter_count = 0;
  51. for (auto code_point : preceding_view) {
  52. bool is_cased = code_point_has_property(code_point, Property::Cased);
  53. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  54. if (is_cased && !is_case_ignorable)
  55. ++cased_letter_count;
  56. else if (!is_case_ignorable)
  57. cased_letter_count = 0;
  58. }
  59. if (cased_letter_count == 0)
  60. return false;
  61. for (auto code_point : following_view) {
  62. bool is_cased = code_point_has_property(code_point, Property::Cased);
  63. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  64. if (is_case_ignorable)
  65. continue;
  66. if (is_cased)
  67. return false;
  68. break;
  69. }
  70. return true;
  71. }
  72. static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length, UnicodeData const& unicode_data)
  73. {
  74. auto requested_locale = Locale::None;
  75. if (locale.has_value()) {
  76. if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
  77. requested_locale = *maybe_locale;
  78. }
  79. for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
  80. auto const* special_casing = unicode_data.special_casing[i];
  81. if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
  82. continue;
  83. switch (special_casing->condition) {
  84. case Condition::None:
  85. return special_casing;
  86. case Condition::AfterI:
  87. if (is_after_uppercase_i(string, index))
  88. return special_casing;
  89. break;
  90. case Condition::FinalSigma:
  91. if (is_final_code_point(string, index, byte_length))
  92. return special_casing;
  93. break;
  94. default:
  95. break;
  96. }
  97. }
  98. return nullptr;
  99. }
  100. #endif
  101. u32 to_unicode_lowercase(u32 code_point)
  102. {
  103. #if ENABLE_UNICODE_DATA
  104. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  105. if (unicode_data.has_value())
  106. return unicode_data->simple_lowercase_mapping;
  107. return code_point;
  108. #else
  109. return AK::to_ascii_lowercase(code_point);
  110. #endif
  111. }
  112. u32 to_unicode_uppercase(u32 code_point)
  113. {
  114. #if ENABLE_UNICODE_DATA
  115. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  116. if (unicode_data.has_value())
  117. return unicode_data->simple_uppercase_mapping;
  118. return code_point;
  119. #else
  120. return AK::to_ascii_uppercase(code_point);
  121. #endif
  122. }
  123. String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
  124. {
  125. #if ENABLE_UNICODE_DATA
  126. Utf8View view { string };
  127. StringBuilder builder;
  128. size_t index = 0;
  129. size_t byte_length = 0;
  130. for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
  131. u32 code_point = *it;
  132. byte_length = it.underlying_code_point_length_in_bytes();
  133. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  134. if (!unicode_data.has_value()) {
  135. builder.append_code_point(code_point);
  136. continue;
  137. }
  138. auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
  139. if (!special_casing) {
  140. builder.append_code_point(unicode_data->simple_lowercase_mapping);
  141. continue;
  142. }
  143. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  144. builder.append_code_point(special_casing->lowercase_mapping[i]);
  145. }
  146. return builder.build();
  147. #else
  148. return string.to_lowercase_string();
  149. #endif
  150. }
  151. String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
  152. {
  153. #if ENABLE_UNICODE_DATA
  154. Utf8View view { string };
  155. StringBuilder builder;
  156. size_t index = 0;
  157. size_t byte_length = 0;
  158. for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
  159. u32 code_point = *it;
  160. byte_length = it.underlying_code_point_length_in_bytes();
  161. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  162. if (!unicode_data.has_value()) {
  163. builder.append_code_point(code_point);
  164. continue;
  165. }
  166. auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
  167. if (!special_casing) {
  168. builder.append_code_point(unicode_data->simple_uppercase_mapping);
  169. continue;
  170. }
  171. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  172. builder.append_code_point(special_casing->uppercase_mapping[i]);
  173. }
  174. return builder.build();
  175. #else
  176. return string.to_uppercase_string();
  177. #endif
  178. }
  179. Optional<GeneralCategory> general_category_from_string([[maybe_unused]] StringView const& general_category)
  180. {
  181. #if ENABLE_UNICODE_DATA
  182. return Detail::general_category_from_string(general_category);
  183. #else
  184. return {};
  185. #endif
  186. }
  187. bool code_point_has_general_category([[maybe_unused]] u32 code_point, [[maybe_unused]] GeneralCategory general_category)
  188. {
  189. #if ENABLE_UNICODE_DATA
  190. return Detail::code_point_has_general_category(code_point, general_category);
  191. #else
  192. return {};
  193. #endif
  194. }
  195. Optional<Property> property_from_string([[maybe_unused]] StringView const& property)
  196. {
  197. #if ENABLE_UNICODE_DATA
  198. return Detail::property_from_string(property);
  199. #else
  200. return {};
  201. #endif
  202. }
  203. bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
  204. {
  205. #if ENABLE_UNICODE_DATA
  206. return Detail::code_point_has_property(code_point, property);
  207. #else
  208. return false;
  209. #endif
  210. }
  211. bool is_ecma262_property([[maybe_unused]] Property property)
  212. {
  213. #if ENABLE_UNICODE_DATA
  214. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  215. switch (property) {
  216. case Unicode::Property::ASCII:
  217. case Unicode::Property::ASCII_Hex_Digit:
  218. case Unicode::Property::Alphabetic:
  219. case Unicode::Property::Any:
  220. case Unicode::Property::Assigned:
  221. case Unicode::Property::Bidi_Control:
  222. case Unicode::Property::Bidi_Mirrored:
  223. case Unicode::Property::Case_Ignorable:
  224. case Unicode::Property::Cased:
  225. case Unicode::Property::Changes_When_Casefolded:
  226. case Unicode::Property::Changes_When_Casemapped:
  227. case Unicode::Property::Changes_When_Lowercased:
  228. case Unicode::Property::Changes_When_NFKC_Casefolded:
  229. case Unicode::Property::Changes_When_Titlecased:
  230. case Unicode::Property::Changes_When_Uppercased:
  231. case Unicode::Property::Dash:
  232. case Unicode::Property::Default_Ignorable_Code_Point:
  233. case Unicode::Property::Deprecated:
  234. case Unicode::Property::Diacritic:
  235. case Unicode::Property::Emoji:
  236. case Unicode::Property::Emoji_Component:
  237. case Unicode::Property::Emoji_Modifier:
  238. case Unicode::Property::Emoji_Modifier_Base:
  239. case Unicode::Property::Emoji_Presentation:
  240. case Unicode::Property::Extended_Pictographic:
  241. case Unicode::Property::Extender:
  242. case Unicode::Property::Grapheme_Base:
  243. case Unicode::Property::Grapheme_Extend:
  244. case Unicode::Property::Hex_Digit:
  245. case Unicode::Property::IDS_Binary_Operator:
  246. case Unicode::Property::IDS_Trinary_Operator:
  247. case Unicode::Property::ID_Continue:
  248. case Unicode::Property::ID_Start:
  249. case Unicode::Property::Ideographic:
  250. case Unicode::Property::Join_Control:
  251. case Unicode::Property::Logical_Order_Exception:
  252. case Unicode::Property::Lowercase:
  253. case Unicode::Property::Math:
  254. case Unicode::Property::Noncharacter_Code_Point:
  255. case Unicode::Property::Pattern_Syntax:
  256. case Unicode::Property::Pattern_White_Space:
  257. case Unicode::Property::Quotation_Mark:
  258. case Unicode::Property::Radical:
  259. case Unicode::Property::Regional_Indicator:
  260. case Unicode::Property::Sentence_Terminal:
  261. case Unicode::Property::Soft_Dotted:
  262. case Unicode::Property::Terminal_Punctuation:
  263. case Unicode::Property::Unified_Ideograph:
  264. case Unicode::Property::Uppercase:
  265. case Unicode::Property::Variation_Selector:
  266. case Unicode::Property::White_Space:
  267. case Unicode::Property::XID_Continue:
  268. case Unicode::Property::XID_Start:
  269. return true;
  270. default:
  271. return false;
  272. }
  273. #else
  274. return false;
  275. #endif
  276. }
  277. Optional<Script> script_from_string([[maybe_unused]] StringView const& script)
  278. {
  279. #if ENABLE_UNICODE_DATA
  280. return Detail::script_from_string(script);
  281. #else
  282. return {};
  283. #endif
  284. }
  285. bool code_point_has_script([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
  286. {
  287. #if ENABLE_UNICODE_DATA
  288. return Detail::code_point_has_script(code_point, script);
  289. #else
  290. return false;
  291. #endif
  292. }
  293. bool code_point_has_script_extension([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
  294. {
  295. #if ENABLE_UNICODE_DATA
  296. return Detail::code_point_has_script_extension(code_point, script);
  297. #else
  298. return false;
  299. #endif
  300. }
  301. }