CharacterTypes.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf8View.h>
  11. #include <LibUnicode/CharacterTypes.h>
  12. #if ENABLE_UNICODE_DATA
  13. # include <LibUnicode/UnicodeData.h>
  14. #endif
  15. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  16. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  17. namespace Unicode {
  18. #if ENABLE_UNICODE_DATA
  19. static bool has_general_category(UnicodeData const& unicode_data, GeneralCategory general_category)
  20. {
  21. return (unicode_data.general_category & general_category) != GeneralCategory::None;
  22. }
  23. static bool has_property(UnicodeData const& unicode_data, Property property)
  24. {
  25. return (unicode_data.properties & property) == property;
  26. }
  27. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  28. {
  29. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  30. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  31. // characters and then a cased letter.
  32. auto preceding_view = string.substring_view(0, index);
  33. auto following_view = ((index + byte_length) < string.byte_length())
  34. ? string.substring_view(index + byte_length)
  35. : Utf8View {};
  36. size_t cased_letter_count = 0;
  37. for (auto code_point : preceding_view) {
  38. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  39. if (!unicode_data.has_value())
  40. return false;
  41. bool is_cased = has_property(*unicode_data, Property::Cased);
  42. bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable);
  43. if (is_cased && !is_case_ignorable)
  44. ++cased_letter_count;
  45. else if (!is_case_ignorable)
  46. cased_letter_count = 0;
  47. }
  48. if (cased_letter_count == 0)
  49. return false;
  50. for (auto code_point : following_view) {
  51. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  52. if (!unicode_data.has_value())
  53. return false;
  54. bool is_cased = has_property(*unicode_data, Property::Cased);
  55. bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable);
  56. if (is_case_ignorable)
  57. continue;
  58. if (is_cased)
  59. return false;
  60. break;
  61. }
  62. return true;
  63. }
  64. static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
  65. {
  66. for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
  67. auto const* special_casing = unicode_data.special_casing[i];
  68. if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
  69. return special_casing;
  70. // FIXME: Handle locale.
  71. if (special_casing->locale != Locale::None)
  72. continue;
  73. switch (special_casing->condition) {
  74. case Condition::FinalSigma:
  75. if (is_final_code_point(string, index, byte_length))
  76. return special_casing;
  77. break;
  78. default:
  79. break;
  80. }
  81. }
  82. return nullptr;
  83. }
  84. #endif
  85. u32 to_unicode_lowercase(u32 code_point)
  86. {
  87. #if ENABLE_UNICODE_DATA
  88. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  89. if (unicode_data.has_value())
  90. return unicode_data->simple_lowercase_mapping;
  91. return code_point;
  92. #else
  93. return AK::to_ascii_lowercase(code_point);
  94. #endif
  95. }
  96. u32 to_unicode_uppercase(u32 code_point)
  97. {
  98. #if ENABLE_UNICODE_DATA
  99. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  100. if (unicode_data.has_value())
  101. return unicode_data->simple_uppercase_mapping;
  102. return code_point;
  103. #else
  104. return AK::to_ascii_uppercase(code_point);
  105. #endif
  106. }
  107. String to_unicode_lowercase_full(StringView const& string)
  108. {
  109. #if ENABLE_UNICODE_DATA
  110. Utf8View view { string };
  111. StringBuilder builder;
  112. size_t index = 0;
  113. for (auto it = view.begin(); it != view.end(); ++it) {
  114. u32 code_point = *it;
  115. size_t byte_length = it.underlying_code_point_length_in_bytes();
  116. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  117. if (!unicode_data.has_value()) {
  118. builder.append_code_point(code_point);
  119. index += byte_length;
  120. continue;
  121. }
  122. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  123. if (!special_casing) {
  124. builder.append_code_point(unicode_data->simple_lowercase_mapping);
  125. index += byte_length;
  126. continue;
  127. }
  128. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  129. builder.append_code_point(special_casing->lowercase_mapping[i]);
  130. }
  131. return builder.build();
  132. #else
  133. return string.to_lowercase_string();
  134. #endif
  135. }
  136. String to_unicode_uppercase_full(StringView const& string)
  137. {
  138. #if ENABLE_UNICODE_DATA
  139. Utf8View view { string };
  140. StringBuilder builder;
  141. size_t index = 0;
  142. for (auto it = view.begin(); it != view.end(); ++it) {
  143. u32 code_point = *it;
  144. size_t byte_length = it.underlying_code_point_length_in_bytes();
  145. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  146. if (!unicode_data.has_value()) {
  147. builder.append_code_point(code_point);
  148. index += byte_length;
  149. continue;
  150. }
  151. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  152. if (!special_casing) {
  153. builder.append_code_point(unicode_data->simple_uppercase_mapping);
  154. index += byte_length;
  155. continue;
  156. }
  157. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  158. builder.append_code_point(special_casing->uppercase_mapping[i]);
  159. }
  160. return builder.build();
  161. #else
  162. return string.to_uppercase_string();
  163. #endif
  164. }
  165. Optional<GeneralCategory> general_category_from_string([[maybe_unused]] StringView const& general_category)
  166. {
  167. #if ENABLE_UNICODE_DATA
  168. return Detail::general_category_from_string(general_category);
  169. #else
  170. return {};
  171. #endif
  172. }
  173. bool code_point_has_general_category([[maybe_unused]] u32 code_point, [[maybe_unused]] GeneralCategory general_category)
  174. {
  175. #if ENABLE_UNICODE_DATA
  176. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  177. if (!unicode_data.has_value())
  178. return false;
  179. return has_general_category(*unicode_data, general_category);
  180. #else
  181. return {};
  182. #endif
  183. }
  184. Optional<Property> property_from_string([[maybe_unused]] StringView const& property)
  185. {
  186. #if ENABLE_UNICODE_DATA
  187. return Detail::property_from_string(property);
  188. #else
  189. return {};
  190. #endif
  191. }
  192. bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
  193. {
  194. #if ENABLE_UNICODE_DATA
  195. if (property == Property::Any)
  196. return is_unicode(code_point);
  197. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  198. if (!unicode_data.has_value())
  199. return false;
  200. return has_property(*unicode_data, property);
  201. #else
  202. return false;
  203. #endif
  204. }
  205. bool is_ecma262_property([[maybe_unused]] Property property)
  206. {
  207. #if ENABLE_UNICODE_DATA
  208. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  209. // Note: Some of the properties in the above link are not yet parsed by the LibUnicode generator. They are left
  210. // commented out here until they are parsed and can be used.
  211. switch (property) {
  212. case Unicode::Property::ASCII:
  213. case Unicode::Property::ASCII_Hex_Digit:
  214. case Unicode::Property::Alphabetic:
  215. case Unicode::Property::Any:
  216. case Unicode::Property::Assigned:
  217. case Unicode::Property::Bidi_Control:
  218. case Unicode::Property::Bidi_Mirrored:
  219. case Unicode::Property::Case_Ignorable:
  220. case Unicode::Property::Cased:
  221. case Unicode::Property::Changes_When_Casefolded:
  222. case Unicode::Property::Changes_When_Casemapped:
  223. case Unicode::Property::Changes_When_Lowercased:
  224. // case Unicode::Property::Changes_When_NFKC_Casefolded:
  225. case Unicode::Property::Changes_When_Titlecased:
  226. case Unicode::Property::Changes_When_Uppercased:
  227. case Unicode::Property::Dash:
  228. case Unicode::Property::Default_Ignorable_Code_Point:
  229. case Unicode::Property::Deprecated:
  230. case Unicode::Property::Diacritic:
  231. case Unicode::Property::Emoji:
  232. case Unicode::Property::Emoji_Component:
  233. case Unicode::Property::Emoji_Modifier:
  234. case Unicode::Property::Emoji_Modifier_Base:
  235. case Unicode::Property::Emoji_Presentation:
  236. case Unicode::Property::Extended_Pictographic:
  237. case Unicode::Property::Extender:
  238. case Unicode::Property::Grapheme_Base:
  239. case Unicode::Property::Grapheme_Extend:
  240. case Unicode::Property::Hex_Digit:
  241. case Unicode::Property::IDS_Binary_Operator:
  242. case Unicode::Property::IDS_Trinary_Operator:
  243. case Unicode::Property::ID_Continue:
  244. case Unicode::Property::ID_Start:
  245. case Unicode::Property::Ideographic:
  246. case Unicode::Property::Join_Control:
  247. case Unicode::Property::Logical_Order_Exception:
  248. case Unicode::Property::Lowercase:
  249. case Unicode::Property::Math:
  250. case Unicode::Property::Noncharacter_Code_Point:
  251. case Unicode::Property::Pattern_Syntax:
  252. case Unicode::Property::Pattern_White_Space:
  253. case Unicode::Property::Quotation_Mark:
  254. case Unicode::Property::Radical:
  255. case Unicode::Property::Regional_Indicator:
  256. case Unicode::Property::Sentence_Terminal:
  257. case Unicode::Property::Soft_Dotted:
  258. case Unicode::Property::Terminal_Punctuation:
  259. case Unicode::Property::Unified_Ideograph:
  260. case Unicode::Property::Uppercase:
  261. case Unicode::Property::Variation_Selector:
  262. case Unicode::Property::White_Space:
  263. case Unicode::Property::XID_Continue:
  264. case Unicode::Property::XID_Start:
  265. return true;
  266. default:
  267. return false;
  268. }
  269. #else
  270. return false;
  271. #endif
  272. }
  273. Optional<Script> script_from_string([[maybe_unused]] StringView const& script)
  274. {
  275. #if ENABLE_UNICODE_DATA
  276. return Detail::script_from_string(script);
  277. #else
  278. return {};
  279. #endif
  280. }
  281. bool code_point_has_script([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
  282. {
  283. #if ENABLE_UNICODE_DATA
  284. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  285. if (!unicode_data.has_value())
  286. return false;
  287. return unicode_data->script == script;
  288. #else
  289. return false;
  290. #endif
  291. }
  292. bool code_point_has_script_extension([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
  293. {
  294. #if ENABLE_UNICODE_DATA
  295. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  296. if (!unicode_data.has_value())
  297. return false;
  298. if (unicode_data->script == script)
  299. return true;
  300. for (u32 i = 0; i < unicode_data->script_extensions_size; ++i) {
  301. if (unicode_data->script_extensions[i] == script)
  302. return true;
  303. }
  304. return false;
  305. #else
  306. return false;
  307. #endif
  308. }
  309. }