CharacterTypes.cpp 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf8View.h>
  11. #include <LibUnicode/CharacterTypes.h>
  12. #if ENABLE_UNICODE_DATA
  13. # include <LibUnicode/UnicodeData.h>
  14. #endif
  15. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  16. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  17. namespace Unicode {
  18. #if ENABLE_UNICODE_DATA
  19. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  20. {
  21. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  22. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  23. // characters and then a cased letter.
  24. auto preceding_view = string.substring_view(0, index);
  25. auto following_view = ((index + byte_length) < string.byte_length())
  26. ? string.substring_view(index + byte_length)
  27. : Utf8View {};
  28. size_t cased_letter_count = 0;
  29. for (auto code_point : preceding_view) {
  30. bool is_cased = code_point_has_property(code_point, Property::Cased);
  31. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  32. if (is_cased && !is_case_ignorable)
  33. ++cased_letter_count;
  34. else if (!is_case_ignorable)
  35. cased_letter_count = 0;
  36. }
  37. if (cased_letter_count == 0)
  38. return false;
  39. for (auto code_point : following_view) {
  40. bool is_cased = code_point_has_property(code_point, Property::Cased);
  41. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  42. if (is_case_ignorable)
  43. continue;
  44. if (is_cased)
  45. return false;
  46. break;
  47. }
  48. return true;
  49. }
  50. static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
  51. {
  52. for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
  53. auto const* special_casing = unicode_data.special_casing[i];
  54. if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
  55. return special_casing;
  56. // FIXME: Handle locale.
  57. if (special_casing->locale != Locale::None)
  58. continue;
  59. switch (special_casing->condition) {
  60. case Condition::FinalSigma:
  61. if (is_final_code_point(string, index, byte_length))
  62. return special_casing;
  63. break;
  64. default:
  65. break;
  66. }
  67. }
  68. return nullptr;
  69. }
  70. #endif
  71. u32 to_unicode_lowercase(u32 code_point)
  72. {
  73. #if ENABLE_UNICODE_DATA
  74. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  75. if (unicode_data.has_value())
  76. return unicode_data->simple_lowercase_mapping;
  77. return code_point;
  78. #else
  79. return AK::to_ascii_lowercase(code_point);
  80. #endif
  81. }
  82. u32 to_unicode_uppercase(u32 code_point)
  83. {
  84. #if ENABLE_UNICODE_DATA
  85. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  86. if (unicode_data.has_value())
  87. return unicode_data->simple_uppercase_mapping;
  88. return code_point;
  89. #else
  90. return AK::to_ascii_uppercase(code_point);
  91. #endif
  92. }
  93. String to_unicode_lowercase_full(StringView const& string)
  94. {
  95. #if ENABLE_UNICODE_DATA
  96. Utf8View view { string };
  97. StringBuilder builder;
  98. size_t index = 0;
  99. for (auto it = view.begin(); it != view.end(); ++it) {
  100. u32 code_point = *it;
  101. size_t byte_length = it.underlying_code_point_length_in_bytes();
  102. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  103. if (!unicode_data.has_value()) {
  104. builder.append_code_point(code_point);
  105. index += byte_length;
  106. continue;
  107. }
  108. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  109. if (!special_casing) {
  110. builder.append_code_point(unicode_data->simple_lowercase_mapping);
  111. index += byte_length;
  112. continue;
  113. }
  114. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  115. builder.append_code_point(special_casing->lowercase_mapping[i]);
  116. }
  117. return builder.build();
  118. #else
  119. return string.to_lowercase_string();
  120. #endif
  121. }
  122. String to_unicode_uppercase_full(StringView const& string)
  123. {
  124. #if ENABLE_UNICODE_DATA
  125. Utf8View view { string };
  126. StringBuilder builder;
  127. size_t index = 0;
  128. for (auto it = view.begin(); it != view.end(); ++it) {
  129. u32 code_point = *it;
  130. size_t byte_length = it.underlying_code_point_length_in_bytes();
  131. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  132. if (!unicode_data.has_value()) {
  133. builder.append_code_point(code_point);
  134. index += byte_length;
  135. continue;
  136. }
  137. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  138. if (!special_casing) {
  139. builder.append_code_point(unicode_data->simple_uppercase_mapping);
  140. index += byte_length;
  141. continue;
  142. }
  143. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  144. builder.append_code_point(special_casing->uppercase_mapping[i]);
  145. }
  146. return builder.build();
  147. #else
  148. return string.to_uppercase_string();
  149. #endif
  150. }
  151. Optional<GeneralCategory> general_category_from_string([[maybe_unused]] StringView const& general_category)
  152. {
  153. #if ENABLE_UNICODE_DATA
  154. return Detail::general_category_from_string(general_category);
  155. #else
  156. return {};
  157. #endif
  158. }
  159. bool code_point_has_general_category([[maybe_unused]] u32 code_point, [[maybe_unused]] GeneralCategory general_category)
  160. {
  161. #if ENABLE_UNICODE_DATA
  162. return Detail::code_point_has_general_category(code_point, general_category);
  163. #else
  164. return {};
  165. #endif
  166. }
  167. Optional<Property> property_from_string([[maybe_unused]] StringView const& property)
  168. {
  169. #if ENABLE_UNICODE_DATA
  170. return Detail::property_from_string(property);
  171. #else
  172. return {};
  173. #endif
  174. }
  175. bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
  176. {
  177. #if ENABLE_UNICODE_DATA
  178. return Detail::code_point_has_property(code_point, property);
  179. #else
  180. return false;
  181. #endif
  182. }
  183. bool is_ecma262_property([[maybe_unused]] Property property)
  184. {
  185. #if ENABLE_UNICODE_DATA
  186. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  187. switch (property) {
  188. case Unicode::Property::ASCII:
  189. case Unicode::Property::ASCII_Hex_Digit:
  190. case Unicode::Property::Alphabetic:
  191. case Unicode::Property::Any:
  192. case Unicode::Property::Assigned:
  193. case Unicode::Property::Bidi_Control:
  194. case Unicode::Property::Bidi_Mirrored:
  195. case Unicode::Property::Case_Ignorable:
  196. case Unicode::Property::Cased:
  197. case Unicode::Property::Changes_When_Casefolded:
  198. case Unicode::Property::Changes_When_Casemapped:
  199. case Unicode::Property::Changes_When_Lowercased:
  200. case Unicode::Property::Changes_When_NFKC_Casefolded:
  201. case Unicode::Property::Changes_When_Titlecased:
  202. case Unicode::Property::Changes_When_Uppercased:
  203. case Unicode::Property::Dash:
  204. case Unicode::Property::Default_Ignorable_Code_Point:
  205. case Unicode::Property::Deprecated:
  206. case Unicode::Property::Diacritic:
  207. case Unicode::Property::Emoji:
  208. case Unicode::Property::Emoji_Component:
  209. case Unicode::Property::Emoji_Modifier:
  210. case Unicode::Property::Emoji_Modifier_Base:
  211. case Unicode::Property::Emoji_Presentation:
  212. case Unicode::Property::Extended_Pictographic:
  213. case Unicode::Property::Extender:
  214. case Unicode::Property::Grapheme_Base:
  215. case Unicode::Property::Grapheme_Extend:
  216. case Unicode::Property::Hex_Digit:
  217. case Unicode::Property::IDS_Binary_Operator:
  218. case Unicode::Property::IDS_Trinary_Operator:
  219. case Unicode::Property::ID_Continue:
  220. case Unicode::Property::ID_Start:
  221. case Unicode::Property::Ideographic:
  222. case Unicode::Property::Join_Control:
  223. case Unicode::Property::Logical_Order_Exception:
  224. case Unicode::Property::Lowercase:
  225. case Unicode::Property::Math:
  226. case Unicode::Property::Noncharacter_Code_Point:
  227. case Unicode::Property::Pattern_Syntax:
  228. case Unicode::Property::Pattern_White_Space:
  229. case Unicode::Property::Quotation_Mark:
  230. case Unicode::Property::Radical:
  231. case Unicode::Property::Regional_Indicator:
  232. case Unicode::Property::Sentence_Terminal:
  233. case Unicode::Property::Soft_Dotted:
  234. case Unicode::Property::Terminal_Punctuation:
  235. case Unicode::Property::Unified_Ideograph:
  236. case Unicode::Property::Uppercase:
  237. case Unicode::Property::Variation_Selector:
  238. case Unicode::Property::White_Space:
  239. case Unicode::Property::XID_Continue:
  240. case Unicode::Property::XID_Start:
  241. return true;
  242. default:
  243. return false;
  244. }
  245. #else
  246. return false;
  247. #endif
  248. }
  249. Optional<Script> script_from_string([[maybe_unused]] StringView const& script)
  250. {
  251. #if ENABLE_UNICODE_DATA
  252. return Detail::script_from_string(script);
  253. #else
  254. return {};
  255. #endif
  256. }
  257. bool code_point_has_script([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
  258. {
  259. #if ENABLE_UNICODE_DATA
  260. return Detail::code_point_has_script(code_point, script);
  261. #else
  262. return false;
  263. #endif
  264. }
  265. bool code_point_has_script_extension([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
  266. {
  267. #if ENABLE_UNICODE_DATA
  268. return Detail::code_point_has_script_extension(code_point, script);
  269. #else
  270. return false;
  271. #endif
  272. }
  273. }