CharacterTypes.cpp 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf8View.h>
  11. #include <LibUnicode/CharacterTypes.h>
  12. #if ENABLE_UNICODE_DATA
  13. # include <LibUnicode/UnicodeData.h>
  14. #endif
  15. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  16. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  17. namespace Unicode {
  18. #if ENABLE_UNICODE_DATA
  19. static bool has_property(UnicodeData const& unicode_data, Property property)
  20. {
  21. return (unicode_data.properties & property) == property;
  22. }
  23. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  24. {
  25. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  26. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  27. // characters and then a cased letter.
  28. auto preceding_view = string.substring_view(0, index);
  29. auto following_view = ((index + byte_length) < string.byte_length())
  30. ? string.substring_view(index + byte_length)
  31. : Utf8View {};
  32. size_t cased_letter_count = 0;
  33. for (auto code_point : preceding_view) {
  34. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  35. if (!unicode_data.has_value())
  36. return false;
  37. bool is_cased = has_property(*unicode_data, Property::Cased);
  38. bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable);
  39. if (is_cased && !is_case_ignorable)
  40. ++cased_letter_count;
  41. else if (!is_case_ignorable)
  42. cased_letter_count = 0;
  43. }
  44. if (cased_letter_count == 0)
  45. return false;
  46. for (auto code_point : following_view) {
  47. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  48. if (!unicode_data.has_value())
  49. return false;
  50. bool is_cased = has_property(*unicode_data, Property::Cased);
  51. bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable);
  52. if (is_case_ignorable)
  53. continue;
  54. if (is_cased)
  55. return false;
  56. break;
  57. }
  58. return true;
  59. }
  60. static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
  61. {
  62. for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
  63. auto const* special_casing = unicode_data.special_casing[i];
  64. if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
  65. return special_casing;
  66. // FIXME: Handle locale.
  67. if (special_casing->locale != Locale::None)
  68. continue;
  69. switch (special_casing->condition) {
  70. case Condition::FinalSigma:
  71. if (is_final_code_point(string, index, byte_length))
  72. return special_casing;
  73. break;
  74. default:
  75. break;
  76. }
  77. }
  78. return nullptr;
  79. }
  80. #endif
  81. u32 to_unicode_lowercase(u32 code_point)
  82. {
  83. #if ENABLE_UNICODE_DATA
  84. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  85. if (unicode_data.has_value())
  86. return unicode_data->simple_lowercase_mapping;
  87. return code_point;
  88. #else
  89. return AK::to_ascii_lowercase(code_point);
  90. #endif
  91. }
  92. u32 to_unicode_uppercase(u32 code_point)
  93. {
  94. #if ENABLE_UNICODE_DATA
  95. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  96. if (unicode_data.has_value())
  97. return unicode_data->simple_uppercase_mapping;
  98. return code_point;
  99. #else
  100. return AK::to_ascii_uppercase(code_point);
  101. #endif
  102. }
  103. String to_unicode_lowercase_full(StringView const& string)
  104. {
  105. #if ENABLE_UNICODE_DATA
  106. Utf8View view { string };
  107. StringBuilder builder;
  108. size_t index = 0;
  109. for (auto it = view.begin(); it != view.end(); ++it) {
  110. u32 code_point = *it;
  111. size_t byte_length = it.underlying_code_point_length_in_bytes();
  112. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  113. if (!unicode_data.has_value()) {
  114. builder.append_code_point(code_point);
  115. index += byte_length;
  116. continue;
  117. }
  118. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  119. if (!special_casing) {
  120. builder.append_code_point(unicode_data->simple_lowercase_mapping);
  121. index += byte_length;
  122. continue;
  123. }
  124. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  125. builder.append_code_point(special_casing->lowercase_mapping[i]);
  126. }
  127. return builder.build();
  128. #else
  129. return string.to_lowercase_string();
  130. #endif
  131. }
  132. String to_unicode_uppercase_full(StringView const& string)
  133. {
  134. #if ENABLE_UNICODE_DATA
  135. Utf8View view { string };
  136. StringBuilder builder;
  137. size_t index = 0;
  138. for (auto it = view.begin(); it != view.end(); ++it) {
  139. u32 code_point = *it;
  140. size_t byte_length = it.underlying_code_point_length_in_bytes();
  141. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  142. if (!unicode_data.has_value()) {
  143. builder.append_code_point(code_point);
  144. index += byte_length;
  145. continue;
  146. }
  147. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  148. if (!special_casing) {
  149. builder.append_code_point(unicode_data->simple_uppercase_mapping);
  150. index += byte_length;
  151. continue;
  152. }
  153. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  154. builder.append_code_point(special_casing->uppercase_mapping[i]);
  155. }
  156. return builder.build();
  157. #else
  158. return string.to_uppercase_string();
  159. #endif
  160. }
  161. Optional<Property> property_from_string([[maybe_unused]] StringView const& property)
  162. {
  163. #if ENABLE_UNICODE_DATA
  164. return Detail::property_from_string(property);
  165. #else
  166. return {};
  167. #endif
  168. }
  169. bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
  170. {
  171. #if ENABLE_UNICODE_DATA
  172. if (property == Property::Any)
  173. return is_unicode(code_point);
  174. auto unicode_data = Detail::unicode_data_for_code_point(code_point);
  175. if (!unicode_data.has_value())
  176. return false;
  177. return has_property(*unicode_data, property);
  178. #else
  179. return false;
  180. #endif
  181. }
  182. bool is_ecma262_property([[maybe_unused]] Property property)
  183. {
  184. #if ENABLE_UNICODE_DATA
  185. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  186. // Note: Some of the properties in the above link are not yet parsed by the LibUnicode generator. They are left
  187. // commented out here until they are parsed and can be used.
  188. switch (property) {
  189. case Unicode::Property::ASCII:
  190. case Unicode::Property::ASCII_Hex_Digit:
  191. case Unicode::Property::Alphabetic:
  192. case Unicode::Property::Any:
  193. case Unicode::Property::Assigned:
  194. case Unicode::Property::Bidi_Control:
  195. // case Unicode::Property::Bidi_Mirrored:
  196. case Unicode::Property::Case_Ignorable:
  197. case Unicode::Property::Cased:
  198. case Unicode::Property::Changes_When_Casefolded:
  199. case Unicode::Property::Changes_When_Casemapped:
  200. case Unicode::Property::Changes_When_Lowercased:
  201. // case Unicode::Property::Changes_When_NFKC_Casefolded:
  202. case Unicode::Property::Changes_When_Titlecased:
  203. case Unicode::Property::Changes_When_Uppercased:
  204. case Unicode::Property::Dash:
  205. case Unicode::Property::Default_Ignorable_Code_Point:
  206. case Unicode::Property::Deprecated:
  207. case Unicode::Property::Diacritic:
  208. // case Unicode::Property::Emoji:
  209. // case Unicode::Property::Emoji_Component:
  210. // case Unicode::Property::Emoji_Modifier:
  211. // case Unicode::Property::Emoji_Modifier_Base:
  212. // case Unicode::Property::Emoji_Presentation:
  213. // case Unicode::Property::Extended_Pictographic:
  214. case Unicode::Property::Extender:
  215. case Unicode::Property::Grapheme_Base:
  216. case Unicode::Property::Grapheme_Extend:
  217. case Unicode::Property::Hex_Digit:
  218. case Unicode::Property::IDS_Binary_Operator:
  219. case Unicode::Property::IDS_Trinary_Operator:
  220. case Unicode::Property::ID_Continue:
  221. case Unicode::Property::ID_Start:
  222. case Unicode::Property::Ideographic:
  223. case Unicode::Property::Join_Control:
  224. case Unicode::Property::Logical_Order_Exception:
  225. case Unicode::Property::Lowercase:
  226. case Unicode::Property::Math:
  227. case Unicode::Property::Noncharacter_Code_Point:
  228. case Unicode::Property::Pattern_Syntax:
  229. case Unicode::Property::Pattern_White_Space:
  230. case Unicode::Property::Quotation_Mark:
  231. case Unicode::Property::Radical:
  232. case Unicode::Property::Regional_Indicator:
  233. case Unicode::Property::Sentence_Terminal:
  234. case Unicode::Property::Soft_Dotted:
  235. case Unicode::Property::Terminal_Punctuation:
  236. case Unicode::Property::Unified_Ideograph:
  237. case Unicode::Property::Uppercase:
  238. case Unicode::Property::Variation_Selector:
  239. case Unicode::Property::White_Space:
  240. case Unicode::Property::XID_Continue:
  241. case Unicode::Property::XID_Start:
  242. return true;
  243. default:
  244. return false;
  245. }
  246. #else
  247. return false;
  248. #endif
  249. }
  250. }