CharacterTypes.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404
  1. /*
  2. * Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Array.h>
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Find.h>
  9. #include <AK/Traits.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #include <LibUnicode/ICU.h>
  12. #include <unicode/uchar.h>
  13. #include <unicode/uscript.h>
  14. namespace Unicode {
  15. template<typename PropertyType>
  16. struct PropertyName {
  17. Optional<StringView> long_name;
  18. Optional<StringView> short_name;
  19. Optional<StringView> additional_name;
  20. };
  21. // From uchar.h:
  22. // Unicode allows for additional names, beyond the long and short name, which would be indicated by U_LONG_PROPERTY_NAME + i
  23. static constexpr auto ADDITIONAL_NAME = static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + 1);
  24. }
  25. template<typename PropertyType>
  26. struct AK::Traits<Unicode::PropertyName<PropertyType>> {
  27. static constexpr bool equals(Unicode::PropertyName<PropertyType> const& candidate, StringView property)
  28. {
  29. return property == candidate.long_name || property == candidate.short_name || property == candidate.additional_name;
  30. }
  31. };
  32. namespace Unicode {
  33. static constexpr GeneralCategory GENERAL_CATEGORY_CASED_LETTER = U_CHAR_CATEGORY_COUNT + 1;
  34. static constexpr GeneralCategory GENERAL_CATEGORY_LETTER = U_CHAR_CATEGORY_COUNT + 2;
  35. static constexpr GeneralCategory GENERAL_CATEGORY_MARK = U_CHAR_CATEGORY_COUNT + 3;
  36. static constexpr GeneralCategory GENERAL_CATEGORY_NUMBER = U_CHAR_CATEGORY_COUNT + 4;
  37. static constexpr GeneralCategory GENERAL_CATEGORY_PUNCTUATION = U_CHAR_CATEGORY_COUNT + 5;
  38. static constexpr GeneralCategory GENERAL_CATEGORY_SYMBOL = U_CHAR_CATEGORY_COUNT + 6;
  39. static constexpr GeneralCategory GENERAL_CATEGORY_SEPARATOR = U_CHAR_CATEGORY_COUNT + 7;
  40. static constexpr GeneralCategory GENERAL_CATEGORY_OTHER = U_CHAR_CATEGORY_COUNT + 8;
  41. static constexpr GeneralCategory GENERAL_CATEGORY_LIMIT = U_CHAR_CATEGORY_COUNT + 9;
  42. Optional<GeneralCategory> general_category_from_string(StringView general_category)
  43. {
  44. static auto general_category_names = []() {
  45. Array<PropertyName<GeneralCategory>, GENERAL_CATEGORY_LIMIT.value()> names;
  46. auto set_names = [&](auto property, auto index, auto general_category) {
  47. if (char const* name = u_getPropertyValueName(property, general_category, U_LONG_PROPERTY_NAME))
  48. names[index.value()].long_name = StringView { name, strlen(name) };
  49. if (char const* name = u_getPropertyValueName(property, general_category, U_SHORT_PROPERTY_NAME))
  50. names[index.value()].short_name = StringView { name, strlen(name) };
  51. if (char const* name = u_getPropertyValueName(property, general_category, ADDITIONAL_NAME))
  52. names[index.value()].additional_name = StringView { name, strlen(name) };
  53. };
  54. for (GeneralCategory general_category = 0; general_category < U_CHAR_CATEGORY_COUNT; ++general_category)
  55. set_names(UCHAR_GENERAL_CATEGORY, general_category, static_cast<UCharCategory>(general_category.value()));
  56. set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_CASED_LETTER, U_GC_LC_MASK);
  57. set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_LETTER, U_GC_L_MASK);
  58. set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_MARK, U_GC_M_MASK);
  59. set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_NUMBER, U_GC_N_MASK);
  60. set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_PUNCTUATION, U_GC_P_MASK);
  61. set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_SYMBOL, U_GC_S_MASK);
  62. set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_SEPARATOR, U_GC_Z_MASK);
  63. set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_OTHER, U_GC_C_MASK);
  64. return names;
  65. }();
  66. if (auto index = find_index(general_category_names.begin(), general_category_names.end(), general_category); index != general_category_names.size())
  67. return static_cast<GeneralCategory>(index);
  68. return {};
  69. }
  70. bool code_point_has_general_category(u32 code_point, GeneralCategory general_category)
  71. {
  72. auto icu_code_point = static_cast<UChar32>(code_point);
  73. auto icu_general_category = static_cast<UCharCategory>(general_category.value());
  74. if (general_category == GENERAL_CATEGORY_CASED_LETTER)
  75. return (U_GET_GC_MASK(icu_code_point) & U_GC_LC_MASK) != 0;
  76. if (general_category == GENERAL_CATEGORY_LETTER)
  77. return (U_GET_GC_MASK(icu_code_point) & U_GC_L_MASK) != 0;
  78. if (general_category == GENERAL_CATEGORY_MARK)
  79. return (U_GET_GC_MASK(icu_code_point) & U_GC_M_MASK) != 0;
  80. if (general_category == GENERAL_CATEGORY_NUMBER)
  81. return (U_GET_GC_MASK(icu_code_point) & U_GC_N_MASK) != 0;
  82. if (general_category == GENERAL_CATEGORY_PUNCTUATION)
  83. return (U_GET_GC_MASK(icu_code_point) & U_GC_P_MASK) != 0;
  84. if (general_category == GENERAL_CATEGORY_SYMBOL)
  85. return (U_GET_GC_MASK(icu_code_point) & U_GC_S_MASK) != 0;
  86. if (general_category == GENERAL_CATEGORY_SEPARATOR)
  87. return (U_GET_GC_MASK(icu_code_point) & U_GC_Z_MASK) != 0;
  88. if (general_category == GENERAL_CATEGORY_OTHER)
  89. return (U_GET_GC_MASK(icu_code_point) & U_GC_C_MASK) != 0;
  90. return u_charType(icu_code_point) == icu_general_category;
  91. }
  92. bool code_point_is_printable(u32 code_point)
  93. {
  94. return static_cast<bool>(u_isprint(static_cast<UChar32>(code_point)));
  95. }
  96. bool code_point_has_control_general_category(u32 code_point)
  97. {
  98. return code_point_has_general_category(code_point, U_CONTROL_CHAR);
  99. }
  100. bool code_point_has_letter_general_category(u32 code_point)
  101. {
  102. return code_point_has_general_category(code_point, GENERAL_CATEGORY_LETTER);
  103. }
  104. bool code_point_has_number_general_category(u32 code_point)
  105. {
  106. return code_point_has_general_category(code_point, GENERAL_CATEGORY_NUMBER);
  107. }
  108. bool code_point_has_punctuation_general_category(u32 code_point)
  109. {
  110. return code_point_has_general_category(code_point, GENERAL_CATEGORY_PUNCTUATION);
  111. }
  112. bool code_point_has_separator_general_category(u32 code_point)
  113. {
  114. return code_point_has_general_category(code_point, GENERAL_CATEGORY_SEPARATOR);
  115. }
  116. bool code_point_has_space_separator_general_category(u32 code_point)
  117. {
  118. return code_point_has_general_category(code_point, U_SPACE_SEPARATOR);
  119. }
  120. bool code_point_has_symbol_general_category(u32 code_point)
  121. {
  122. return code_point_has_general_category(code_point, GENERAL_CATEGORY_SYMBOL);
  123. }
  124. static constexpr Property PROPERTY_ANY = UCHAR_BINARY_LIMIT + 1;
  125. static constexpr Property PROPERTY_ASCII = UCHAR_BINARY_LIMIT + 2;
  126. static constexpr Property PROPERTY_ASSIGNED = UCHAR_BINARY_LIMIT + 3;
  127. static constexpr Property PROPERTY_LIMIT = UCHAR_BINARY_LIMIT + 4;
  128. Optional<Property> property_from_string(StringView property)
  129. {
  130. static auto property_names = []() {
  131. Array<PropertyName<Property>, PROPERTY_LIMIT.value()> names;
  132. for (Property property = 0; property < UCHAR_BINARY_LIMIT; ++property) {
  133. auto icu_property = static_cast<UProperty>(property.value());
  134. if (char const* name = u_getPropertyName(icu_property, U_LONG_PROPERTY_NAME))
  135. names[property.value()].long_name = StringView { name, strlen(name) };
  136. if (char const* name = u_getPropertyName(icu_property, U_SHORT_PROPERTY_NAME))
  137. names[property.value()].short_name = StringView { name, strlen(name) };
  138. if (char const* name = u_getPropertyName(icu_property, ADDITIONAL_NAME))
  139. names[property.value()].additional_name = StringView { name, strlen(name) };
  140. }
  141. names[PROPERTY_ANY.value()] = { "Any"sv, {}, {} };
  142. names[PROPERTY_ASCII.value()] = { "ASCII"sv, {}, {} };
  143. names[PROPERTY_ASSIGNED.value()] = { "Assigned"sv, {}, {} };
  144. return names;
  145. }();
  146. if (auto index = find_index(property_names.begin(), property_names.end(), property); index != property_names.size())
  147. return static_cast<Property>(index);
  148. return {};
  149. }
  150. bool code_point_has_property(u32 code_point, Property property)
  151. {
  152. auto icu_code_point = static_cast<UChar32>(code_point);
  153. auto icu_property = static_cast<UProperty>(property.value());
  154. if (property == PROPERTY_ANY)
  155. return is_unicode(code_point);
  156. if (property == PROPERTY_ASCII)
  157. return is_ascii(code_point);
  158. if (property == PROPERTY_ASSIGNED)
  159. return u_isdefined(icu_code_point);
  160. return static_cast<bool>(u_hasBinaryProperty(icu_code_point, icu_property));
  161. }
  162. bool code_point_has_emoji_property(u32 code_point)
  163. {
  164. return code_point_has_property(code_point, UCHAR_EMOJI);
  165. }
  166. bool code_point_has_emoji_modifier_base_property(u32 code_point)
  167. {
  168. return code_point_has_property(code_point, UCHAR_EMOJI_MODIFIER_BASE);
  169. }
  170. bool code_point_has_emoji_presentation_property(u32 code_point)
  171. {
  172. return code_point_has_property(code_point, UCHAR_EMOJI_PRESENTATION);
  173. }
  174. bool code_point_has_identifier_start_property(u32 code_point)
  175. {
  176. return u_isIDStart(static_cast<UChar32>(code_point));
  177. }
  178. bool code_point_has_identifier_continue_property(u32 code_point)
  179. {
  180. return u_isIDPart(static_cast<UChar32>(code_point));
  181. }
  182. bool code_point_has_regional_indicator_property(u32 code_point)
  183. {
  184. return code_point_has_property(code_point, UCHAR_REGIONAL_INDICATOR);
  185. }
  186. bool code_point_has_variation_selector_property(u32 code_point)
  187. {
  188. return code_point_has_property(code_point, UCHAR_VARIATION_SELECTOR);
  189. }
  190. // https://tc39.es/ecma262/#table-binary-unicode-properties
  191. bool is_ecma262_property(Property property)
  192. {
  193. if (property == PROPERTY_ANY || property == PROPERTY_ASCII || property == PROPERTY_ASSIGNED)
  194. return true;
  195. switch (property.value()) {
  196. case UCHAR_ASCII_HEX_DIGIT:
  197. case UCHAR_ALPHABETIC:
  198. case UCHAR_BIDI_CONTROL:
  199. case UCHAR_BIDI_MIRRORED:
  200. case UCHAR_CASE_IGNORABLE:
  201. case UCHAR_CASED:
  202. case UCHAR_CHANGES_WHEN_CASEFOLDED:
  203. case UCHAR_CHANGES_WHEN_CASEMAPPED:
  204. case UCHAR_CHANGES_WHEN_LOWERCASED:
  205. case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
  206. case UCHAR_CHANGES_WHEN_TITLECASED:
  207. case UCHAR_CHANGES_WHEN_UPPERCASED:
  208. case UCHAR_DASH:
  209. case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
  210. case UCHAR_DEPRECATED:
  211. case UCHAR_DIACRITIC:
  212. case UCHAR_EMOJI:
  213. case UCHAR_EMOJI_COMPONENT:
  214. case UCHAR_EMOJI_MODIFIER:
  215. case UCHAR_EMOJI_MODIFIER_BASE:
  216. case UCHAR_EMOJI_PRESENTATION:
  217. case UCHAR_EXTENDED_PICTOGRAPHIC:
  218. case UCHAR_EXTENDER:
  219. case UCHAR_GRAPHEME_BASE:
  220. case UCHAR_GRAPHEME_EXTEND:
  221. case UCHAR_HEX_DIGIT:
  222. case UCHAR_IDS_BINARY_OPERATOR:
  223. case UCHAR_IDS_TRINARY_OPERATOR:
  224. case UCHAR_ID_CONTINUE:
  225. case UCHAR_ID_START:
  226. case UCHAR_IDEOGRAPHIC:
  227. case UCHAR_JOIN_CONTROL:
  228. case UCHAR_LOGICAL_ORDER_EXCEPTION:
  229. case UCHAR_LOWERCASE:
  230. case UCHAR_MATH:
  231. case UCHAR_NONCHARACTER_CODE_POINT:
  232. case UCHAR_PATTERN_SYNTAX:
  233. case UCHAR_PATTERN_WHITE_SPACE:
  234. case UCHAR_QUOTATION_MARK:
  235. case UCHAR_RADICAL:
  236. case UCHAR_REGIONAL_INDICATOR:
  237. case UCHAR_S_TERM:
  238. case UCHAR_SOFT_DOTTED:
  239. case UCHAR_TERMINAL_PUNCTUATION:
  240. case UCHAR_UNIFIED_IDEOGRAPH:
  241. case UCHAR_UPPERCASE:
  242. case UCHAR_VARIATION_SELECTOR:
  243. case UCHAR_WHITE_SPACE:
  244. case UCHAR_XID_CONTINUE:
  245. case UCHAR_XID_START:
  246. return true;
  247. default:
  248. return false;
  249. }
  250. }
  251. Optional<Script> script_from_string(StringView script)
  252. {
  253. static auto script_names = []() {
  254. Array<PropertyName<Script>, static_cast<size_t>(USCRIPT_CODE_LIMIT)> names;
  255. for (Script script = 0; script < USCRIPT_CODE_LIMIT; ++script) {
  256. auto icu_script = static_cast<UScriptCode>(script.value());
  257. if (char const* name = uscript_getName(icu_script))
  258. names[script.value()].long_name = StringView { name, strlen(name) };
  259. if (char const* name = uscript_getShortName(icu_script))
  260. names[script.value()].short_name = StringView { name, strlen(name) };
  261. if (char const* name = u_getPropertyValueName(UCHAR_SCRIPT, icu_script, ADDITIONAL_NAME))
  262. names[script.value()].additional_name = StringView { name, strlen(name) };
  263. }
  264. return names;
  265. }();
  266. if (auto index = find_index(script_names.begin(), script_names.end(), script); index != script_names.size())
  267. return static_cast<Script>(index);
  268. return {};
  269. }
  270. bool code_point_has_script(u32 code_point, Script script)
  271. {
  272. UErrorCode status = U_ZERO_ERROR;
  273. auto icu_code_point = static_cast<UChar32>(code_point);
  274. auto icu_script = static_cast<UScriptCode>(script.value());
  275. if (auto result = uscript_getScript(icu_code_point, &status); icu_success(status))
  276. return result == icu_script;
  277. return false;
  278. }
  279. bool code_point_has_script_extension(u32 code_point, Script script)
  280. {
  281. auto icu_code_point = static_cast<UChar32>(code_point);
  282. auto icu_script = static_cast<UScriptCode>(script.value());
  283. return static_cast<bool>(uscript_hasScript(icu_code_point, icu_script));
  284. }
  285. static constexpr BidiClass char_direction_to_bidi_class(UCharDirection direction)
  286. {
  287. switch (direction) {
  288. case U_ARABIC_NUMBER:
  289. return BidiClass::ArabicNumber;
  290. case U_BLOCK_SEPARATOR:
  291. return BidiClass::BlockSeparator;
  292. case U_BOUNDARY_NEUTRAL:
  293. return BidiClass::BoundaryNeutral;
  294. case U_COMMON_NUMBER_SEPARATOR:
  295. return BidiClass::CommonNumberSeparator;
  296. case U_DIR_NON_SPACING_MARK:
  297. return BidiClass::DirNonSpacingMark;
  298. case U_EUROPEAN_NUMBER:
  299. return BidiClass::EuropeanNumber;
  300. case U_EUROPEAN_NUMBER_SEPARATOR:
  301. return BidiClass::EuropeanNumberSeparator;
  302. case U_EUROPEAN_NUMBER_TERMINATOR:
  303. return BidiClass::EuropeanNumberTerminator;
  304. case U_FIRST_STRONG_ISOLATE:
  305. return BidiClass::FirstStrongIsolate;
  306. case U_LEFT_TO_RIGHT:
  307. return BidiClass::LeftToRight;
  308. case U_LEFT_TO_RIGHT_EMBEDDING:
  309. return BidiClass::LeftToRightEmbedding;
  310. case U_LEFT_TO_RIGHT_ISOLATE:
  311. return BidiClass::LeftToRightIsolate;
  312. case U_LEFT_TO_RIGHT_OVERRIDE:
  313. return BidiClass::LeftToRightOverride;
  314. case U_OTHER_NEUTRAL:
  315. return BidiClass::OtherNeutral;
  316. case U_POP_DIRECTIONAL_FORMAT:
  317. return BidiClass::PopDirectionalFormat;
  318. case U_POP_DIRECTIONAL_ISOLATE:
  319. return BidiClass::PopDirectionalIsolate;
  320. case U_RIGHT_TO_LEFT:
  321. return BidiClass::RightToLeft;
  322. case U_RIGHT_TO_LEFT_ARABIC:
  323. return BidiClass::RightToLeftArabic;
  324. case U_RIGHT_TO_LEFT_EMBEDDING:
  325. return BidiClass::RightToLeftEmbedding;
  326. case U_RIGHT_TO_LEFT_ISOLATE:
  327. return BidiClass::RightToLeftIsolate;
  328. case U_RIGHT_TO_LEFT_OVERRIDE:
  329. return BidiClass::RightToLeftOverride;
  330. case U_SEGMENT_SEPARATOR:
  331. return BidiClass::SegmentSeparator;
  332. case U_WHITE_SPACE_NEUTRAL:
  333. return BidiClass::WhiteSpaceNeutral;
  334. case U_CHAR_DIRECTION_COUNT:
  335. break;
  336. }
  337. VERIFY_NOT_REACHED();
  338. }
  339. BidiClass bidirectional_class(u32 code_point)
  340. {
  341. auto icu_code_point = static_cast<UChar32>(code_point);
  342. auto direction = u_charDirection(icu_code_point);
  343. return char_direction_to_bidi_class(direction);
  344. }
  345. }