CharacterTypes.cpp 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. /*
  2. * Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Array.h>
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Find.h>
  9. #include <AK/Traits.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #include <unicode/uchar.h>
  12. namespace Unicode {
  13. template<typename PropertyType>
  14. struct PropertyName {
  15. Optional<StringView> long_name;
  16. Optional<StringView> short_name;
  17. Optional<StringView> additional_name;
  18. };
  19. // From uchar.h:
  20. // Unicode allows for additional names, beyond the long and short name, which would be indicated by U_LONG_PROPERTY_NAME + i
  21. static constexpr auto ADDITIONAL_NAME = static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + 1);
  22. }
  23. template<typename PropertyType>
  24. struct AK::Traits<Unicode::PropertyName<PropertyType>> {
  25. static constexpr bool equals(Unicode::PropertyName<PropertyType> const& candidate, StringView property)
  26. {
  27. return property == candidate.long_name || property == candidate.short_name || property == candidate.additional_name;
  28. }
  29. };
  30. namespace Unicode {
  31. Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
  32. bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
  33. static constexpr Property PROPERTY_ANY = UCHAR_BINARY_LIMIT + 1;
  34. static constexpr Property PROPERTY_ASCII = UCHAR_BINARY_LIMIT + 2;
  35. static constexpr Property PROPERTY_ASSIGNED = UCHAR_BINARY_LIMIT + 3;
  36. static constexpr Property PROPERTY_LIMIT = UCHAR_BINARY_LIMIT + 4;
  37. Optional<Property> property_from_string(StringView property)
  38. {
  39. static auto property_names = []() {
  40. Array<PropertyName<Property>, PROPERTY_LIMIT.value()> names;
  41. for (Property property = 0; property < UCHAR_BINARY_LIMIT; ++property) {
  42. auto icu_property = static_cast<UProperty>(property.value());
  43. if (char const* name = u_getPropertyName(icu_property, U_LONG_PROPERTY_NAME))
  44. names[property.value()].long_name = StringView { name, strlen(name) };
  45. if (char const* name = u_getPropertyName(icu_property, U_SHORT_PROPERTY_NAME))
  46. names[property.value()].short_name = StringView { name, strlen(name) };
  47. if (char const* name = u_getPropertyName(icu_property, ADDITIONAL_NAME))
  48. names[property.value()].additional_name = StringView { name, strlen(name) };
  49. }
  50. names[PROPERTY_ANY.value()] = { "Any"sv, {}, {} };
  51. names[PROPERTY_ASCII.value()] = { "ASCII"sv, {}, {} };
  52. names[PROPERTY_ASSIGNED.value()] = { "Assigned"sv, {}, {} };
  53. return names;
  54. }();
  55. if (auto index = find_index(property_names.begin(), property_names.end(), property); index != property_names.size())
  56. return static_cast<Property>(index);
  57. return {};
  58. }
  59. bool code_point_has_property(u32 code_point, Property property)
  60. {
  61. auto icu_code_point = static_cast<UChar32>(code_point);
  62. auto icu_property = static_cast<UProperty>(property.value());
  63. if (property == PROPERTY_ANY)
  64. return is_unicode(code_point);
  65. if (property == PROPERTY_ASCII)
  66. return is_ascii(code_point);
  67. if (property == PROPERTY_ASSIGNED)
  68. return u_isdefined(icu_code_point);
  69. return static_cast<bool>(u_hasBinaryProperty(icu_code_point, icu_property));
  70. }
  71. bool code_point_has_emoji_property(u32 code_point)
  72. {
  73. return code_point_has_property(code_point, UCHAR_EMOJI);
  74. }
  75. bool code_point_has_emoji_modifier_base_property(u32 code_point)
  76. {
  77. return code_point_has_property(code_point, UCHAR_EMOJI_MODIFIER_BASE);
  78. }
  79. bool code_point_has_emoji_presentation_property(u32 code_point)
  80. {
  81. return code_point_has_property(code_point, UCHAR_EMOJI_PRESENTATION);
  82. }
  83. bool code_point_has_identifier_start_property(u32 code_point)
  84. {
  85. return u_isIDStart(static_cast<UChar32>(code_point));
  86. }
  87. bool code_point_has_identifier_continue_property(u32 code_point)
  88. {
  89. return u_isIDPart(static_cast<UChar32>(code_point));
  90. }
  91. bool code_point_has_regional_indicator_property(u32 code_point)
  92. {
  93. return code_point_has_property(code_point, UCHAR_REGIONAL_INDICATOR);
  94. }
  95. bool code_point_has_variation_selector_property(u32 code_point)
  96. {
  97. return code_point_has_property(code_point, UCHAR_VARIATION_SELECTOR);
  98. }
  99. // https://tc39.es/ecma262/#table-binary-unicode-properties
  100. bool is_ecma262_property(Property property)
  101. {
  102. if (property == PROPERTY_ANY || property == PROPERTY_ASCII || property == PROPERTY_ASSIGNED)
  103. return true;
  104. switch (property.value()) {
  105. case UCHAR_ASCII_HEX_DIGIT:
  106. case UCHAR_ALPHABETIC:
  107. case UCHAR_BIDI_CONTROL:
  108. case UCHAR_BIDI_MIRRORED:
  109. case UCHAR_CASE_IGNORABLE:
  110. case UCHAR_CASED:
  111. case UCHAR_CHANGES_WHEN_CASEFOLDED:
  112. case UCHAR_CHANGES_WHEN_CASEMAPPED:
  113. case UCHAR_CHANGES_WHEN_LOWERCASED:
  114. case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
  115. case UCHAR_CHANGES_WHEN_TITLECASED:
  116. case UCHAR_CHANGES_WHEN_UPPERCASED:
  117. case UCHAR_DASH:
  118. case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
  119. case UCHAR_DEPRECATED:
  120. case UCHAR_DIACRITIC:
  121. case UCHAR_EMOJI:
  122. case UCHAR_EMOJI_COMPONENT:
  123. case UCHAR_EMOJI_MODIFIER:
  124. case UCHAR_EMOJI_MODIFIER_BASE:
  125. case UCHAR_EMOJI_PRESENTATION:
  126. case UCHAR_EXTENDED_PICTOGRAPHIC:
  127. case UCHAR_EXTENDER:
  128. case UCHAR_GRAPHEME_BASE:
  129. case UCHAR_GRAPHEME_EXTEND:
  130. case UCHAR_HEX_DIGIT:
  131. case UCHAR_IDS_BINARY_OPERATOR:
  132. case UCHAR_IDS_TRINARY_OPERATOR:
  133. case UCHAR_ID_CONTINUE:
  134. case UCHAR_ID_START:
  135. case UCHAR_IDEOGRAPHIC:
  136. case UCHAR_JOIN_CONTROL:
  137. case UCHAR_LOGICAL_ORDER_EXCEPTION:
  138. case UCHAR_LOWERCASE:
  139. case UCHAR_MATH:
  140. case UCHAR_NONCHARACTER_CODE_POINT:
  141. case UCHAR_PATTERN_SYNTAX:
  142. case UCHAR_PATTERN_WHITE_SPACE:
  143. case UCHAR_QUOTATION_MARK:
  144. case UCHAR_RADICAL:
  145. case UCHAR_REGIONAL_INDICATOR:
  146. case UCHAR_S_TERM:
  147. case UCHAR_SOFT_DOTTED:
  148. case UCHAR_TERMINAL_PUNCTUATION:
  149. case UCHAR_UNIFIED_IDEOGRAPH:
  150. case UCHAR_UPPERCASE:
  151. case UCHAR_VARIATION_SELECTOR:
  152. case UCHAR_WHITE_SPACE:
  153. case UCHAR_XID_CONTINUE:
  154. case UCHAR_XID_START:
  155. return true;
  156. default:
  157. return false;
  158. }
  159. }
  160. Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {}; }
  161. bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
  162. bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
  163. Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; }
  164. Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class(u32) { return {}; }
  165. }