CharacterTypes.cpp 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. /*
  2. * Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/ScopeGuard.h>
  9. #include <AK/StringBuilder.h>
  10. #include <AK/Types.h>
  11. #include <AK/Utf16View.h>
  12. #include <AK/Utf32View.h>
  13. #include <AK/Utf8View.h>
  14. #include <LibUnicode/CharacterTypes.h>
  15. #include <LibUnicode/UnicodeUtils.h>
  16. #if ENABLE_UNICODE_DATA
  17. # include <LibUnicode/UnicodeData.h>
  18. #endif
  19. namespace Unicode {
  20. u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
  21. u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point)
  22. {
  23. return to_ascii_lowercase(code_point);
  24. }
  25. u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point)
  26. {
  27. return to_ascii_uppercase(code_point);
  28. }
  29. u32 __attribute__((weak)) to_unicode_titlecase(u32 code_point)
  30. {
  31. return to_ascii_uppercase(code_point);
  32. }
  33. template<typename ViewType>
  34. class CasefoldStringComparator {
  35. public:
  36. explicit CasefoldStringComparator(ViewType string)
  37. : m_string(string)
  38. , m_it(m_string.begin())
  39. {
  40. }
  41. bool has_more_data() const
  42. {
  43. return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
  44. }
  45. size_t index() const
  46. {
  47. if constexpr (IsSame<ViewType, Utf8View>)
  48. return m_string.byte_offset_of(m_it);
  49. else if constexpr (IsSame<ViewType, Utf16View>)
  50. return m_string.code_unit_offset_of(m_it);
  51. else if constexpr (IsSame<ViewType, Utf32View>)
  52. return m_string.iterator_offset(m_it);
  53. else
  54. static_assert(DependentFalse<ViewType>);
  55. }
  56. u32 next_code_point()
  57. {
  58. VERIFY(has_more_data());
  59. if (m_casefolded_code_points.is_empty()) {
  60. m_current_code_point = *m_it;
  61. ++m_it;
  62. m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
  63. VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
  64. }
  65. auto code_point = m_casefolded_code_points[0];
  66. m_casefolded_code_points = m_casefolded_code_points.substring_view(1);
  67. return code_point;
  68. }
  69. private:
  70. ViewType m_string;
  71. typename ViewType::Iterator m_it;
  72. u32 m_current_code_point { 0 };
  73. Utf32View m_casefolded_code_points;
  74. };
  75. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
  76. template<typename ViewType>
  77. bool equals_ignoring_case(ViewType lhs, ViewType rhs)
  78. {
  79. // A string X is a caseless match for a string Y if and only if:
  80. // toCasefold(X) = toCasefold(Y)
  81. CasefoldStringComparator lhs_comparator { lhs };
  82. CasefoldStringComparator rhs_comparator { rhs };
  83. while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
  84. if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point())
  85. return false;
  86. }
  87. return !lhs_comparator.has_more_data() && !rhs_comparator.has_more_data();
  88. }
  89. template bool equals_ignoring_case(Utf8View, Utf8View);
  90. template bool equals_ignoring_case(Utf16View, Utf16View);
  91. template bool equals_ignoring_case(Utf32View, Utf32View);
  92. template<typename ViewType>
  93. Optional<size_t> find_ignoring_case(ViewType lhs, ViewType rhs)
  94. {
  95. CasefoldStringComparator lhs_comparator { lhs };
  96. while (lhs_comparator.has_more_data()) {
  97. CasefoldStringComparator rhs_comparator { rhs };
  98. auto saved_state = lhs_comparator;
  99. auto matches = true;
  100. while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
  101. if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point()) {
  102. matches = false;
  103. break;
  104. }
  105. }
  106. if (matches && !rhs_comparator.has_more_data())
  107. return saved_state.index();
  108. lhs_comparator = move(saved_state);
  109. lhs_comparator.next_code_point();
  110. }
  111. return {};
  112. }
  113. template Optional<size_t> find_ignoring_case(Utf8View, Utf8View);
  114. template Optional<size_t> find_ignoring_case(Utf16View, Utf16View);
  115. template Optional<size_t> find_ignoring_case(Utf32View, Utf32View);
  116. Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
  117. bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
  118. Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
  119. bool __attribute__((weak)) code_point_has_property(u32, Property) { return {}; }
  120. bool is_ecma262_property([[maybe_unused]] Property property)
  121. {
  122. #if ENABLE_UNICODE_DATA
  123. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  124. switch (property) {
  125. case Unicode::Property::ASCII:
  126. case Unicode::Property::ASCII_Hex_Digit:
  127. case Unicode::Property::Alphabetic:
  128. case Unicode::Property::Any:
  129. case Unicode::Property::Assigned:
  130. case Unicode::Property::Bidi_Control:
  131. case Unicode::Property::Bidi_Mirrored:
  132. case Unicode::Property::Case_Ignorable:
  133. case Unicode::Property::Cased:
  134. case Unicode::Property::Changes_When_Casefolded:
  135. case Unicode::Property::Changes_When_Casemapped:
  136. case Unicode::Property::Changes_When_Lowercased:
  137. case Unicode::Property::Changes_When_NFKC_Casefolded:
  138. case Unicode::Property::Changes_When_Titlecased:
  139. case Unicode::Property::Changes_When_Uppercased:
  140. case Unicode::Property::Dash:
  141. case Unicode::Property::Default_Ignorable_Code_Point:
  142. case Unicode::Property::Deprecated:
  143. case Unicode::Property::Diacritic:
  144. case Unicode::Property::Emoji:
  145. case Unicode::Property::Emoji_Component:
  146. case Unicode::Property::Emoji_Modifier:
  147. case Unicode::Property::Emoji_Modifier_Base:
  148. case Unicode::Property::Emoji_Presentation:
  149. case Unicode::Property::Extended_Pictographic:
  150. case Unicode::Property::Extender:
  151. case Unicode::Property::Grapheme_Base:
  152. case Unicode::Property::Grapheme_Extend:
  153. case Unicode::Property::Hex_Digit:
  154. case Unicode::Property::IDS_Binary_Operator:
  155. case Unicode::Property::IDS_Trinary_Operator:
  156. case Unicode::Property::ID_Continue:
  157. case Unicode::Property::ID_Start:
  158. case Unicode::Property::Ideographic:
  159. case Unicode::Property::Join_Control:
  160. case Unicode::Property::Logical_Order_Exception:
  161. case Unicode::Property::Lowercase:
  162. case Unicode::Property::Math:
  163. case Unicode::Property::Noncharacter_Code_Point:
  164. case Unicode::Property::Pattern_Syntax:
  165. case Unicode::Property::Pattern_White_Space:
  166. case Unicode::Property::Quotation_Mark:
  167. case Unicode::Property::Radical:
  168. case Unicode::Property::Regional_Indicator:
  169. case Unicode::Property::Sentence_Terminal:
  170. case Unicode::Property::Soft_Dotted:
  171. case Unicode::Property::Terminal_Punctuation:
  172. case Unicode::Property::Unified_Ideograph:
  173. case Unicode::Property::Uppercase:
  174. case Unicode::Property::Variation_Selector:
  175. case Unicode::Property::White_Space:
  176. case Unicode::Property::XID_Continue:
  177. case Unicode::Property::XID_Start:
  178. return true;
  179. default:
  180. return false;
  181. }
  182. #else
  183. return false;
  184. #endif
  185. }
  186. Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {}; }
  187. bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
  188. bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
  189. bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; }
  190. bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
  191. bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
  192. Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; }
  193. Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class(u32) { return {}; }
  194. }