CharacterTypes.cpp 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. /*
  2. * Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf16View.h>
  11. #include <AK/Utf32View.h>
  12. #include <AK/Utf8View.h>
  13. #include <LibUnicode/CharacterTypes.h>
  14. #include <LibUnicode/UnicodeUtils.h>
  15. #if ENABLE_UNICODE_DATA
  16. # include <LibUnicode/UnicodeData.h>
  17. #endif
  18. namespace Unicode {
  19. Optional<DeprecatedString> __attribute__((weak)) code_point_display_name(u32) { return {}; }
  20. Optional<StringView> __attribute__((weak)) code_point_block_display_name(u32) { return {}; }
  21. Optional<StringView> __attribute__((weak)) code_point_abbreviation(u32) { return {}; }
  22. u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
  23. ReadonlySpan<BlockName> __attribute__((weak)) block_display_names() { return {}; }
  24. u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point)
  25. {
  26. return to_ascii_lowercase(code_point);
  27. }
  28. u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point)
  29. {
  30. return to_ascii_uppercase(code_point);
  31. }
  32. u32 __attribute__((weak)) to_unicode_titlecase(u32 code_point)
  33. {
  34. return to_ascii_uppercase(code_point);
  35. }
  36. ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView string, Optional<StringView> const& locale)
  37. {
  38. StringBuilder builder;
  39. TRY(Detail::build_lowercase_string(Utf8View { string }, builder, locale));
  40. return builder.to_deprecated_string();
  41. }
  42. ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView string, Optional<StringView> const& locale)
  43. {
  44. StringBuilder builder;
  45. TRY(Detail::build_uppercase_string(Utf8View { string }, builder, locale));
  46. return builder.to_deprecated_string();
  47. }
  48. ErrorOr<String> to_unicode_titlecase_full(StringView string, Optional<StringView> const& locale, TrailingCodePointTransformation trailing_code_point_transformation)
  49. {
  50. StringBuilder builder;
  51. TRY(Detail::build_titlecase_string(Utf8View { string }, builder, locale, trailing_code_point_transformation));
  52. return builder.to_string();
  53. }
  54. ErrorOr<String> to_unicode_casefold_full(StringView string)
  55. {
  56. StringBuilder builder;
  57. TRY(Detail::build_casefold_string(Utf8View { string }, builder));
  58. return builder.to_string();
  59. }
  60. template<typename ViewType>
  61. class CasefoldStringComparator {
  62. public:
  63. explicit CasefoldStringComparator(ViewType string)
  64. : m_string(string)
  65. , m_it(m_string.begin())
  66. {
  67. }
  68. bool has_more_data() const
  69. {
  70. return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
  71. }
  72. u32 next_code_point()
  73. {
  74. VERIFY(has_more_data());
  75. if (m_casefolded_code_points.is_empty()) {
  76. m_current_code_point = *m_it;
  77. ++m_it;
  78. m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
  79. VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
  80. }
  81. auto code_point = m_casefolded_code_points[0];
  82. m_casefolded_code_points = m_casefolded_code_points.substring_view(1);
  83. return code_point;
  84. }
  85. private:
  86. ViewType m_string;
  87. typename ViewType::Iterator m_it;
  88. u32 m_current_code_point { 0 };
  89. Utf32View m_casefolded_code_points;
  90. };
  91. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
  92. template<typename ViewType>
  93. bool equals_ignoring_case(ViewType lhs, ViewType rhs)
  94. {
  95. // A string X is a caseless match for a string Y if and only if:
  96. // toCasefold(X) = toCasefold(Y)
  97. CasefoldStringComparator lhs_comparator { lhs };
  98. CasefoldStringComparator rhs_comparator { rhs };
  99. while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
  100. if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point())
  101. return false;
  102. }
  103. return !lhs_comparator.has_more_data() && !rhs_comparator.has_more_data();
  104. }
  105. template bool equals_ignoring_case(Utf8View, Utf8View);
  106. template bool equals_ignoring_case(Utf16View, Utf16View);
  107. template bool equals_ignoring_case(Utf32View, Utf32View);
  108. Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
  109. bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
  110. Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
  111. bool __attribute__((weak)) code_point_has_property(u32, Property) { return {}; }
  112. bool is_ecma262_property([[maybe_unused]] Property property)
  113. {
  114. #if ENABLE_UNICODE_DATA
  115. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  116. switch (property) {
  117. case Unicode::Property::ASCII:
  118. case Unicode::Property::ASCII_Hex_Digit:
  119. case Unicode::Property::Alphabetic:
  120. case Unicode::Property::Any:
  121. case Unicode::Property::Assigned:
  122. case Unicode::Property::Bidi_Control:
  123. case Unicode::Property::Bidi_Mirrored:
  124. case Unicode::Property::Case_Ignorable:
  125. case Unicode::Property::Cased:
  126. case Unicode::Property::Changes_When_Casefolded:
  127. case Unicode::Property::Changes_When_Casemapped:
  128. case Unicode::Property::Changes_When_Lowercased:
  129. case Unicode::Property::Changes_When_NFKC_Casefolded:
  130. case Unicode::Property::Changes_When_Titlecased:
  131. case Unicode::Property::Changes_When_Uppercased:
  132. case Unicode::Property::Dash:
  133. case Unicode::Property::Default_Ignorable_Code_Point:
  134. case Unicode::Property::Deprecated:
  135. case Unicode::Property::Diacritic:
  136. case Unicode::Property::Emoji:
  137. case Unicode::Property::Emoji_Component:
  138. case Unicode::Property::Emoji_Modifier:
  139. case Unicode::Property::Emoji_Modifier_Base:
  140. case Unicode::Property::Emoji_Presentation:
  141. case Unicode::Property::Extended_Pictographic:
  142. case Unicode::Property::Extender:
  143. case Unicode::Property::Grapheme_Base:
  144. case Unicode::Property::Grapheme_Extend:
  145. case Unicode::Property::Hex_Digit:
  146. case Unicode::Property::IDS_Binary_Operator:
  147. case Unicode::Property::IDS_Trinary_Operator:
  148. case Unicode::Property::ID_Continue:
  149. case Unicode::Property::ID_Start:
  150. case Unicode::Property::Ideographic:
  151. case Unicode::Property::Join_Control:
  152. case Unicode::Property::Logical_Order_Exception:
  153. case Unicode::Property::Lowercase:
  154. case Unicode::Property::Math:
  155. case Unicode::Property::Noncharacter_Code_Point:
  156. case Unicode::Property::Pattern_Syntax:
  157. case Unicode::Property::Pattern_White_Space:
  158. case Unicode::Property::Quotation_Mark:
  159. case Unicode::Property::Radical:
  160. case Unicode::Property::Regional_Indicator:
  161. case Unicode::Property::Sentence_Terminal:
  162. case Unicode::Property::Soft_Dotted:
  163. case Unicode::Property::Terminal_Punctuation:
  164. case Unicode::Property::Unified_Ideograph:
  165. case Unicode::Property::Uppercase:
  166. case Unicode::Property::Variation_Selector:
  167. case Unicode::Property::White_Space:
  168. case Unicode::Property::XID_Continue:
  169. case Unicode::Property::XID_Start:
  170. return true;
  171. default:
  172. return false;
  173. }
  174. #else
  175. return false;
  176. #endif
  177. }
  178. Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {}; }
  179. bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
  180. bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
  181. bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; }
  182. bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
  183. bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
  184. Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; }
  185. Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class(u32) { return {}; }
  186. }