CharacterTypes.cpp 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Platform.h>
  7. #include <AK/StringBuilder.h>
  8. #include <AK/Types.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #if ENABLE_UNICODE_DATA
  12. # include <LibUnicode/UnicodeData.h>
  13. #else
  14. # include <AK/CharacterTypes.h>
  15. #endif
  16. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  17. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  18. namespace Unicode {
  19. #if ENABLE_UNICODE_DATA
  20. static bool has_property(UnicodeData const& unicode_data, Property property)
  21. {
  22. for (u32 i = 0; i < unicode_data.prop_list_size; ++i) {
  23. if (unicode_data.prop_list[i] == property)
  24. return true;
  25. }
  26. return false;
  27. }
  28. static bool is_cased_letter(UnicodeData const& unicode_data)
  29. {
  30. // A character C is defined to be cased if and only if C has the Lowercase or Uppercase property
  31. // or has a General_Category value of Titlecase_Letter.
  32. switch (unicode_data.general_category) {
  33. case GeneralCategory::Ll:
  34. case GeneralCategory::Lu:
  35. case GeneralCategory::Lt:
  36. return true;
  37. default:
  38. break;
  39. }
  40. return has_property(unicode_data, Property::OtherLowercase) || has_property(unicode_data, Property::OtherUppercase);
  41. }
  42. static bool is_case_ignorable(UnicodeData const& unicode_data)
  43. {
  44. // A character C is defined to be case-ignorable if C has the value MidLetter (ML),
  45. // MidNumLet (MB), or Single_Quote (SQ) for the Word_Break property or its General_Category is
  46. // one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or
  47. // Modifier_Symbol (Sk).
  48. switch (unicode_data.general_category) {
  49. case GeneralCategory::Mn:
  50. case GeneralCategory::Me:
  51. case GeneralCategory::Cf:
  52. case GeneralCategory::Lm:
  53. case GeneralCategory::Sk:
  54. return true;
  55. default:
  56. // FIXME: Handle word break properties (auxiliary/WordBreakProperty.txt).
  57. return false;
  58. }
  59. }
  60. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  61. {
  62. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  63. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  64. // characters and then a cased letter.
  65. auto preceding_view = string.substring_view(0, index);
  66. auto following_view = ((index + byte_length) < string.byte_length())
  67. ? string.substring_view(index + byte_length)
  68. : Utf8View {};
  69. size_t cased_letter_count = 0;
  70. for (auto code_point : preceding_view) {
  71. auto unicode_data = unicode_data_for_code_point(code_point);
  72. if (!unicode_data.has_value())
  73. return false;
  74. if (is_cased_letter(*unicode_data))
  75. ++cased_letter_count;
  76. else if (!is_case_ignorable(*unicode_data))
  77. cased_letter_count = 0;
  78. }
  79. if (cased_letter_count == 0)
  80. return false;
  81. for (auto code_point : following_view) {
  82. auto unicode_data = unicode_data_for_code_point(code_point);
  83. if (!unicode_data.has_value())
  84. return false;
  85. if (is_case_ignorable(*unicode_data))
  86. continue;
  87. if (is_cased_letter(*unicode_data))
  88. return false;
  89. break;
  90. }
  91. return true;
  92. }
  93. static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
  94. {
  95. for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
  96. auto const* special_casing = unicode_data.special_casing[i];
  97. if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
  98. return special_casing;
  99. // FIXME: Handle locale.
  100. if (special_casing->locale != Locale::None)
  101. continue;
  102. switch (special_casing->condition) {
  103. case Condition::FinalSigma:
  104. if (is_final_code_point(string, index, byte_length))
  105. return special_casing;
  106. break;
  107. default:
  108. break;
  109. }
  110. }
  111. return nullptr;
  112. }
  113. #endif
  114. u32 to_unicode_lowercase(u32 code_point)
  115. {
  116. #if ENABLE_UNICODE_DATA
  117. auto unicode_data = unicode_data_for_code_point(code_point);
  118. if (unicode_data.has_value())
  119. return unicode_data->simple_lowercase_mapping;
  120. return code_point;
  121. #else
  122. return AK::to_ascii_lowercase(code_point);
  123. #endif
  124. }
  125. u32 to_unicode_uppercase(u32 code_point)
  126. {
  127. #if ENABLE_UNICODE_DATA
  128. auto unicode_data = unicode_data_for_code_point(code_point);
  129. if (unicode_data.has_value())
  130. return unicode_data->simple_uppercase_mapping;
  131. return code_point;
  132. #else
  133. return AK::to_ascii_uppercase(code_point);
  134. #endif
  135. }
  136. String to_unicode_lowercase_full(StringView const& string)
  137. {
  138. #if ENABLE_UNICODE_DATA
  139. Utf8View view { string };
  140. StringBuilder builder;
  141. size_t index = 0;
  142. for (auto it = view.begin(); it != view.end(); ++it) {
  143. u32 code_point = *it;
  144. size_t byte_length = it.underlying_code_point_length_in_bytes();
  145. auto unicode_data = unicode_data_for_code_point(code_point);
  146. if (!unicode_data.has_value()) {
  147. builder.append_code_point(code_point);
  148. index += byte_length;
  149. continue;
  150. }
  151. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  152. if (!special_casing) {
  153. builder.append_code_point(unicode_data->simple_lowercase_mapping);
  154. index += byte_length;
  155. continue;
  156. }
  157. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  158. builder.append_code_point(special_casing->lowercase_mapping[i]);
  159. }
  160. return builder.build();
  161. #else
  162. return string.to_lowercase_string();
  163. #endif
  164. }
  165. String to_unicode_uppercase_full(StringView const& string)
  166. {
  167. #if ENABLE_UNICODE_DATA
  168. Utf8View view { string };
  169. StringBuilder builder;
  170. size_t index = 0;
  171. for (auto it = view.begin(); it != view.end(); ++it) {
  172. u32 code_point = *it;
  173. size_t byte_length = it.underlying_code_point_length_in_bytes();
  174. auto unicode_data = unicode_data_for_code_point(code_point);
  175. if (!unicode_data.has_value()) {
  176. builder.append_code_point(code_point);
  177. index += byte_length;
  178. continue;
  179. }
  180. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  181. if (!special_casing) {
  182. builder.append_code_point(unicode_data->simple_uppercase_mapping);
  183. index += byte_length;
  184. continue;
  185. }
  186. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  187. builder.append_code_point(special_casing->uppercase_mapping[i]);
  188. }
  189. return builder.build();
  190. #else
  191. return string.to_uppercase_string();
  192. #endif
  193. }
  194. }