CharacterTypes.cpp 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Platform.h>
  7. #include <AK/StringBuilder.h>
  8. #include <AK/Types.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #if ENABLE_UNICODE_DATA
  12. # include <LibUnicode/UnicodeData.h>
  13. #else
  14. # include <AK/CharacterTypes.h>
  15. #endif
  16. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  17. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  18. namespace Unicode {
  19. #if ENABLE_UNICODE_DATA
  20. static bool has_property(UnicodeData const& unicode_data, Property property)
  21. {
  22. return (unicode_data.properties & property) == property;
  23. }
  24. static bool is_cased_letter(UnicodeData const& unicode_data)
  25. {
  26. // A character C is defined to be cased if and only if C has the Lowercase or Uppercase property
  27. // or has a General_Category value of Titlecase_Letter.
  28. switch (unicode_data.general_category) {
  29. case GeneralCategory::Ll:
  30. case GeneralCategory::Lu:
  31. case GeneralCategory::Lt:
  32. return true;
  33. default:
  34. break;
  35. }
  36. return has_property(unicode_data, Property::OtherLowercase) || has_property(unicode_data, Property::OtherUppercase);
  37. }
  38. static bool is_case_ignorable(UnicodeData const& unicode_data)
  39. {
  40. // A character C is defined to be case-ignorable if C has the value MidLetter (ML),
  41. // MidNumLet (MB), or Single_Quote (SQ) for the Word_Break property or its General_Category is
  42. // one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or
  43. // Modifier_Symbol (Sk).
  44. switch (unicode_data.general_category) {
  45. case GeneralCategory::Mn:
  46. case GeneralCategory::Me:
  47. case GeneralCategory::Cf:
  48. case GeneralCategory::Lm:
  49. case GeneralCategory::Sk:
  50. return true;
  51. default:
  52. break;
  53. }
  54. switch (unicode_data.word_break_property) {
  55. case WordBreakProperty::MidLetter:
  56. case WordBreakProperty::MidNumLet:
  57. case WordBreakProperty::SingleQuote:
  58. return true;
  59. default:
  60. break;
  61. }
  62. return false;
  63. }
  64. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  65. {
  66. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  67. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  68. // characters and then a cased letter.
  69. auto preceding_view = string.substring_view(0, index);
  70. auto following_view = ((index + byte_length) < string.byte_length())
  71. ? string.substring_view(index + byte_length)
  72. : Utf8View {};
  73. size_t cased_letter_count = 0;
  74. for (auto code_point : preceding_view) {
  75. auto unicode_data = unicode_data_for_code_point(code_point);
  76. if (!unicode_data.has_value())
  77. return false;
  78. if (is_cased_letter(*unicode_data) && !is_case_ignorable(*unicode_data))
  79. ++cased_letter_count;
  80. else if (!is_case_ignorable(*unicode_data))
  81. cased_letter_count = 0;
  82. }
  83. if (cased_letter_count == 0)
  84. return false;
  85. for (auto code_point : following_view) {
  86. auto unicode_data = unicode_data_for_code_point(code_point);
  87. if (!unicode_data.has_value())
  88. return false;
  89. if (is_case_ignorable(*unicode_data))
  90. continue;
  91. if (is_cased_letter(*unicode_data))
  92. return false;
  93. break;
  94. }
  95. return true;
  96. }
  97. static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
  98. {
  99. for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
  100. auto const* special_casing = unicode_data.special_casing[i];
  101. if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
  102. return special_casing;
  103. // FIXME: Handle locale.
  104. if (special_casing->locale != Locale::None)
  105. continue;
  106. switch (special_casing->condition) {
  107. case Condition::FinalSigma:
  108. if (is_final_code_point(string, index, byte_length))
  109. return special_casing;
  110. break;
  111. default:
  112. break;
  113. }
  114. }
  115. return nullptr;
  116. }
  117. #endif
  118. u32 to_unicode_lowercase(u32 code_point)
  119. {
  120. #if ENABLE_UNICODE_DATA
  121. auto unicode_data = unicode_data_for_code_point(code_point);
  122. if (unicode_data.has_value())
  123. return unicode_data->simple_lowercase_mapping;
  124. return code_point;
  125. #else
  126. return AK::to_ascii_lowercase(code_point);
  127. #endif
  128. }
  129. u32 to_unicode_uppercase(u32 code_point)
  130. {
  131. #if ENABLE_UNICODE_DATA
  132. auto unicode_data = unicode_data_for_code_point(code_point);
  133. if (unicode_data.has_value())
  134. return unicode_data->simple_uppercase_mapping;
  135. return code_point;
  136. #else
  137. return AK::to_ascii_uppercase(code_point);
  138. #endif
  139. }
  140. String to_unicode_lowercase_full(StringView const& string)
  141. {
  142. #if ENABLE_UNICODE_DATA
  143. Utf8View view { string };
  144. StringBuilder builder;
  145. size_t index = 0;
  146. for (auto it = view.begin(); it != view.end(); ++it) {
  147. u32 code_point = *it;
  148. size_t byte_length = it.underlying_code_point_length_in_bytes();
  149. auto unicode_data = unicode_data_for_code_point(code_point);
  150. if (!unicode_data.has_value()) {
  151. builder.append_code_point(code_point);
  152. index += byte_length;
  153. continue;
  154. }
  155. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  156. if (!special_casing) {
  157. builder.append_code_point(unicode_data->simple_lowercase_mapping);
  158. index += byte_length;
  159. continue;
  160. }
  161. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  162. builder.append_code_point(special_casing->lowercase_mapping[i]);
  163. }
  164. return builder.build();
  165. #else
  166. return string.to_lowercase_string();
  167. #endif
  168. }
  169. String to_unicode_uppercase_full(StringView const& string)
  170. {
  171. #if ENABLE_UNICODE_DATA
  172. Utf8View view { string };
  173. StringBuilder builder;
  174. size_t index = 0;
  175. for (auto it = view.begin(); it != view.end(); ++it) {
  176. u32 code_point = *it;
  177. size_t byte_length = it.underlying_code_point_length_in_bytes();
  178. auto unicode_data = unicode_data_for_code_point(code_point);
  179. if (!unicode_data.has_value()) {
  180. builder.append_code_point(code_point);
  181. index += byte_length;
  182. continue;
  183. }
  184. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  185. if (!special_casing) {
  186. builder.append_code_point(unicode_data->simple_uppercase_mapping);
  187. index += byte_length;
  188. continue;
  189. }
  190. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  191. builder.append_code_point(special_casing->uppercase_mapping[i]);
  192. }
  193. return builder.build();
  194. #else
  195. return string.to_uppercase_string();
  196. #endif
  197. }
  198. }