CharacterTypes.cpp 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Platform.h>
  7. #include <AK/StringBuilder.h>
  8. #include <AK/Types.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #if ENABLE_UNICODE_DATA
  12. # include <LibUnicode/UnicodeData.h>
  13. #else
  14. # include <AK/CharacterTypes.h>
  15. #endif
  16. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  17. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  18. namespace Unicode {
  19. #if ENABLE_UNICODE_DATA
  20. static bool has_property(UnicodeData const& unicode_data, Property property)
  21. {
  22. for (u32 i = 0; i < unicode_data.prop_list_size; ++i) {
  23. if (unicode_data.prop_list[i] == property)
  24. return true;
  25. }
  26. return false;
  27. }
  28. static bool is_cased_letter(UnicodeData const& unicode_data)
  29. {
  30. // A character C is defined to be cased if and only if C has the Lowercase or Uppercase property
  31. // or has a General_Category value of Titlecase_Letter.
  32. switch (unicode_data.general_category) {
  33. case GeneralCategory::Ll:
  34. case GeneralCategory::Lu:
  35. case GeneralCategory::Lt:
  36. return true;
  37. default:
  38. break;
  39. }
  40. return has_property(unicode_data, Property::OtherLowercase) || has_property(unicode_data, Property::OtherUppercase);
  41. }
  42. static bool is_case_ignorable(UnicodeData const& unicode_data)
  43. {
  44. // A character C is defined to be case-ignorable if C has the value MidLetter (ML),
  45. // MidNumLet (MB), or Single_Quote (SQ) for the Word_Break property or its General_Category is
  46. // one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or
  47. // Modifier_Symbol (Sk).
  48. switch (unicode_data.general_category) {
  49. case GeneralCategory::Mn:
  50. case GeneralCategory::Me:
  51. case GeneralCategory::Cf:
  52. case GeneralCategory::Lm:
  53. case GeneralCategory::Sk:
  54. return true;
  55. default:
  56. break;
  57. }
  58. switch (unicode_data.word_break_property) {
  59. case WordBreakProperty::MidLetter:
  60. case WordBreakProperty::MidNumLet:
  61. case WordBreakProperty::SingleQuote:
  62. return true;
  63. default:
  64. break;
  65. }
  66. return false;
  67. }
  68. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  69. {
  70. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  71. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  72. // characters and then a cased letter.
  73. auto preceding_view = string.substring_view(0, index);
  74. auto following_view = ((index + byte_length) < string.byte_length())
  75. ? string.substring_view(index + byte_length)
  76. : Utf8View {};
  77. size_t cased_letter_count = 0;
  78. for (auto code_point : preceding_view) {
  79. auto unicode_data = unicode_data_for_code_point(code_point);
  80. if (!unicode_data.has_value())
  81. return false;
  82. if (is_cased_letter(*unicode_data) && !is_case_ignorable(*unicode_data))
  83. ++cased_letter_count;
  84. else if (!is_case_ignorable(*unicode_data))
  85. cased_letter_count = 0;
  86. }
  87. if (cased_letter_count == 0)
  88. return false;
  89. for (auto code_point : following_view) {
  90. auto unicode_data = unicode_data_for_code_point(code_point);
  91. if (!unicode_data.has_value())
  92. return false;
  93. if (is_case_ignorable(*unicode_data))
  94. continue;
  95. if (is_cased_letter(*unicode_data))
  96. return false;
  97. break;
  98. }
  99. return true;
  100. }
  101. static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
  102. {
  103. for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
  104. auto const* special_casing = unicode_data.special_casing[i];
  105. if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
  106. return special_casing;
  107. // FIXME: Handle locale.
  108. if (special_casing->locale != Locale::None)
  109. continue;
  110. switch (special_casing->condition) {
  111. case Condition::FinalSigma:
  112. if (is_final_code_point(string, index, byte_length))
  113. return special_casing;
  114. break;
  115. default:
  116. break;
  117. }
  118. }
  119. return nullptr;
  120. }
  121. #endif
  122. u32 to_unicode_lowercase(u32 code_point)
  123. {
  124. #if ENABLE_UNICODE_DATA
  125. auto unicode_data = unicode_data_for_code_point(code_point);
  126. if (unicode_data.has_value())
  127. return unicode_data->simple_lowercase_mapping;
  128. return code_point;
  129. #else
  130. return AK::to_ascii_lowercase(code_point);
  131. #endif
  132. }
  133. u32 to_unicode_uppercase(u32 code_point)
  134. {
  135. #if ENABLE_UNICODE_DATA
  136. auto unicode_data = unicode_data_for_code_point(code_point);
  137. if (unicode_data.has_value())
  138. return unicode_data->simple_uppercase_mapping;
  139. return code_point;
  140. #else
  141. return AK::to_ascii_uppercase(code_point);
  142. #endif
  143. }
  144. String to_unicode_lowercase_full(StringView const& string)
  145. {
  146. #if ENABLE_UNICODE_DATA
  147. Utf8View view { string };
  148. StringBuilder builder;
  149. size_t index = 0;
  150. for (auto it = view.begin(); it != view.end(); ++it) {
  151. u32 code_point = *it;
  152. size_t byte_length = it.underlying_code_point_length_in_bytes();
  153. auto unicode_data = unicode_data_for_code_point(code_point);
  154. if (!unicode_data.has_value()) {
  155. builder.append_code_point(code_point);
  156. index += byte_length;
  157. continue;
  158. }
  159. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  160. if (!special_casing) {
  161. builder.append_code_point(unicode_data->simple_lowercase_mapping);
  162. index += byte_length;
  163. continue;
  164. }
  165. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  166. builder.append_code_point(special_casing->lowercase_mapping[i]);
  167. }
  168. return builder.build();
  169. #else
  170. return string.to_lowercase_string();
  171. #endif
  172. }
  173. String to_unicode_uppercase_full(StringView const& string)
  174. {
  175. #if ENABLE_UNICODE_DATA
  176. Utf8View view { string };
  177. StringBuilder builder;
  178. size_t index = 0;
  179. for (auto it = view.begin(); it != view.end(); ++it) {
  180. u32 code_point = *it;
  181. size_t byte_length = it.underlying_code_point_length_in_bytes();
  182. auto unicode_data = unicode_data_for_code_point(code_point);
  183. if (!unicode_data.has_value()) {
  184. builder.append_code_point(code_point);
  185. index += byte_length;
  186. continue;
  187. }
  188. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  189. if (!special_casing) {
  190. builder.append_code_point(unicode_data->simple_uppercase_mapping);
  191. index += byte_length;
  192. continue;
  193. }
  194. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  195. builder.append_code_point(special_casing->uppercase_mapping[i]);
  196. }
  197. return builder.build();
  198. #else
  199. return string.to_uppercase_string();
  200. #endif
  201. }
  202. }