CharacterTypes.cpp 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Platform.h>
  7. #include <AK/StringBuilder.h>
  8. #include <AK/Types.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #if ENABLE_UNICODE_DATA
  12. # include <LibUnicode/UnicodeData.h>
  13. #else
  14. # include <AK/CharacterTypes.h>
  15. #endif
  16. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  17. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  18. namespace Unicode {
  19. #if ENABLE_UNICODE_DATA
  20. static bool is_cased_letter(UnicodeData const& unicode_data)
  21. {
  22. // A character C is defined to be cased if and only if C has the Lowercase or Uppercase property
  23. // or has a General_Category value of Titlecase_Letter.
  24. switch (unicode_data.general_category) {
  25. case GeneralCategory::Ll: // FIXME: Should be Ll + Other_Lowercase (PropList.txt).
  26. case GeneralCategory::Lu: // FIXME: Should be Lu + Other_Uppercase (PropList.txt).
  27. case GeneralCategory::Lt:
  28. return true;
  29. default:
  30. return false;
  31. }
  32. }
  33. static bool is_case_ignorable(UnicodeData const& unicode_data)
  34. {
  35. // A character C is defined to be case-ignorable if C has the value MidLetter (ML),
  36. // MidNumLet (MB), or Single_Quote (SQ) for the Word_Break property or its General_Category is
  37. // one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or
  38. // Modifier_Symbol (Sk).
  39. switch (unicode_data.general_category) {
  40. case GeneralCategory::Mn:
  41. case GeneralCategory::Me:
  42. case GeneralCategory::Cf:
  43. case GeneralCategory::Lm:
  44. case GeneralCategory::Sk:
  45. return true;
  46. default:
  47. // FIXME: Handle word break properties (auxiliary/WordBreakProperty.txt).
  48. return false;
  49. }
  50. }
  51. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  52. {
  53. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  54. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  55. // characters and then a cased letter.
  56. auto preceding_view = string.substring_view(0, index);
  57. auto following_view = ((index + byte_length) < string.byte_length())
  58. ? string.substring_view(index + byte_length)
  59. : Utf8View {};
  60. size_t cased_letter_count = 0;
  61. for (auto code_point : preceding_view) {
  62. auto unicode_data = unicode_data_for_code_point(code_point);
  63. if (!unicode_data.has_value())
  64. return false;
  65. if (is_cased_letter(*unicode_data))
  66. ++cased_letter_count;
  67. else if (!is_case_ignorable(*unicode_data))
  68. cased_letter_count = 0;
  69. }
  70. if (cased_letter_count == 0)
  71. return false;
  72. for (auto code_point : following_view) {
  73. auto unicode_data = unicode_data_for_code_point(code_point);
  74. if (!unicode_data.has_value())
  75. return false;
  76. if (is_case_ignorable(*unicode_data))
  77. continue;
  78. if (is_cased_letter(*unicode_data))
  79. return false;
  80. break;
  81. }
  82. return true;
  83. }
  84. static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
  85. {
  86. for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
  87. auto const* special_casing = unicode_data.special_casing[i];
  88. if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
  89. return special_casing;
  90. // FIXME: Handle locale.
  91. if (special_casing->locale != Locale::None)
  92. continue;
  93. switch (special_casing->condition) {
  94. case Condition::FinalSigma:
  95. if (is_final_code_point(string, index, byte_length))
  96. return special_casing;
  97. break;
  98. default:
  99. break;
  100. }
  101. }
  102. return nullptr;
  103. }
  104. #endif
  105. u32 to_unicode_lowercase(u32 code_point)
  106. {
  107. #if ENABLE_UNICODE_DATA
  108. auto unicode_data = unicode_data_for_code_point(code_point);
  109. if (unicode_data.has_value())
  110. return unicode_data->simple_lowercase_mapping;
  111. return code_point;
  112. #else
  113. return AK::to_ascii_lowercase(code_point);
  114. #endif
  115. }
  116. u32 to_unicode_uppercase(u32 code_point)
  117. {
  118. #if ENABLE_UNICODE_DATA
  119. auto unicode_data = unicode_data_for_code_point(code_point);
  120. if (unicode_data.has_value())
  121. return unicode_data->simple_uppercase_mapping;
  122. return code_point;
  123. #else
  124. return AK::to_ascii_uppercase(code_point);
  125. #endif
  126. }
  127. String to_unicode_lowercase_full(StringView const& string)
  128. {
  129. #if ENABLE_UNICODE_DATA
  130. Utf8View view { string };
  131. StringBuilder builder;
  132. size_t index = 0;
  133. for (auto it = view.begin(); it != view.end(); ++it) {
  134. u32 code_point = *it;
  135. size_t byte_length = it.underlying_code_point_length_in_bytes();
  136. auto unicode_data = unicode_data_for_code_point(code_point);
  137. if (!unicode_data.has_value()) {
  138. builder.append_code_point(code_point);
  139. index += byte_length;
  140. continue;
  141. }
  142. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  143. if (!special_casing) {
  144. builder.append_code_point(unicode_data->simple_lowercase_mapping);
  145. index += byte_length;
  146. continue;
  147. }
  148. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  149. builder.append_code_point(special_casing->lowercase_mapping[i]);
  150. }
  151. return builder.build();
  152. #else
  153. return string.to_lowercase_string();
  154. #endif
  155. }
  156. String to_unicode_uppercase_full(StringView const& string)
  157. {
  158. #if ENABLE_UNICODE_DATA
  159. Utf8View view { string };
  160. StringBuilder builder;
  161. size_t index = 0;
  162. for (auto it = view.begin(); it != view.end(); ++it) {
  163. u32 code_point = *it;
  164. size_t byte_length = it.underlying_code_point_length_in_bytes();
  165. auto unicode_data = unicode_data_for_code_point(code_point);
  166. if (!unicode_data.has_value()) {
  167. builder.append_code_point(code_point);
  168. index += byte_length;
  169. continue;
  170. }
  171. auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
  172. if (!special_casing) {
  173. builder.append_code_point(unicode_data->simple_uppercase_mapping);
  174. index += byte_length;
  175. continue;
  176. }
  177. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  178. builder.append_code_point(special_casing->uppercase_mapping[i]);
  179. }
  180. return builder.build();
  181. #else
  182. return string.to_uppercase_string();
  183. #endif
  184. }
  185. }