Normalize.cpp 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. /*
  2. * Copyright (c) 2022, mat
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Find.h>
  7. #include <AK/QuickSort.h>
  8. #include <AK/Utf8View.h>
  9. #include <AK/Vector.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #include <LibUnicode/Normalize.h>
  12. #include <LibUnicode/UnicodeData.h>
  13. namespace Unicode {
  14. Optional<CodePointDecomposition const&> __attribute__((weak)) code_point_decomposition(u32) { return {}; }
  15. Span<CodePointDecomposition const> __attribute__((weak)) code_point_decompositions() { return {}; }
  16. NormalizationForm normalization_form_from_string(StringView form)
  17. {
  18. if (form == "NFD"sv)
  19. return NormalizationForm::NFD;
  20. if (form == "NFC"sv)
  21. return NormalizationForm::NFC;
  22. if (form == "NFKD"sv)
  23. return NormalizationForm::NFKD;
  24. if (form == "NFKC"sv)
  25. return NormalizationForm::NFKC;
  26. VERIFY_NOT_REACHED();
  27. }
  28. StringView normalization_form_to_string(NormalizationForm form)
  29. {
  30. switch (form) {
  31. case NormalizationForm::NFD:
  32. return "NFD"sv;
  33. case NormalizationForm::NFC:
  34. return "NFC"sv;
  35. case NormalizationForm::NFKD:
  36. return "NFKD"sv;
  37. case NormalizationForm::NFKC:
  38. return "NFKC"sv;
  39. }
  40. VERIFY_NOT_REACHED();
  41. }
  42. ALWAYS_INLINE static bool is_starter(u32 code_point)
  43. {
  44. return Unicode::canonical_combining_class(code_point) == 0;
  45. }
  46. // From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
  47. static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00;
  48. static constexpr u32 HANGUL_LEADING_BASE = 0x1100;
  49. static constexpr u32 HANGUL_VOWEL_BASE = 0x1161;
  50. static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7;
  51. static constexpr u32 HANGUL_LEADING_COUNT = 19;
  52. static constexpr u32 HANGUL_VOWEL_COUNT = 21;
  53. static constexpr u32 HANGUL_TRAILING_COUNT = 28;
  54. // NCount in the standard.
  55. static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT;
  56. static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT;
  57. ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point)
  58. {
  59. return code_point >= HANGUL_SYLLABLE_BASE && code_point < HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT;
  60. }
  61. ALWAYS_INLINE static bool is_hangul_leading(u32 code_point)
  62. {
  63. return code_point >= HANGUL_LEADING_BASE && code_point < HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT;
  64. }
  65. ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point)
  66. {
  67. return code_point >= HANGUL_VOWEL_BASE && code_point < HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT;
  68. }
  69. ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
  70. {
  71. return code_point >= HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT;
  72. }
  73. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
  74. static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
  75. {
  76. auto const index = code_point - HANGUL_SYLLABLE_BASE;
  77. auto const leading_index = index / HANGUL_BLOCK_COUNT;
  78. auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT;
  79. auto const trailing_index = index % HANGUL_TRAILING_COUNT;
  80. auto const leading_part = HANGUL_LEADING_BASE + leading_index;
  81. auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
  82. auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
  83. code_points_output.append(leading_part);
  84. code_points_output.append(vowel_part);
  85. if (trailing_index != 0)
  86. code_points_output.append(trailing_part);
  87. }
  88. // L, V and LV, T Hangul Syllable Composition
  89. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688
  90. static u32 combine_hangul_code_points(u32 a, u32 b)
  91. {
  92. if (is_hangul_leading(a) && is_hangul_vowel(b)) {
  93. auto const leading_index = a - HANGUL_LEADING_BASE;
  94. auto const vowel_index = b - HANGUL_VOWEL_BASE;
  95. auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT;
  96. return HANGUL_SYLLABLE_BASE + leading_vowel_index;
  97. }
  98. // LV characters are the first in each "T block", so use this check to avoid combining LVT with T.
  99. if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) {
  100. return a + b - HANGUL_TRAILING_BASE;
  101. }
  102. return 0;
  103. }
  104. static u32 combine_code_points(u32 a, u32 b)
  105. {
  106. Array<u32, 2> const points { a, b };
  107. // FIXME: Do something better than linear search to find reverse mappings.
  108. for (auto const& mapping : Unicode::code_point_decompositions()) {
  109. if (mapping.tag == CompatibilityFormattingTag::Canonical && mapping.decomposition == points) {
  110. if (code_point_has_property(mapping.code_point, Property::Full_Composition_Exclusion))
  111. continue;
  112. return mapping.code_point;
  113. }
  114. }
  115. return 0;
  116. }
  117. enum class UseCompatibility {
  118. Yes,
  119. No
  120. };
  121. static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, UseCompatibility use_compatibility)
  122. {
  123. if (is_hangul_code_point(code_point)) {
  124. decompose_hangul_code_point(code_point, code_points_output);
  125. return;
  126. }
  127. auto const mapping = Unicode::code_point_decomposition(code_point);
  128. if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) {
  129. for (auto code_point : mapping->decomposition) {
  130. decompose_code_point(code_point, code_points_output, use_compatibility);
  131. }
  132. } else {
  133. code_points_output.append(code_point);
  134. }
  135. }
  136. // This can be any sorting algorithm that maintains order (like std::stable_sort),
  137. // however bubble sort is easier to implement, so go with it (for now).
  138. template<typename T, typename LessThan>
  139. void bubble_sort(Span<T> span, LessThan less_than)
  140. {
  141. for (size_t i = 0; i < span.size() - 1; ++i) {
  142. for (size_t j = 0; j < span.size() - 1 - i; ++j) {
  143. if (!less_than(span[j], span[j + 1]))
  144. swap(span[j], span[j + 1]);
  145. }
  146. }
  147. }
  148. // The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
  149. // See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15
  150. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591
  151. static void canonical_ordering_algorithm(Span<u32> code_points)
  152. {
  153. for (size_t i = 0; i < code_points.size(); ++i) {
  154. if (!is_starter(code_points[i])) {
  155. auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter);
  156. auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i));
  157. // Nothing to reorder, so continue.
  158. if (span_size <= 1)
  159. continue;
  160. Span<u32> const span { code_points.data() + i, span_size };
  161. bubble_sort(span, [](u32 a, u32 b) {
  162. // Use <= to keep ordering.
  163. return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b);
  164. });
  165. // Skip over span we just sorted.
  166. i += span_size - 1;
  167. }
  168. }
  169. }
  170. // See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard.
  171. static bool is_blocked(Span<u32> code_points, size_t a, size_t c)
  172. {
  173. if (!is_starter(code_points[a]) || a == c - 1)
  174. return false;
  175. auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]);
  176. auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]);
  177. return b_combining_class == 0 || b_combining_class >= c_combining_class;
  178. }
  179. // The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
  180. // See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15
  181. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628
  182. static void canonical_composition_algorithm(Vector<u32>& code_points)
  183. {
  184. for (size_t i = 1; i < code_points.size(); ++i) {
  185. auto const current_character = code_points[i];
  186. // R1. Seek back (left) to find the last Starter L preceding C in the character sequence
  187. for (ssize_t j = i - 1; j >= 0; --j) {
  188. if (!is_starter(code_points[j]))
  189. continue;
  190. // R2. If there is such an L, and C is not blocked from L,
  191. // and there exists a Primary Composite P which is canonically equivalent to <L, C>,
  192. // then replace L by P in the sequence and delete C from the sequence.
  193. if (is_blocked(code_points.span(), j, i))
  194. continue;
  195. auto composite = combine_hangul_code_points(code_points[j], current_character);
  196. if (composite == 0)
  197. composite = combine_code_points(code_points[j], current_character);
  198. if (composite != 0) {
  199. code_points[j] = composite;
  200. code_points.remove(i);
  201. --i;
  202. break;
  203. }
  204. }
  205. }
  206. }
  207. static Vector<u32> normalize_nfd(Utf8View string)
  208. {
  209. Vector<u32> result;
  210. for (auto const code_point : string) {
  211. decompose_code_point(code_point, result, UseCompatibility::No);
  212. }
  213. canonical_ordering_algorithm(result);
  214. return result;
  215. }
  216. static Vector<u32> normalize_nfc(Utf8View string)
  217. {
  218. auto result = normalize_nfd(string);
  219. canonical_composition_algorithm(result);
  220. return result;
  221. }
  222. static Vector<u32> normalize_nfkd(Utf8View string)
  223. {
  224. Vector<u32> result;
  225. for (auto const code_point : string) {
  226. decompose_code_point(code_point, result, UseCompatibility::Yes);
  227. }
  228. canonical_ordering_algorithm(result);
  229. return result;
  230. }
  231. static Vector<u32> normalize_nfkc(Utf8View string)
  232. {
  233. auto result = normalize_nfkd(string);
  234. canonical_composition_algorithm(result);
  235. return result;
  236. }
  237. static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form)
  238. {
  239. switch (form) {
  240. case NormalizationForm::NFD:
  241. return normalize_nfd(string);
  242. case NormalizationForm::NFC:
  243. return normalize_nfc(string);
  244. case NormalizationForm::NFKD:
  245. return normalize_nfkd(string);
  246. case NormalizationForm::NFKC:
  247. return normalize_nfkc(string);
  248. }
  249. VERIFY_NOT_REACHED();
  250. }
  251. String normalize(StringView string, NormalizationForm form)
  252. {
  253. Utf8View const view { string };
  254. auto const code_points = normalize_implementation(view, form);
  255. StringBuilder builder;
  256. for (auto code_point : code_points) {
  257. builder.append_code_point(code_point);
  258. }
  259. return builder.to_string();
  260. }
  261. }