Segmentation.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. /*
  2. * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
  3. * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Utf16View.h>
  8. #include <AK/Utf8View.h>
  9. #include <LibUnicode/CharacterTypes.h>
  10. #include <LibUnicode/Segmentation.h>
  11. #if ENABLE_UNICODE_DATA
  12. # include <LibUnicode/UnicodeData.h>
  13. #endif
  14. namespace Unicode {
  15. Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  16. {
  17. #if ENABLE_UNICODE_DATA
  18. using GBP = GraphemeBreakProperty;
  19. Vector<size_t> boundaries;
  20. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  21. if (view.length_in_code_points() == 0)
  22. return boundaries;
  23. auto has_any_gbp = [](u32 code_point, auto&&... properties) {
  24. return (code_point_has_grapheme_break_property(code_point, properties) || ...);
  25. };
  26. // GB1
  27. boundaries.append(0);
  28. if (view.length_in_code_points() > 1) {
  29. auto it = view.begin();
  30. auto code_point = *it;
  31. u32 next_code_point;
  32. auto current_ri_chain = 0;
  33. auto in_emoji_sequence = false;
  34. for (++it; it != view.end(); ++it, code_point = next_code_point) {
  35. next_code_point = *it;
  36. auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
  37. auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
  38. // GB3
  39. if (code_point_is_cr && next_code_point_is_lf)
  40. continue;
  41. // GB4, GB5
  42. if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
  43. boundaries.append(view.code_unit_offset_of(it));
  44. continue;
  45. }
  46. auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
  47. auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
  48. // GB6
  49. if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
  50. continue;
  51. // GB7
  52. if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
  53. continue;
  54. // GB8
  55. if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
  56. continue;
  57. auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
  58. if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic))
  59. in_emoji_sequence = true;
  60. else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj)
  61. in_emoji_sequence = false;
  62. // GB9
  63. if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
  64. continue;
  65. // GB9a
  66. if (has_any_gbp(next_code_point, GBP::SpacingMark))
  67. continue;
  68. // GB9b
  69. if (has_any_gbp(code_point, GBP::Prepend))
  70. continue;
  71. // GB11
  72. if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  73. continue;
  74. auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
  75. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  76. // GB12, GB13
  77. if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  78. continue;
  79. // GB999
  80. boundaries.append(view.code_unit_offset_of(it));
  81. }
  82. }
  83. // GB2
  84. boundaries.append(view.length_in_code_units());
  85. return boundaries;
  86. #else
  87. return {};
  88. #endif
  89. }
  90. template<typename ViewType>
  91. static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
  92. {
  93. #if ENABLE_UNICODE_DATA
  94. using WBP = WordBreakProperty;
  95. Vector<size_t> boundaries;
  96. // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
  97. if (view.is_empty())
  98. return boundaries;
  99. auto has_any_wbp = [](u32 code_point, auto&&... properties) {
  100. return (code_point_has_word_break_property(code_point, properties) || ...);
  101. };
  102. size_t code_unit_length = 0;
  103. size_t code_point_length = 0;
  104. if constexpr (requires { view.byte_length(); }) {
  105. code_unit_length = view.byte_length();
  106. code_point_length = view.length();
  107. } else if constexpr (requires { view.length_in_code_units(); }) {
  108. code_unit_length = view.length_in_code_units();
  109. code_point_length = view.length_in_code_points();
  110. } else {
  111. static_assert(DependentFalse<ViewType>);
  112. }
  113. auto code_unit_offset_of = [&](auto it) {
  114. if constexpr (requires { view.byte_offset_of(it); })
  115. return view.byte_offset_of(it);
  116. else if constexpr (requires { view.code_unit_offset_of(it); })
  117. return view.code_unit_offset_of(it);
  118. VERIFY_NOT_REACHED();
  119. };
  120. // WB1
  121. boundaries.append(0);
  122. if (code_point_length > 1) {
  123. auto it = view.begin();
  124. auto code_point = *it;
  125. u32 next_code_point;
  126. Optional<u32> previous_code_point;
  127. auto current_ri_chain = 0;
  128. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  129. next_code_point = *it;
  130. auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
  131. auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
  132. // WB3
  133. if (code_point_is_cr && next_code_point_is_lf)
  134. continue;
  135. // WB3a, WB3b
  136. if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
  137. boundaries.append(code_unit_offset_of(it));
  138. continue;
  139. }
  140. // WB3c
  141. if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  142. continue;
  143. // WB3d
  144. if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
  145. continue;
  146. // WB4
  147. if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
  148. continue;
  149. auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
  150. auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
  151. auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
  152. auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
  153. // WB5
  154. if (code_point_is_ah_letter && next_code_point_is_ah_letter)
  155. continue;
  156. Optional<u32> next_next_code_point;
  157. if (it != view.end()) {
  158. auto it_copy = it;
  159. ++it_copy;
  160. if (it_copy != view.end())
  161. next_next_code_point = *it;
  162. }
  163. bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
  164. bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
  165. auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
  166. // WB6
  167. if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
  168. continue;
  169. auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
  170. auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
  171. auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
  172. // WB7
  173. if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
  174. continue;
  175. // WB7a
  176. if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
  177. continue;
  178. // WB7b
  179. if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
  180. continue;
  181. // WB7c
  182. if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
  183. continue;
  184. auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
  185. auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
  186. // WB8
  187. if (code_point_is_numeric && next_code_point_is_numeric)
  188. continue;
  189. // WB9
  190. if (code_point_is_ah_letter && next_code_point_is_numeric)
  191. continue;
  192. // WB10
  193. if (code_point_is_numeric && next_code_point_is_ah_letter)
  194. continue;
  195. auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric);
  196. // WB11
  197. if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
  198. continue;
  199. bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
  200. // WB12
  201. if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
  202. continue;
  203. auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
  204. auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
  205. // WB13
  206. if (code_point_is_katakana && next_code_point_is_katakana)
  207. continue;
  208. auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
  209. // WB13a
  210. if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
  211. continue;
  212. // WB13b
  213. if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
  214. continue;
  215. auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
  216. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  217. // WB15, WB16
  218. if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  219. continue;
  220. // WB999
  221. boundaries.append(code_unit_offset_of(it));
  222. }
  223. }
  224. // WB2
  225. boundaries.append(code_unit_length);
  226. return boundaries;
  227. #else
  228. return {};
  229. #endif
  230. }
  231. Vector<size_t> find_word_segmentation_boundaries(Utf8View const& view)
  232. {
  233. return find_word_segmentation_boundaries_impl(view);
  234. }
  235. Vector<size_t> find_word_segmentation_boundaries(Utf16View const& view)
  236. {
  237. return find_word_segmentation_boundaries_impl(view);
  238. }
  239. Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  240. {
  241. #if ENABLE_UNICODE_DATA
  242. using SBP = SentenceBreakProperty;
  243. Vector<size_t> boundaries;
  244. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  245. if (view.length_in_code_points() == 0)
  246. return boundaries;
  247. auto has_any_sbp = [](u32 code_point, auto&&... properties) {
  248. return (code_point_has_sentence_break_property(code_point, properties) || ...);
  249. };
  250. // SB1
  251. boundaries.append(0);
  252. if (view.length_in_code_points() > 1) {
  253. auto it = view.begin();
  254. auto code_point = *it;
  255. u32 next_code_point;
  256. Optional<u32> previous_code_point;
  257. enum class TerminatorSequenceState {
  258. None,
  259. Term,
  260. Close,
  261. Sp
  262. } terminator_sequence_state { TerminatorSequenceState::None };
  263. auto term_was_a_term = false;
  264. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  265. next_code_point = *it;
  266. auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
  267. auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
  268. // SB3
  269. if (code_point_is_cr && next_code_point_is_lf)
  270. continue;
  271. auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
  272. // SB4
  273. if (code_point_is_para_sep) {
  274. boundaries.append(view.code_unit_offset_of(it));
  275. continue;
  276. }
  277. // SB5
  278. if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
  279. continue;
  280. auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
  281. // SB6
  282. if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
  283. continue;
  284. // SB7
  285. if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
  286. continue;
  287. if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
  288. terminator_sequence_state = TerminatorSequenceState::Term;
  289. term_was_a_term = code_point_is_a_term;
  290. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
  291. terminator_sequence_state = TerminatorSequenceState::Close;
  292. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
  293. terminator_sequence_state = TerminatorSequenceState::Sp;
  294. } else {
  295. terminator_sequence_state = TerminatorSequenceState::None;
  296. }
  297. // SB8
  298. if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
  299. auto it_copy = it;
  300. bool illegal_sequence = false;
  301. for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
  302. if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
  303. continue;
  304. illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
  305. }
  306. if (illegal_sequence)
  307. continue;
  308. }
  309. // SB8a
  310. if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
  311. continue;
  312. auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
  313. auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
  314. // SB9
  315. if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
  316. continue;
  317. // SB10
  318. if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
  319. continue;
  320. // SB11
  321. if (terminator_sequence_state >= TerminatorSequenceState::Term)
  322. boundaries.append(view.code_unit_offset_of(it));
  323. // SB998
  324. }
  325. }
  326. // SB2
  327. boundaries.append(view.length_in_code_units());
  328. return boundaries;
  329. #else
  330. return {};
  331. #endif
  332. }
  333. }