Segmentation.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. /*
  2. * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
  3. * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Utf16View.h>
  8. #include <AK/Utf32View.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #include <LibUnicode/Segmentation.h>
  12. #if ENABLE_UNICODE_DATA
  13. # include <LibUnicode/UnicodeData.h>
  14. #endif
  15. namespace Unicode {
  16. template<typename ViewType>
  17. static size_t code_unit_length(ViewType const& view)
  18. {
  19. if constexpr (IsSame<ViewType, Utf8View>)
  20. return view.byte_length();
  21. else if constexpr (IsSame<ViewType, Utf16View>)
  22. return view.length_in_code_units();
  23. else if constexpr (IsSame<ViewType, Utf32View>)
  24. return view.length();
  25. else
  26. static_assert(DependentFalse<ViewType>);
  27. }
  28. template<typename ViewType, typename CodeUnitIterator>
  29. static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
  30. {
  31. if constexpr (IsSame<ViewType, Utf8View>)
  32. return view.byte_offset_of(it);
  33. else if constexpr (IsSame<ViewType, Utf16View>)
  34. return view.code_unit_offset_of(it);
  35. else if constexpr (IsSame<ViewType, Utf32View>)
  36. return view.iterator_offset(it);
  37. else
  38. static_assert(DependentFalse<ViewType>);
  39. }
  40. template<typename ViewType>
  41. static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
  42. {
  43. #if ENABLE_UNICODE_DATA
  44. using GBP = GraphemeBreakProperty;
  45. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  46. if (view.is_empty())
  47. return;
  48. auto has_any_gbp = [](u32 code_point, auto&&... properties) {
  49. return (code_point_has_grapheme_break_property(code_point, properties) || ...);
  50. };
  51. // GB1
  52. if (callback(0) == IterationDecision::Break)
  53. return;
  54. if (code_unit_length(view) > 1) {
  55. auto it = view.begin();
  56. auto code_point = *it;
  57. u32 next_code_point;
  58. auto current_ri_chain = 0;
  59. auto in_emoji_sequence = false;
  60. for (++it; it != view.end(); ++it, code_point = next_code_point) {
  61. next_code_point = *it;
  62. auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
  63. auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
  64. // GB3
  65. if (code_point_is_cr && next_code_point_is_lf)
  66. continue;
  67. // GB4, GB5
  68. if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
  69. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  70. return;
  71. continue;
  72. }
  73. auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
  74. auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
  75. // GB6
  76. if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
  77. continue;
  78. // GB7
  79. if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
  80. continue;
  81. // GB8
  82. if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
  83. continue;
  84. auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
  85. if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic))
  86. in_emoji_sequence = true;
  87. else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj)
  88. in_emoji_sequence = false;
  89. // GB9
  90. if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
  91. continue;
  92. // GB9a
  93. if (has_any_gbp(next_code_point, GBP::SpacingMark))
  94. continue;
  95. // GB9b
  96. if (has_any_gbp(code_point, GBP::Prepend))
  97. continue;
  98. // GB11
  99. if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  100. continue;
  101. auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
  102. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  103. // GB12, GB13
  104. if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  105. continue;
  106. // GB999
  107. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  108. return;
  109. }
  110. }
  111. // GB2
  112. callback(code_unit_length(view));
  113. #endif
  114. }
  115. void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
  116. {
  117. for_each_grapheme_segmentation_boundary_impl(view, move(callback));
  118. }
  119. void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
  120. {
  121. for_each_grapheme_segmentation_boundary_impl(view, move(callback));
  122. }
  123. void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
  124. {
  125. for_each_grapheme_segmentation_boundary_impl(view, move(callback));
  126. }
  127. template<typename ViewType>
  128. static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
  129. {
  130. #if ENABLE_UNICODE_DATA
  131. using WBP = WordBreakProperty;
  132. // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
  133. if (view.is_empty())
  134. return;
  135. auto has_any_wbp = [](u32 code_point, auto&&... properties) {
  136. return (code_point_has_word_break_property(code_point, properties) || ...);
  137. };
  138. // WB1
  139. if (callback(0) == IterationDecision::Break)
  140. return;
  141. if (code_unit_length(view) > 1) {
  142. auto it = view.begin();
  143. auto code_point = *it;
  144. u32 next_code_point;
  145. Optional<u32> previous_code_point;
  146. auto current_ri_chain = 0;
  147. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  148. next_code_point = *it;
  149. auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
  150. auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
  151. // WB3
  152. if (code_point_is_cr && next_code_point_is_lf)
  153. continue;
  154. // WB3a, WB3b
  155. if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
  156. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  157. return;
  158. continue;
  159. }
  160. // WB3c
  161. if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  162. continue;
  163. // WB3d
  164. if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
  165. continue;
  166. // WB4
  167. if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
  168. continue;
  169. auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
  170. auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
  171. auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
  172. auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
  173. // WB5
  174. if (code_point_is_ah_letter && next_code_point_is_ah_letter)
  175. continue;
  176. Optional<u32> next_next_code_point;
  177. if (it != view.end()) {
  178. auto it_copy = it;
  179. ++it_copy;
  180. if (it_copy != view.end())
  181. next_next_code_point = *it_copy;
  182. }
  183. bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
  184. bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
  185. auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
  186. // WB6
  187. if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
  188. continue;
  189. auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
  190. auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
  191. auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
  192. // WB7
  193. if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
  194. continue;
  195. // WB7a
  196. if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
  197. continue;
  198. // WB7b
  199. if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
  200. continue;
  201. // WB7c
  202. if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
  203. continue;
  204. auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
  205. auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
  206. // WB8
  207. if (code_point_is_numeric && next_code_point_is_numeric)
  208. continue;
  209. // WB9
  210. if (code_point_is_ah_letter && next_code_point_is_numeric)
  211. continue;
  212. // WB10
  213. if (code_point_is_numeric && next_code_point_is_ah_letter)
  214. continue;
  215. auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
  216. // WB11
  217. if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
  218. continue;
  219. bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
  220. // WB12
  221. if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
  222. continue;
  223. auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
  224. auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
  225. // WB13
  226. if (code_point_is_katakana && next_code_point_is_katakana)
  227. continue;
  228. auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
  229. // WB13a
  230. if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
  231. continue;
  232. // WB13b
  233. if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
  234. continue;
  235. auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
  236. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  237. // WB15, WB16
  238. if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  239. continue;
  240. // WB999
  241. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  242. return;
  243. }
  244. }
  245. // WB2
  246. callback(code_unit_length(view));
  247. #endif
  248. }
  249. void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
  250. {
  251. for_each_word_segmentation_boundary_impl(view, move(callback));
  252. }
  253. void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
  254. {
  255. for_each_word_segmentation_boundary_impl(view, move(callback));
  256. }
  257. void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
  258. {
  259. for_each_word_segmentation_boundary_impl(view, move(callback));
  260. }
  261. template<typename ViewType>
  262. static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
  263. {
  264. #if ENABLE_UNICODE_DATA
  265. using SBP = SentenceBreakProperty;
  266. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  267. if (view.is_empty())
  268. return;
  269. auto has_any_sbp = [](u32 code_point, auto&&... properties) {
  270. return (code_point_has_sentence_break_property(code_point, properties) || ...);
  271. };
  272. // SB1
  273. if (callback(0) == IterationDecision::Break)
  274. return;
  275. if (code_unit_length(view) > 1) {
  276. auto it = view.begin();
  277. auto code_point = *it;
  278. u32 next_code_point;
  279. Optional<u32> previous_code_point;
  280. enum class TerminatorSequenceState {
  281. None,
  282. Term,
  283. Close,
  284. Sp
  285. } terminator_sequence_state { TerminatorSequenceState::None };
  286. auto term_was_a_term = false;
  287. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  288. next_code_point = *it;
  289. auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
  290. auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
  291. // SB3
  292. if (code_point_is_cr && next_code_point_is_lf)
  293. continue;
  294. auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
  295. // SB4
  296. if (code_point_is_para_sep) {
  297. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  298. return;
  299. continue;
  300. }
  301. // SB5
  302. if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
  303. continue;
  304. auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
  305. // SB6
  306. if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
  307. continue;
  308. // SB7
  309. if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
  310. continue;
  311. if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
  312. terminator_sequence_state = TerminatorSequenceState::Term;
  313. term_was_a_term = code_point_is_a_term;
  314. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
  315. terminator_sequence_state = TerminatorSequenceState::Close;
  316. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
  317. terminator_sequence_state = TerminatorSequenceState::Sp;
  318. } else {
  319. terminator_sequence_state = TerminatorSequenceState::None;
  320. }
  321. // SB8
  322. if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
  323. auto it_copy = it;
  324. bool illegal_sequence = false;
  325. for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
  326. if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
  327. continue;
  328. illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
  329. }
  330. if (illegal_sequence)
  331. continue;
  332. }
  333. // SB8a
  334. if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
  335. continue;
  336. auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
  337. auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
  338. // SB9
  339. if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
  340. continue;
  341. // SB10
  342. if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
  343. continue;
  344. // SB11
  345. if (terminator_sequence_state >= TerminatorSequenceState::Term)
  346. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  347. return;
  348. // SB998
  349. }
  350. }
  351. // SB2
  352. callback(code_unit_length(view));
  353. #endif
  354. }
  355. void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
  356. {
  357. for_each_sentence_segmentation_boundary_impl(view, move(callback));
  358. }
  359. void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
  360. {
  361. for_each_sentence_segmentation_boundary_impl(view, move(callback));
  362. }
  363. void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
  364. {
  365. for_each_sentence_segmentation_boundary_impl(view, move(callback));
  366. }
  367. }