Segmentation.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. /*
  2. * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
  3. * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Utf16View.h>
  8. #include <AK/Utf32View.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #include <LibUnicode/Segmentation.h>
  12. #if ENABLE_UNICODE_DATA
  13. # include <LibUnicode/UnicodeData.h>
  14. #endif
  15. namespace Unicode {
  16. template<typename ViewType>
  17. static size_t code_unit_length(ViewType const& view)
  18. {
  19. if constexpr (IsSame<ViewType, Utf8View>)
  20. return view.byte_length();
  21. else if constexpr (IsSame<ViewType, Utf16View>)
  22. return view.length_in_code_units();
  23. else if constexpr (IsSame<ViewType, Utf32View>)
  24. return view.length();
  25. else
  26. static_assert(DependentFalse<ViewType>);
  27. }
  28. template<typename ViewType, typename CodeUnitIterator>
  29. static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
  30. {
  31. if constexpr (IsSame<ViewType, Utf8View>)
  32. return view.byte_offset_of(it);
  33. else if constexpr (IsSame<ViewType, Utf16View>)
  34. return view.code_unit_offset_of(it);
  35. else if constexpr (IsSame<ViewType, Utf32View>)
  36. return view.iterator_offset(it);
  37. else
  38. static_assert(DependentFalse<ViewType>);
  39. }
  40. template<typename ViewType>
  41. static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
  42. {
  43. #if ENABLE_UNICODE_DATA
  44. using GBP = GraphemeBreakProperty;
  45. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  46. if (view.is_empty())
  47. return;
  48. auto has_any_gbp = [](u32 code_point, auto&&... properties) {
  49. return (code_point_has_grapheme_break_property(code_point, properties) || ...);
  50. };
  51. auto skip_incb_extend_linker_sequence = [&](auto& it) {
  52. while (true) {
  53. if (it == view.end() || !code_point_has_property(*it, Property::InCB_Extend))
  54. return;
  55. auto next_it = it;
  56. ++next_it;
  57. if (next_it == view.end() || !code_point_has_property(*next_it, Property::InCB_Linker))
  58. return;
  59. it = next_it;
  60. ++it;
  61. }
  62. };
  63. // GB1
  64. if (callback(0) == IterationDecision::Break)
  65. return;
  66. if (code_unit_length(view) > 1) {
  67. auto it = view.begin();
  68. auto code_point = *it;
  69. u32 next_code_point = 0;
  70. auto current_ri_chain = 0;
  71. for (++it; it != view.end(); ++it, code_point = next_code_point) {
  72. next_code_point = *it;
  73. // GB9c
  74. if (code_point_has_property(code_point, Property::InCB_Consonant)) {
  75. auto it_copy = it;
  76. skip_incb_extend_linker_sequence(it_copy);
  77. if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) {
  78. ++it_copy;
  79. skip_incb_extend_linker_sequence(it_copy);
  80. if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) {
  81. next_code_point = *it_copy;
  82. it = it_copy;
  83. continue;
  84. }
  85. }
  86. }
  87. // GB11
  88. if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
  89. auto it_copy = it;
  90. while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend))
  91. ++it_copy;
  92. if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) {
  93. ++it_copy;
  94. if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) {
  95. next_code_point = *it_copy;
  96. it = it_copy;
  97. continue;
  98. }
  99. }
  100. }
  101. auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
  102. auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
  103. // GB3
  104. if (code_point_is_cr && next_code_point_is_lf)
  105. continue;
  106. // GB4, GB5
  107. if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
  108. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  109. return;
  110. continue;
  111. }
  112. auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
  113. auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
  114. // GB6
  115. if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
  116. continue;
  117. // GB7
  118. if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
  119. continue;
  120. // GB8
  121. if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
  122. continue;
  123. // GB9
  124. if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
  125. continue;
  126. // GB9a
  127. if (has_any_gbp(next_code_point, GBP::SpacingMark))
  128. continue;
  129. // GB9b
  130. if (has_any_gbp(code_point, GBP::Prepend))
  131. continue;
  132. auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
  133. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  134. // GB12, GB13
  135. if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  136. continue;
  137. // GB999
  138. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  139. return;
  140. }
  141. }
  142. // GB2
  143. callback(code_unit_length(view));
  144. #endif
  145. }
  146. void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
  147. {
  148. for_each_grapheme_segmentation_boundary_impl(view, move(callback));
  149. }
  150. void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
  151. {
  152. for_each_grapheme_segmentation_boundary_impl(view, move(callback));
  153. }
  154. void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
  155. {
  156. for_each_grapheme_segmentation_boundary_impl(view, move(callback));
  157. }
  158. template<typename ViewType>
  159. static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
  160. {
  161. #if ENABLE_UNICODE_DATA
  162. using WBP = WordBreakProperty;
  163. // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
  164. if (view.is_empty())
  165. return;
  166. auto has_any_wbp = [](u32 code_point, auto&&... properties) {
  167. return (code_point_has_word_break_property(code_point, properties) || ...);
  168. };
  169. // WB1
  170. if (callback(0) == IterationDecision::Break)
  171. return;
  172. if (code_unit_length(view) > 1) {
  173. auto it = view.begin();
  174. auto code_point = *it;
  175. u32 next_code_point;
  176. Optional<u32> previous_code_point;
  177. auto current_ri_chain = 0;
  178. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  179. next_code_point = *it;
  180. auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
  181. auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
  182. // WB3
  183. if (code_point_is_cr && next_code_point_is_lf)
  184. continue;
  185. // WB3a, WB3b
  186. if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
  187. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  188. return;
  189. continue;
  190. }
  191. // WB3c
  192. if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  193. continue;
  194. // WB3d
  195. if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
  196. continue;
  197. // WB4
  198. if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
  199. continue;
  200. auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
  201. auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
  202. auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
  203. auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
  204. // WB5
  205. if (code_point_is_ah_letter && next_code_point_is_ah_letter)
  206. continue;
  207. Optional<u32> next_next_code_point;
  208. if (it != view.end()) {
  209. auto it_copy = it;
  210. ++it_copy;
  211. if (it_copy != view.end())
  212. next_next_code_point = *it_copy;
  213. }
  214. bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
  215. bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
  216. auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
  217. // WB6
  218. if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
  219. continue;
  220. auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
  221. auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
  222. auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
  223. // WB7
  224. if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
  225. continue;
  226. // WB7a
  227. if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
  228. continue;
  229. // WB7b
  230. if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
  231. continue;
  232. // WB7c
  233. if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
  234. continue;
  235. auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
  236. auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
  237. // WB8
  238. if (code_point_is_numeric && next_code_point_is_numeric)
  239. continue;
  240. // WB9
  241. if (code_point_is_ah_letter && next_code_point_is_numeric)
  242. continue;
  243. // WB10
  244. if (code_point_is_numeric && next_code_point_is_ah_letter)
  245. continue;
  246. auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
  247. // WB11
  248. if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
  249. continue;
  250. bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
  251. // WB12
  252. if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
  253. continue;
  254. auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
  255. auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
  256. // WB13
  257. if (code_point_is_katakana && next_code_point_is_katakana)
  258. continue;
  259. auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
  260. // WB13a
  261. if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
  262. continue;
  263. // WB13b
  264. if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
  265. continue;
  266. auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
  267. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  268. // WB15, WB16
  269. if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  270. continue;
  271. // WB999
  272. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  273. return;
  274. }
  275. }
  276. // WB2
  277. callback(code_unit_length(view));
  278. #endif
  279. }
  280. void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
  281. {
  282. for_each_word_segmentation_boundary_impl(view, move(callback));
  283. }
  284. void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
  285. {
  286. for_each_word_segmentation_boundary_impl(view, move(callback));
  287. }
  288. void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
  289. {
  290. for_each_word_segmentation_boundary_impl(view, move(callback));
  291. }
  292. template<typename ViewType>
  293. static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
  294. {
  295. #if ENABLE_UNICODE_DATA
  296. using SBP = SentenceBreakProperty;
  297. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  298. if (view.is_empty())
  299. return;
  300. auto has_any_sbp = [](u32 code_point, auto&&... properties) {
  301. return (code_point_has_sentence_break_property(code_point, properties) || ...);
  302. };
  303. // SB1
  304. if (callback(0) == IterationDecision::Break)
  305. return;
  306. if (code_unit_length(view) > 1) {
  307. auto it = view.begin();
  308. auto code_point = *it;
  309. u32 next_code_point;
  310. Optional<u32> previous_code_point;
  311. enum class TerminatorSequenceState {
  312. None,
  313. Term,
  314. Close,
  315. Sp
  316. } terminator_sequence_state { TerminatorSequenceState::None };
  317. auto term_was_a_term = false;
  318. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  319. next_code_point = *it;
  320. auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
  321. auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
  322. // SB3
  323. if (code_point_is_cr && next_code_point_is_lf)
  324. continue;
  325. auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
  326. // SB4
  327. if (code_point_is_para_sep) {
  328. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  329. return;
  330. continue;
  331. }
  332. // SB5
  333. if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
  334. continue;
  335. auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
  336. // SB6
  337. if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
  338. continue;
  339. // SB7
  340. if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
  341. continue;
  342. if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
  343. terminator_sequence_state = TerminatorSequenceState::Term;
  344. term_was_a_term = code_point_is_a_term;
  345. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
  346. terminator_sequence_state = TerminatorSequenceState::Close;
  347. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
  348. terminator_sequence_state = TerminatorSequenceState::Sp;
  349. } else {
  350. terminator_sequence_state = TerminatorSequenceState::None;
  351. }
  352. // SB8
  353. if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
  354. auto it_copy = it;
  355. bool illegal_sequence = false;
  356. for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
  357. if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
  358. continue;
  359. illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
  360. }
  361. if (illegal_sequence)
  362. continue;
  363. }
  364. // SB8a
  365. if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
  366. continue;
  367. auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
  368. auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
  369. // SB9
  370. if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
  371. continue;
  372. // SB10
  373. if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
  374. continue;
  375. // SB11
  376. if (terminator_sequence_state >= TerminatorSequenceState::Term)
  377. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  378. return;
  379. // SB998
  380. }
  381. }
  382. // SB2
  383. callback(code_unit_length(view));
  384. #endif
  385. }
  386. void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
  387. {
  388. for_each_sentence_segmentation_boundary_impl(view, move(callback));
  389. }
  390. void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
  391. {
  392. for_each_sentence_segmentation_boundary_impl(view, move(callback));
  393. }
  394. void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
  395. {
  396. for_each_sentence_segmentation_boundary_impl(view, move(callback));
  397. }
  398. }