Segmentation.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. /*
  2. * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
  3. * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Utf16View.h>
  8. #include <AK/Utf32View.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #include <LibUnicode/Segmentation.h>
  12. #if ENABLE_UNICODE_DATA
  13. # include <LibUnicode/UnicodeData.h>
  14. #endif
  15. namespace Unicode {
  16. template<typename ViewType>
  17. static size_t code_unit_length(ViewType const& view)
  18. {
  19. if constexpr (IsSame<ViewType, Utf8View>)
  20. return view.byte_length();
  21. else if constexpr (IsSame<ViewType, Utf16View>)
  22. return view.length_in_code_units();
  23. else if constexpr (IsSame<ViewType, Utf32View>)
  24. return view.length();
  25. else
  26. static_assert(DependentFalse<ViewType>);
  27. }
  28. template<typename ViewType, typename CodeUnitIterator>
  29. static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
  30. {
  31. if constexpr (IsSame<ViewType, Utf8View>)
  32. return view.byte_offset_of(it);
  33. else if constexpr (IsSame<ViewType, Utf16View>)
  34. return view.code_unit_offset_of(it);
  35. else if constexpr (IsSame<ViewType, Utf32View>)
  36. return view.iterator_offset(it);
  37. else
  38. static_assert(DependentFalse<ViewType>);
  39. }
  40. template<typename ViewType>
  41. static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
  42. {
  43. #if ENABLE_UNICODE_DATA
  44. using GBP = GraphemeBreakProperty;
  45. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  46. if (view.is_empty())
  47. return;
  48. auto has_any_gbp = [](u32 code_point, auto&&... properties) {
  49. return (code_point_has_grapheme_break_property(code_point, properties) || ...);
  50. };
  51. // GB1
  52. if (callback(0) == IterationDecision::Break)
  53. return;
  54. if (code_unit_length(view) > 1) {
  55. auto it = view.begin();
  56. auto code_point = *it;
  57. u32 next_code_point = 0;
  58. auto current_ri_chain = 0;
  59. for (++it; it != view.end(); ++it, code_point = next_code_point) {
  60. next_code_point = *it;
  61. // GB11
  62. if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
  63. auto it_copy = it;
  64. while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend))
  65. ++it_copy;
  66. if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) {
  67. ++it_copy;
  68. if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) {
  69. next_code_point = *it_copy;
  70. it = it_copy;
  71. continue;
  72. }
  73. }
  74. }
  75. auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
  76. auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
  77. // GB3
  78. if (code_point_is_cr && next_code_point_is_lf)
  79. continue;
  80. // GB4, GB5
  81. if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
  82. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  83. return;
  84. continue;
  85. }
  86. auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
  87. auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
  88. // GB6
  89. if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
  90. continue;
  91. // GB7
  92. if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
  93. continue;
  94. // GB8
  95. if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
  96. continue;
  97. // GB9
  98. if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
  99. continue;
  100. // GB9a
  101. if (has_any_gbp(next_code_point, GBP::SpacingMark))
  102. continue;
  103. // GB9b
  104. if (has_any_gbp(code_point, GBP::Prepend))
  105. continue;
  106. auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
  107. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  108. // GB12, GB13
  109. if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  110. continue;
  111. // GB999
  112. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  113. return;
  114. }
  115. }
  116. // GB2
  117. callback(code_unit_length(view));
  118. #endif
  119. }
  120. void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
  121. {
  122. for_each_grapheme_segmentation_boundary_impl(view, move(callback));
  123. }
  124. void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
  125. {
  126. for_each_grapheme_segmentation_boundary_impl(view, move(callback));
  127. }
  128. void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
  129. {
  130. for_each_grapheme_segmentation_boundary_impl(view, move(callback));
  131. }
  132. template<typename ViewType>
  133. static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
  134. {
  135. #if ENABLE_UNICODE_DATA
  136. using WBP = WordBreakProperty;
  137. // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
  138. if (view.is_empty())
  139. return;
  140. auto has_any_wbp = [](u32 code_point, auto&&... properties) {
  141. return (code_point_has_word_break_property(code_point, properties) || ...);
  142. };
  143. // WB1
  144. if (callback(0) == IterationDecision::Break)
  145. return;
  146. if (code_unit_length(view) > 1) {
  147. auto it = view.begin();
  148. auto code_point = *it;
  149. u32 next_code_point;
  150. Optional<u32> previous_code_point;
  151. auto current_ri_chain = 0;
  152. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  153. next_code_point = *it;
  154. auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
  155. auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
  156. // WB3
  157. if (code_point_is_cr && next_code_point_is_lf)
  158. continue;
  159. // WB3a, WB3b
  160. if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
  161. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  162. return;
  163. continue;
  164. }
  165. // WB3c
  166. if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  167. continue;
  168. // WB3d
  169. if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
  170. continue;
  171. // WB4
  172. if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
  173. continue;
  174. auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
  175. auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
  176. auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
  177. auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
  178. // WB5
  179. if (code_point_is_ah_letter && next_code_point_is_ah_letter)
  180. continue;
  181. Optional<u32> next_next_code_point;
  182. if (it != view.end()) {
  183. auto it_copy = it;
  184. ++it_copy;
  185. if (it_copy != view.end())
  186. next_next_code_point = *it_copy;
  187. }
  188. bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
  189. bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
  190. auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
  191. // WB6
  192. if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
  193. continue;
  194. auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
  195. auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
  196. auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
  197. // WB7
  198. if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
  199. continue;
  200. // WB7a
  201. if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
  202. continue;
  203. // WB7b
  204. if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
  205. continue;
  206. // WB7c
  207. if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
  208. continue;
  209. auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
  210. auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
  211. // WB8
  212. if (code_point_is_numeric && next_code_point_is_numeric)
  213. continue;
  214. // WB9
  215. if (code_point_is_ah_letter && next_code_point_is_numeric)
  216. continue;
  217. // WB10
  218. if (code_point_is_numeric && next_code_point_is_ah_letter)
  219. continue;
  220. auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
  221. // WB11
  222. if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
  223. continue;
  224. bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
  225. // WB12
  226. if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
  227. continue;
  228. auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
  229. auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
  230. // WB13
  231. if (code_point_is_katakana && next_code_point_is_katakana)
  232. continue;
  233. auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
  234. // WB13a
  235. if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
  236. continue;
  237. // WB13b
  238. if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
  239. continue;
  240. auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
  241. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  242. // WB15, WB16
  243. if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  244. continue;
  245. // WB999
  246. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  247. return;
  248. }
  249. }
  250. // WB2
  251. callback(code_unit_length(view));
  252. #endif
  253. }
  254. void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
  255. {
  256. for_each_word_segmentation_boundary_impl(view, move(callback));
  257. }
  258. void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
  259. {
  260. for_each_word_segmentation_boundary_impl(view, move(callback));
  261. }
  262. void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
  263. {
  264. for_each_word_segmentation_boundary_impl(view, move(callback));
  265. }
  266. template<typename ViewType>
  267. static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
  268. {
  269. #if ENABLE_UNICODE_DATA
  270. using SBP = SentenceBreakProperty;
  271. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  272. if (view.is_empty())
  273. return;
  274. auto has_any_sbp = [](u32 code_point, auto&&... properties) {
  275. return (code_point_has_sentence_break_property(code_point, properties) || ...);
  276. };
  277. // SB1
  278. if (callback(0) == IterationDecision::Break)
  279. return;
  280. if (code_unit_length(view) > 1) {
  281. auto it = view.begin();
  282. auto code_point = *it;
  283. u32 next_code_point;
  284. Optional<u32> previous_code_point;
  285. enum class TerminatorSequenceState {
  286. None,
  287. Term,
  288. Close,
  289. Sp
  290. } terminator_sequence_state { TerminatorSequenceState::None };
  291. auto term_was_a_term = false;
  292. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  293. next_code_point = *it;
  294. auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
  295. auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
  296. // SB3
  297. if (code_point_is_cr && next_code_point_is_lf)
  298. continue;
  299. auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
  300. // SB4
  301. if (code_point_is_para_sep) {
  302. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  303. return;
  304. continue;
  305. }
  306. // SB5
  307. if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
  308. continue;
  309. auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
  310. // SB6
  311. if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
  312. continue;
  313. // SB7
  314. if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
  315. continue;
  316. if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
  317. terminator_sequence_state = TerminatorSequenceState::Term;
  318. term_was_a_term = code_point_is_a_term;
  319. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
  320. terminator_sequence_state = TerminatorSequenceState::Close;
  321. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
  322. terminator_sequence_state = TerminatorSequenceState::Sp;
  323. } else {
  324. terminator_sequence_state = TerminatorSequenceState::None;
  325. }
  326. // SB8
  327. if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
  328. auto it_copy = it;
  329. bool illegal_sequence = false;
  330. for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
  331. if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
  332. continue;
  333. illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
  334. }
  335. if (illegal_sequence)
  336. continue;
  337. }
  338. // SB8a
  339. if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
  340. continue;
  341. auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
  342. auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
  343. // SB9
  344. if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
  345. continue;
  346. // SB10
  347. if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
  348. continue;
  349. // SB11
  350. if (terminator_sequence_state >= TerminatorSequenceState::Term)
  351. if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
  352. return;
  353. // SB998
  354. }
  355. }
  356. // SB2
  357. callback(code_unit_length(view));
  358. #endif
  359. }
  360. void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
  361. {
  362. for_each_sentence_segmentation_boundary_impl(view, move(callback));
  363. }
  364. void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
  365. {
  366. for_each_sentence_segmentation_boundary_impl(view, move(callback));
  367. }
  368. void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
  369. {
  370. for_each_sentence_segmentation_boundary_impl(view, move(callback));
  371. }
  372. }