Segmentation.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. /*
  2. * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
  3. * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Utf16View.h>
  8. #include <AK/Utf32View.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. #include <LibUnicode/Segmentation.h>
  12. #if ENABLE_UNICODE_DATA
  13. # include <LibUnicode/UnicodeData.h>
  14. #endif
  15. namespace Unicode {
  16. template<typename ViewType>
  17. static size_t code_unit_length(ViewType const& view)
  18. {
  19. if constexpr (IsSame<ViewType, Utf8View>)
  20. return view.byte_length();
  21. else if constexpr (IsSame<ViewType, Utf16View>)
  22. return view.length_in_code_units();
  23. else if constexpr (IsSame<ViewType, Utf32View>)
  24. return view.length();
  25. else
  26. static_assert(DependentFalse<ViewType>);
  27. }
  28. template<typename ViewType, typename CodeUnitIterator>
  29. static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
  30. {
  31. if constexpr (IsSame<ViewType, Utf8View>)
  32. return view.byte_offset_of(it);
  33. else if constexpr (IsSame<ViewType, Utf16View>)
  34. return view.code_unit_offset_of(it);
  35. else if constexpr (IsSame<ViewType, Utf32View>)
  36. return view.iterator_offset(it);
  37. else
  38. static_assert(DependentFalse<ViewType>);
  39. }
  40. template<typename ViewType>
  41. static Vector<size_t> find_grapheme_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
  42. {
  43. #if ENABLE_UNICODE_DATA
  44. using GBP = GraphemeBreakProperty;
  45. Vector<size_t> boundaries;
  46. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  47. if (view.is_empty())
  48. return boundaries;
  49. auto has_any_gbp = [](u32 code_point, auto&&... properties) {
  50. return (code_point_has_grapheme_break_property(code_point, properties) || ...);
  51. };
  52. // GB1
  53. boundaries.append(0);
  54. if (code_unit_length(view) > 1) {
  55. auto it = view.begin();
  56. auto code_point = *it;
  57. u32 next_code_point;
  58. auto current_ri_chain = 0;
  59. auto in_emoji_sequence = false;
  60. for (++it; it != view.end(); ++it, code_point = next_code_point) {
  61. next_code_point = *it;
  62. auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
  63. auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
  64. // GB3
  65. if (code_point_is_cr && next_code_point_is_lf)
  66. continue;
  67. // GB4, GB5
  68. if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
  69. boundaries.append(code_unit_offset_of(view, it));
  70. continue;
  71. }
  72. auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
  73. auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
  74. // GB6
  75. if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
  76. continue;
  77. // GB7
  78. if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
  79. continue;
  80. // GB8
  81. if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
  82. continue;
  83. auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
  84. if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic))
  85. in_emoji_sequence = true;
  86. else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj)
  87. in_emoji_sequence = false;
  88. // GB9
  89. if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
  90. continue;
  91. // GB9a
  92. if (has_any_gbp(next_code_point, GBP::SpacingMark))
  93. continue;
  94. // GB9b
  95. if (has_any_gbp(code_point, GBP::Prepend))
  96. continue;
  97. // GB11
  98. if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  99. continue;
  100. auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
  101. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  102. // GB12, GB13
  103. if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  104. continue;
  105. // GB999
  106. boundaries.append(code_unit_offset_of(view, it));
  107. }
  108. }
  109. // GB2
  110. boundaries.append(code_unit_length(view));
  111. return boundaries;
  112. #else
  113. return {};
  114. #endif
  115. }
  116. Vector<size_t> find_grapheme_segmentation_boundaries(Utf8View const& view)
  117. {
  118. return find_grapheme_segmentation_boundaries_impl(view);
  119. }
  120. Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const& view)
  121. {
  122. return find_grapheme_segmentation_boundaries_impl(view);
  123. }
  124. Vector<size_t> find_grapheme_segmentation_boundaries(Utf32View const& view)
  125. {
  126. return find_grapheme_segmentation_boundaries_impl(view);
  127. }
  128. template<typename ViewType>
  129. static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
  130. {
  131. #if ENABLE_UNICODE_DATA
  132. using WBP = WordBreakProperty;
  133. Vector<size_t> boundaries;
  134. // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
  135. if (view.is_empty())
  136. return boundaries;
  137. auto has_any_wbp = [](u32 code_point, auto&&... properties) {
  138. return (code_point_has_word_break_property(code_point, properties) || ...);
  139. };
  140. // WB1
  141. boundaries.append(0);
  142. if (code_unit_length(view) > 1) {
  143. auto it = view.begin();
  144. auto code_point = *it;
  145. u32 next_code_point;
  146. Optional<u32> previous_code_point;
  147. auto current_ri_chain = 0;
  148. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  149. next_code_point = *it;
  150. auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
  151. auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
  152. // WB3
  153. if (code_point_is_cr && next_code_point_is_lf)
  154. continue;
  155. // WB3a, WB3b
  156. if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
  157. boundaries.append(code_unit_offset_of(view, it));
  158. continue;
  159. }
  160. // WB3c
  161. if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  162. continue;
  163. // WB3d
  164. if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
  165. continue;
  166. // WB4
  167. if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
  168. continue;
  169. auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
  170. auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
  171. auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
  172. auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
  173. // WB5
  174. if (code_point_is_ah_letter && next_code_point_is_ah_letter)
  175. continue;
  176. Optional<u32> next_next_code_point;
  177. if (it != view.end()) {
  178. auto it_copy = it;
  179. ++it_copy;
  180. if (it_copy != view.end())
  181. next_next_code_point = *it;
  182. }
  183. bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
  184. bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
  185. auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
  186. // WB6
  187. if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
  188. continue;
  189. auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
  190. auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
  191. auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
  192. // WB7
  193. if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
  194. continue;
  195. // WB7a
  196. if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
  197. continue;
  198. // WB7b
  199. if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
  200. continue;
  201. // WB7c
  202. if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
  203. continue;
  204. auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
  205. auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
  206. // WB8
  207. if (code_point_is_numeric && next_code_point_is_numeric)
  208. continue;
  209. // WB9
  210. if (code_point_is_ah_letter && next_code_point_is_numeric)
  211. continue;
  212. // WB10
  213. if (code_point_is_numeric && next_code_point_is_ah_letter)
  214. continue;
  215. auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric);
  216. // WB11
  217. if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
  218. continue;
  219. bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
  220. // WB12
  221. if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
  222. continue;
  223. auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
  224. auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
  225. // WB13
  226. if (code_point_is_katakana && next_code_point_is_katakana)
  227. continue;
  228. auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
  229. // WB13a
  230. if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
  231. continue;
  232. // WB13b
  233. if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
  234. continue;
  235. auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
  236. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  237. // WB15, WB16
  238. if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  239. continue;
  240. // WB999
  241. boundaries.append(code_unit_offset_of(view, it));
  242. }
  243. }
  244. // WB2
  245. boundaries.append(code_unit_length(view));
  246. return boundaries;
  247. #else
  248. return {};
  249. #endif
  250. }
  251. Vector<size_t> find_word_segmentation_boundaries(Utf8View const& view)
  252. {
  253. return find_word_segmentation_boundaries_impl(view);
  254. }
  255. Vector<size_t> find_word_segmentation_boundaries(Utf16View const& view)
  256. {
  257. return find_word_segmentation_boundaries_impl(view);
  258. }
  259. Vector<size_t> find_word_segmentation_boundaries(Utf32View const& view)
  260. {
  261. return find_word_segmentation_boundaries_impl(view);
  262. }
  263. template<typename ViewType>
  264. static Vector<size_t> find_sentence_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
  265. {
  266. #if ENABLE_UNICODE_DATA
  267. using SBP = SentenceBreakProperty;
  268. Vector<size_t> boundaries;
  269. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  270. if (view.is_empty())
  271. return boundaries;
  272. auto has_any_sbp = [](u32 code_point, auto&&... properties) {
  273. return (code_point_has_sentence_break_property(code_point, properties) || ...);
  274. };
  275. // SB1
  276. boundaries.append(0);
  277. if (code_unit_length(view) > 1) {
  278. auto it = view.begin();
  279. auto code_point = *it;
  280. u32 next_code_point;
  281. Optional<u32> previous_code_point;
  282. enum class TerminatorSequenceState {
  283. None,
  284. Term,
  285. Close,
  286. Sp
  287. } terminator_sequence_state { TerminatorSequenceState::None };
  288. auto term_was_a_term = false;
  289. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  290. next_code_point = *it;
  291. auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
  292. auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
  293. // SB3
  294. if (code_point_is_cr && next_code_point_is_lf)
  295. continue;
  296. auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
  297. // SB4
  298. if (code_point_is_para_sep) {
  299. boundaries.append(code_unit_offset_of(view, it));
  300. continue;
  301. }
  302. // SB5
  303. if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
  304. continue;
  305. auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
  306. // SB6
  307. if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
  308. continue;
  309. // SB7
  310. if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
  311. continue;
  312. if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
  313. terminator_sequence_state = TerminatorSequenceState::Term;
  314. term_was_a_term = code_point_is_a_term;
  315. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
  316. terminator_sequence_state = TerminatorSequenceState::Close;
  317. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
  318. terminator_sequence_state = TerminatorSequenceState::Sp;
  319. } else {
  320. terminator_sequence_state = TerminatorSequenceState::None;
  321. }
  322. // SB8
  323. if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
  324. auto it_copy = it;
  325. bool illegal_sequence = false;
  326. for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
  327. if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
  328. continue;
  329. illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
  330. }
  331. if (illegal_sequence)
  332. continue;
  333. }
  334. // SB8a
  335. if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
  336. continue;
  337. auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
  338. auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
  339. // SB9
  340. if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
  341. continue;
  342. // SB10
  343. if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
  344. continue;
  345. // SB11
  346. if (terminator_sequence_state >= TerminatorSequenceState::Term)
  347. boundaries.append(code_unit_offset_of(view, it));
  348. // SB998
  349. }
  350. }
  351. // SB2
  352. boundaries.append(code_unit_length(view));
  353. return boundaries;
  354. #else
  355. return {};
  356. #endif
  357. }
  358. Vector<size_t> find_sentence_segmentation_boundaries(Utf8View const& view)
  359. {
  360. return find_sentence_segmentation_boundaries_impl(view);
  361. }
  362. Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const& view)
  363. {
  364. return find_sentence_segmentation_boundaries_impl(view);
  365. }
  366. Vector<size_t> find_sentence_segmentation_boundaries(Utf32View const& view)
  367. {
  368. return find_sentence_segmentation_boundaries_impl(view);
  369. }
  370. }