CharacterTypes.cpp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf16View.h>
  11. #include <AK/Utf8View.h>
  12. #include <LibUnicode/CharacterTypes.h>
  13. #include <LibUnicode/UnicodeUtils.h>
  14. #if ENABLE_UNICODE_DATA
  15. # include <LibUnicode/UnicodeData.h>
  16. #endif
  17. namespace Unicode {
  18. Optional<DeprecatedString> __attribute__((weak)) code_point_display_name(u32) { return {}; }
  19. Optional<StringView> __attribute__((weak)) code_point_block_display_name(u32) { return {}; }
  20. Optional<StringView> __attribute__((weak)) code_point_abbreviation(u32) { return {}; }
  21. u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
  22. Span<BlockName const> __attribute__((weak)) block_display_names() { return {}; }
  23. u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point)
  24. {
  25. return to_ascii_lowercase(code_point);
  26. }
  27. u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point)
  28. {
  29. return to_ascii_uppercase(code_point);
  30. }
  31. u32 __attribute__((weak)) to_unicode_titlecase(u32 code_point)
  32. {
  33. return to_ascii_uppercase(code_point);
  34. }
  35. ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView string, Optional<StringView> const& locale)
  36. {
  37. StringBuilder builder;
  38. TRY(Detail::build_lowercase_string(Utf8View { string }, builder, locale));
  39. return builder.to_deprecated_string();
  40. }
  41. ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView string, Optional<StringView> const& locale)
  42. {
  43. StringBuilder builder;
  44. TRY(Detail::build_uppercase_string(Utf8View { string }, builder, locale));
  45. return builder.to_deprecated_string();
  46. }
  47. ErrorOr<String> to_unicode_titlecase_full(StringView string, Optional<StringView> const& locale)
  48. {
  49. StringBuilder builder;
  50. TRY(Detail::build_titlecase_string(Utf8View { string }, builder, locale));
  51. return builder.to_string();
  52. }
  53. Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
  54. bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
  55. Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
  56. bool __attribute__((weak)) code_point_has_property(u32, Property) { return {}; }
  57. bool is_ecma262_property([[maybe_unused]] Property property)
  58. {
  59. #if ENABLE_UNICODE_DATA
  60. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  61. switch (property) {
  62. case Unicode::Property::ASCII:
  63. case Unicode::Property::ASCII_Hex_Digit:
  64. case Unicode::Property::Alphabetic:
  65. case Unicode::Property::Any:
  66. case Unicode::Property::Assigned:
  67. case Unicode::Property::Bidi_Control:
  68. case Unicode::Property::Bidi_Mirrored:
  69. case Unicode::Property::Case_Ignorable:
  70. case Unicode::Property::Cased:
  71. case Unicode::Property::Changes_When_Casefolded:
  72. case Unicode::Property::Changes_When_Casemapped:
  73. case Unicode::Property::Changes_When_Lowercased:
  74. case Unicode::Property::Changes_When_NFKC_Casefolded:
  75. case Unicode::Property::Changes_When_Titlecased:
  76. case Unicode::Property::Changes_When_Uppercased:
  77. case Unicode::Property::Dash:
  78. case Unicode::Property::Default_Ignorable_Code_Point:
  79. case Unicode::Property::Deprecated:
  80. case Unicode::Property::Diacritic:
  81. case Unicode::Property::Emoji:
  82. case Unicode::Property::Emoji_Component:
  83. case Unicode::Property::Emoji_Modifier:
  84. case Unicode::Property::Emoji_Modifier_Base:
  85. case Unicode::Property::Emoji_Presentation:
  86. case Unicode::Property::Extended_Pictographic:
  87. case Unicode::Property::Extender:
  88. case Unicode::Property::Grapheme_Base:
  89. case Unicode::Property::Grapheme_Extend:
  90. case Unicode::Property::Hex_Digit:
  91. case Unicode::Property::IDS_Binary_Operator:
  92. case Unicode::Property::IDS_Trinary_Operator:
  93. case Unicode::Property::ID_Continue:
  94. case Unicode::Property::ID_Start:
  95. case Unicode::Property::Ideographic:
  96. case Unicode::Property::Join_Control:
  97. case Unicode::Property::Logical_Order_Exception:
  98. case Unicode::Property::Lowercase:
  99. case Unicode::Property::Math:
  100. case Unicode::Property::Noncharacter_Code_Point:
  101. case Unicode::Property::Pattern_Syntax:
  102. case Unicode::Property::Pattern_White_Space:
  103. case Unicode::Property::Quotation_Mark:
  104. case Unicode::Property::Radical:
  105. case Unicode::Property::Regional_Indicator:
  106. case Unicode::Property::Sentence_Terminal:
  107. case Unicode::Property::Soft_Dotted:
  108. case Unicode::Property::Terminal_Punctuation:
  109. case Unicode::Property::Unified_Ideograph:
  110. case Unicode::Property::Uppercase:
  111. case Unicode::Property::Variation_Selector:
  112. case Unicode::Property::White_Space:
  113. case Unicode::Property::XID_Continue:
  114. case Unicode::Property::XID_Start:
  115. return true;
  116. default:
  117. return false;
  118. }
  119. #else
  120. return false;
  121. #endif
  122. }
  123. Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {}; }
  124. bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
  125. bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
  126. Optional<Block> __attribute__((weak)) block_from_string(StringView) { return {}; }
  127. bool __attribute__((weak)) code_point_has_block(u32, Block) { return {}; }
  128. bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; }
  129. bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
  130. bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
  131. Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  132. {
  133. #if ENABLE_UNICODE_DATA
  134. using GBP = GraphemeBreakProperty;
  135. Vector<size_t> boundaries;
  136. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  137. if (view.length_in_code_points() == 0)
  138. return boundaries;
  139. auto has_any_gbp = [](u32 code_point, auto&&... properties) {
  140. return (code_point_has_grapheme_break_property(code_point, properties) || ...);
  141. };
  142. // GB1
  143. boundaries.append(0);
  144. if (view.length_in_code_points() > 1) {
  145. auto it = view.begin();
  146. auto code_point = *it;
  147. u32 next_code_point;
  148. auto current_ri_chain = 0;
  149. auto in_emoji_sequence = false;
  150. for (++it; it != view.end(); ++it, code_point = next_code_point) {
  151. next_code_point = *it;
  152. auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
  153. auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
  154. // GB3
  155. if (code_point_is_cr && next_code_point_is_lf)
  156. continue;
  157. // GB4, GB5
  158. if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
  159. boundaries.append(view.code_unit_offset_of(it));
  160. continue;
  161. }
  162. auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
  163. auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
  164. // GB6
  165. if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
  166. continue;
  167. // GB7
  168. if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
  169. continue;
  170. // GB8
  171. if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
  172. continue;
  173. auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
  174. if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic))
  175. in_emoji_sequence = true;
  176. else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj)
  177. in_emoji_sequence = false;
  178. // GB9
  179. if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
  180. continue;
  181. // GB9a
  182. if (has_any_gbp(next_code_point, GBP::SpacingMark))
  183. continue;
  184. // GB9b
  185. if (has_any_gbp(code_point, GBP::Prepend))
  186. continue;
  187. // GB11
  188. if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  189. continue;
  190. auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
  191. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  192. // GB12, GB13
  193. if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  194. continue;
  195. // GB999
  196. boundaries.append(view.code_unit_offset_of(it));
  197. }
  198. }
  199. // GB2
  200. boundaries.append(view.length_in_code_units());
  201. return boundaries;
  202. #else
  203. return {};
  204. #endif
  205. }
  206. template<typename ViewType>
  207. static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
  208. {
  209. #if ENABLE_UNICODE_DATA
  210. using WBP = WordBreakProperty;
  211. Vector<size_t> boundaries;
  212. // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
  213. if (view.is_empty())
  214. return boundaries;
  215. auto has_any_wbp = [](u32 code_point, auto&&... properties) {
  216. return (code_point_has_word_break_property(code_point, properties) || ...);
  217. };
  218. size_t code_unit_length = 0;
  219. size_t code_point_length = 0;
  220. if constexpr (requires { view.byte_length(); }) {
  221. code_unit_length = view.byte_length();
  222. code_point_length = view.length();
  223. } else if constexpr (requires { view.length_in_code_units(); }) {
  224. code_unit_length = view.length_in_code_units();
  225. code_point_length = view.length_in_code_points();
  226. } else {
  227. static_assert(DependentFalse<ViewType>);
  228. }
  229. auto code_unit_offset_of = [&](auto it) {
  230. if constexpr (requires { view.byte_offset_of(it); })
  231. return view.byte_offset_of(it);
  232. else if constexpr (requires { view.code_unit_offset_of(it); })
  233. return view.code_unit_offset_of(it);
  234. VERIFY_NOT_REACHED();
  235. };
  236. // WB1
  237. boundaries.append(0);
  238. if (code_point_length > 1) {
  239. auto it = view.begin();
  240. auto code_point = *it;
  241. u32 next_code_point;
  242. Optional<u32> previous_code_point;
  243. auto current_ri_chain = 0;
  244. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  245. next_code_point = *it;
  246. auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
  247. auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
  248. // WB3
  249. if (code_point_is_cr && next_code_point_is_lf)
  250. continue;
  251. // WB3a, WB3b
  252. if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
  253. boundaries.append(code_unit_offset_of(it));
  254. continue;
  255. }
  256. // WB3c
  257. if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  258. continue;
  259. // WB3d
  260. if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
  261. continue;
  262. // WB4
  263. if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
  264. continue;
  265. auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
  266. auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
  267. auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
  268. auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
  269. // WB5
  270. if (code_point_is_ah_letter && next_code_point_is_ah_letter)
  271. continue;
  272. Optional<u32> next_next_code_point;
  273. if (it != view.end()) {
  274. auto it_copy = it;
  275. ++it_copy;
  276. if (it_copy != view.end())
  277. next_next_code_point = *it;
  278. }
  279. bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
  280. bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
  281. auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
  282. // WB6
  283. if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
  284. continue;
  285. auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
  286. auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
  287. auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
  288. // WB7
  289. if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
  290. continue;
  291. // WB7a
  292. if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
  293. continue;
  294. // WB7b
  295. if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
  296. continue;
  297. // WB7c
  298. if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
  299. continue;
  300. auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
  301. auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
  302. // WB8
  303. if (code_point_is_numeric && next_code_point_is_numeric)
  304. continue;
  305. // WB9
  306. if (code_point_is_ah_letter && next_code_point_is_numeric)
  307. continue;
  308. // WB10
  309. if (code_point_is_numeric && next_code_point_is_ah_letter)
  310. continue;
  311. auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric);
  312. // WB11
  313. if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
  314. continue;
  315. bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
  316. // WB12
  317. if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
  318. continue;
  319. auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
  320. auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
  321. // WB13
  322. if (code_point_is_katakana && next_code_point_is_katakana)
  323. continue;
  324. auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
  325. // WB13a
  326. if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
  327. continue;
  328. // WB13b
  329. if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
  330. continue;
  331. auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
  332. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  333. // WB15, WB16
  334. if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  335. continue;
  336. // WB999
  337. boundaries.append(code_unit_offset_of(it));
  338. }
  339. }
  340. // WB2
  341. boundaries.append(code_unit_length);
  342. return boundaries;
  343. #else
  344. return {};
  345. #endif
  346. }
  347. Vector<size_t> find_word_segmentation_boundaries(Utf8View const& view)
  348. {
  349. return find_word_segmentation_boundaries_impl(view);
  350. }
  351. Vector<size_t> find_word_segmentation_boundaries(Utf16View const& view)
  352. {
  353. return find_word_segmentation_boundaries_impl(view);
  354. }
  355. Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  356. {
  357. #if ENABLE_UNICODE_DATA
  358. using SBP = SentenceBreakProperty;
  359. Vector<size_t> boundaries;
  360. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  361. if (view.length_in_code_points() == 0)
  362. return boundaries;
  363. auto has_any_sbp = [](u32 code_point, auto&&... properties) {
  364. return (code_point_has_sentence_break_property(code_point, properties) || ...);
  365. };
  366. // SB1
  367. boundaries.append(0);
  368. if (view.length_in_code_points() > 1) {
  369. auto it = view.begin();
  370. auto code_point = *it;
  371. u32 next_code_point;
  372. Optional<u32> previous_code_point;
  373. enum class TerminatorSequenceState {
  374. None,
  375. Term,
  376. Close,
  377. Sp
  378. } terminator_sequence_state { TerminatorSequenceState::None };
  379. auto term_was_a_term = false;
  380. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  381. next_code_point = *it;
  382. auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
  383. auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
  384. // SB3
  385. if (code_point_is_cr && next_code_point_is_lf)
  386. continue;
  387. auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
  388. // SB4
  389. if (code_point_is_para_sep) {
  390. boundaries.append(view.code_unit_offset_of(it));
  391. continue;
  392. }
  393. // SB5
  394. if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
  395. continue;
  396. auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
  397. // SB6
  398. if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
  399. continue;
  400. // SB7
  401. if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
  402. continue;
  403. if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
  404. terminator_sequence_state = TerminatorSequenceState::Term;
  405. term_was_a_term = code_point_is_a_term;
  406. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
  407. terminator_sequence_state = TerminatorSequenceState::Close;
  408. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
  409. terminator_sequence_state = TerminatorSequenceState::Sp;
  410. } else {
  411. terminator_sequence_state = TerminatorSequenceState::None;
  412. }
  413. // SB8
  414. if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
  415. auto it_copy = it;
  416. bool illegal_sequence = false;
  417. for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
  418. if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
  419. continue;
  420. illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
  421. }
  422. if (illegal_sequence)
  423. continue;
  424. }
  425. // SB8a
  426. if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
  427. continue;
  428. auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
  429. auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
  430. // SB9
  431. if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
  432. continue;
  433. // SB10
  434. if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
  435. continue;
  436. // SB11
  437. if (terminator_sequence_state >= TerminatorSequenceState::Term)
  438. boundaries.append(view.code_unit_offset_of(it));
  439. // SB998
  440. }
  441. }
  442. // SB2
  443. boundaries.append(view.length_in_code_units());
  444. return boundaries;
  445. #else
  446. return {};
  447. #endif
  448. }
  449. }