CharacterTypes.cpp 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf16View.h>
  11. #include <AK/Utf8View.h>
  12. #include <LibUnicode/CharacterTypes.h>
  13. #include <LibUnicode/Locale.h>
  14. #if ENABLE_UNICODE_DATA
  15. # include <LibUnicode/UnicodeData.h>
  16. #endif
  17. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  18. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  19. namespace Unicode {
  20. Optional<String> __attribute__((weak)) code_point_display_name(u32) { return {}; }
  21. Optional<StringView> __attribute__((weak)) code_point_block_display_name(u32) { return {}; }
  22. Optional<StringView> __attribute__((weak)) code_point_abbreviation(u32) { return {}; }
  23. u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
  24. Span<SpecialCasing const* const> __attribute__((weak)) special_case_mapping(u32) { return {}; }
  25. Span<BlockName const> __attribute__((weak)) block_display_names() { return {}; }
  26. #if ENABLE_UNICODE_DATA
  27. static bool is_after_uppercase_i(Utf8View const& string, size_t index)
  28. {
  29. // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
  30. auto preceding_view = string.substring_view(0, index);
  31. bool found_uppercase_i = false;
  32. // FIXME: Would be better if Utf8View supported reverse iteration.
  33. for (auto code_point : preceding_view) {
  34. if (code_point == 'I') {
  35. found_uppercase_i = true;
  36. continue;
  37. }
  38. u32 combining_class = canonical_combining_class(code_point);
  39. if (combining_class == 0)
  40. found_uppercase_i = false;
  41. else if (combining_class == 230)
  42. found_uppercase_i = false;
  43. }
  44. return found_uppercase_i;
  45. }
  46. static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
  47. {
  48. // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
  49. auto preceding_view = string.substring_view(0, index);
  50. bool found_soft_dotted_code_point = false;
  51. // FIXME: Would be better if Utf8View supported reverse iteration.
  52. for (auto code_point : preceding_view) {
  53. if (code_point_has_property(code_point, Property::Soft_Dotted)) {
  54. found_soft_dotted_code_point = true;
  55. continue;
  56. }
  57. u32 combining_class = canonical_combining_class(code_point);
  58. if (combining_class == 0)
  59. found_soft_dotted_code_point = false;
  60. else if (combining_class == 230)
  61. found_soft_dotted_code_point = false;
  62. }
  63. return found_soft_dotted_code_point;
  64. }
  65. static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
  66. {
  67. // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
  68. // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
  69. // characters and then a cased letter.
  70. auto preceding_view = string.substring_view(0, index);
  71. auto following_view = ((index + byte_length) < string.byte_length())
  72. ? string.substring_view(index + byte_length)
  73. : Utf8View {};
  74. size_t cased_letter_count = 0;
  75. for (auto code_point : preceding_view) {
  76. bool is_cased = code_point_has_property(code_point, Property::Cased);
  77. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  78. if (is_cased && !is_case_ignorable)
  79. ++cased_letter_count;
  80. else if (!is_case_ignorable)
  81. cased_letter_count = 0;
  82. }
  83. if (cased_letter_count == 0)
  84. return false;
  85. for (auto code_point : following_view) {
  86. bool is_cased = code_point_has_property(code_point, Property::Cased);
  87. bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
  88. if (is_case_ignorable)
  89. continue;
  90. if (is_cased)
  91. return false;
  92. break;
  93. }
  94. return true;
  95. }
  96. static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
  97. {
  98. // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
  99. auto following_view = ((index + byte_length) < string.byte_length())
  100. ? string.substring_view(index + byte_length)
  101. : Utf8View {};
  102. for (auto code_point : following_view) {
  103. u32 combining_class = canonical_combining_class(code_point);
  104. if (combining_class == 0)
  105. return false;
  106. if (combining_class == 230)
  107. return true;
  108. }
  109. return false;
  110. }
  111. static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
  112. {
  113. // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
  114. // intervene between the current character and the combining dot above.
  115. auto following_view = ((index + byte_length) < string.byte_length())
  116. ? string.substring_view(index + byte_length)
  117. : Utf8View {};
  118. for (auto code_point : following_view) {
  119. if (code_point == 0x307)
  120. return true;
  121. u32 combining_class = canonical_combining_class(code_point);
  122. if (combining_class == 0)
  123. return false;
  124. if (combining_class == 230)
  125. return false;
  126. }
  127. return false;
  128. }
  129. static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
  130. {
  131. auto requested_locale = Locale::None;
  132. if (locale.has_value()) {
  133. if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
  134. requested_locale = *maybe_locale;
  135. }
  136. auto special_casings = special_case_mapping(code_point);
  137. for (auto const* special_casing : special_casings) {
  138. if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
  139. continue;
  140. switch (special_casing->condition) {
  141. case Condition::None:
  142. return special_casing;
  143. case Condition::AfterI:
  144. if (is_after_uppercase_i(string, index))
  145. return special_casing;
  146. break;
  147. case Condition::AfterSoftDotted:
  148. if (is_after_soft_dotted_code_point(string, index))
  149. return special_casing;
  150. break;
  151. case Condition::FinalSigma:
  152. if (is_final_code_point(string, index, byte_length))
  153. return special_casing;
  154. break;
  155. case Condition::MoreAbove:
  156. if (is_followed_by_combining_class_above(string, index, byte_length))
  157. return special_casing;
  158. break;
  159. case Condition::NotBeforeDot:
  160. if (!is_followed_by_combining_dot_above(string, index, byte_length))
  161. return special_casing;
  162. break;
  163. }
  164. }
  165. return nullptr;
  166. }
  167. #endif
  168. u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point)
  169. {
  170. return to_ascii_lowercase(code_point);
  171. }
  172. u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point)
  173. {
  174. return to_ascii_uppercase(code_point);
  175. }
  176. String to_unicode_lowercase_full(StringView string, [[maybe_unused]] Optional<StringView> locale)
  177. {
  178. #if ENABLE_UNICODE_DATA
  179. Utf8View view { string };
  180. StringBuilder builder;
  181. size_t index = 0;
  182. size_t byte_length = 0;
  183. for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
  184. u32 code_point = *it;
  185. byte_length = it.underlying_code_point_length_in_bytes();
  186. auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length);
  187. if (!special_casing) {
  188. builder.append_code_point(to_unicode_lowercase(code_point));
  189. continue;
  190. }
  191. for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
  192. builder.append_code_point(special_casing->lowercase_mapping[i]);
  193. }
  194. return builder.build();
  195. #else
  196. return string.to_lowercase_string();
  197. #endif
  198. }
  199. String to_unicode_uppercase_full(StringView string, [[maybe_unused]] Optional<StringView> locale)
  200. {
  201. #if ENABLE_UNICODE_DATA
  202. Utf8View view { string };
  203. StringBuilder builder;
  204. size_t index = 0;
  205. size_t byte_length = 0;
  206. for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
  207. u32 code_point = *it;
  208. byte_length = it.underlying_code_point_length_in_bytes();
  209. auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length);
  210. if (!special_casing) {
  211. builder.append_code_point(to_unicode_uppercase(code_point));
  212. continue;
  213. }
  214. for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
  215. builder.append_code_point(special_casing->uppercase_mapping[i]);
  216. }
  217. return builder.build();
  218. #else
  219. return string.to_uppercase_string();
  220. #endif
  221. }
  222. Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
  223. bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
  224. Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
  225. bool __attribute__((weak)) code_point_has_property(u32, Property) { return {}; }
  226. bool is_ecma262_property([[maybe_unused]] Property property)
  227. {
  228. #if ENABLE_UNICODE_DATA
  229. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  230. switch (property) {
  231. case Unicode::Property::ASCII:
  232. case Unicode::Property::ASCII_Hex_Digit:
  233. case Unicode::Property::Alphabetic:
  234. case Unicode::Property::Any:
  235. case Unicode::Property::Assigned:
  236. case Unicode::Property::Bidi_Control:
  237. case Unicode::Property::Bidi_Mirrored:
  238. case Unicode::Property::Case_Ignorable:
  239. case Unicode::Property::Cased:
  240. case Unicode::Property::Changes_When_Casefolded:
  241. case Unicode::Property::Changes_When_Casemapped:
  242. case Unicode::Property::Changes_When_Lowercased:
  243. case Unicode::Property::Changes_When_NFKC_Casefolded:
  244. case Unicode::Property::Changes_When_Titlecased:
  245. case Unicode::Property::Changes_When_Uppercased:
  246. case Unicode::Property::Dash:
  247. case Unicode::Property::Default_Ignorable_Code_Point:
  248. case Unicode::Property::Deprecated:
  249. case Unicode::Property::Diacritic:
  250. case Unicode::Property::Emoji:
  251. case Unicode::Property::Emoji_Component:
  252. case Unicode::Property::Emoji_Modifier:
  253. case Unicode::Property::Emoji_Modifier_Base:
  254. case Unicode::Property::Emoji_Presentation:
  255. case Unicode::Property::Extended_Pictographic:
  256. case Unicode::Property::Extender:
  257. case Unicode::Property::Grapheme_Base:
  258. case Unicode::Property::Grapheme_Extend:
  259. case Unicode::Property::Hex_Digit:
  260. case Unicode::Property::IDS_Binary_Operator:
  261. case Unicode::Property::IDS_Trinary_Operator:
  262. case Unicode::Property::ID_Continue:
  263. case Unicode::Property::ID_Start:
  264. case Unicode::Property::Ideographic:
  265. case Unicode::Property::Join_Control:
  266. case Unicode::Property::Logical_Order_Exception:
  267. case Unicode::Property::Lowercase:
  268. case Unicode::Property::Math:
  269. case Unicode::Property::Noncharacter_Code_Point:
  270. case Unicode::Property::Pattern_Syntax:
  271. case Unicode::Property::Pattern_White_Space:
  272. case Unicode::Property::Quotation_Mark:
  273. case Unicode::Property::Radical:
  274. case Unicode::Property::Regional_Indicator:
  275. case Unicode::Property::Sentence_Terminal:
  276. case Unicode::Property::Soft_Dotted:
  277. case Unicode::Property::Terminal_Punctuation:
  278. case Unicode::Property::Unified_Ideograph:
  279. case Unicode::Property::Uppercase:
  280. case Unicode::Property::Variation_Selector:
  281. case Unicode::Property::White_Space:
  282. case Unicode::Property::XID_Continue:
  283. case Unicode::Property::XID_Start:
  284. return true;
  285. default:
  286. return false;
  287. }
  288. #else
  289. return false;
  290. #endif
  291. }
  292. Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {}; }
  293. bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
  294. bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
  295. Optional<Block> __attribute__((weak)) block_from_string(StringView) { return {}; }
  296. bool __attribute__((weak)) code_point_has_block(u32, Block) { return {}; }
  297. bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; }
  298. bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
  299. bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
  300. Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  301. {
  302. #if ENABLE_UNICODE_DATA
  303. using GBP = GraphemeBreakProperty;
  304. Vector<size_t> boundaries;
  305. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  306. if (view.length_in_code_points() == 0)
  307. return boundaries;
  308. auto has_any_gbp = [](u32 code_point, auto&&... properties) {
  309. return (code_point_has_grapheme_break_property(code_point, properties) || ...);
  310. };
  311. // GB1
  312. boundaries.append(0);
  313. if (view.length_in_code_points() > 1) {
  314. auto it = view.begin();
  315. auto code_point = *it;
  316. u32 next_code_point;
  317. auto current_ri_chain = 0;
  318. auto in_emoji_sequence = false;
  319. for (++it; it != view.end(); ++it, code_point = next_code_point) {
  320. next_code_point = *it;
  321. auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
  322. auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
  323. // GB3
  324. if (code_point_is_cr && next_code_point_is_lf)
  325. continue;
  326. // GB4, GB5
  327. if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
  328. boundaries.append(view.code_unit_offset_of(it));
  329. continue;
  330. }
  331. auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
  332. auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
  333. // GB6
  334. if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
  335. continue;
  336. // GB7
  337. if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
  338. continue;
  339. // GB8
  340. if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
  341. continue;
  342. auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
  343. if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic))
  344. in_emoji_sequence = true;
  345. else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj)
  346. in_emoji_sequence = false;
  347. // GB9
  348. if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
  349. continue;
  350. // GB9a
  351. if (has_any_gbp(next_code_point, GBP::SpacingMark))
  352. continue;
  353. // GB9b
  354. if (has_any_gbp(code_point, GBP::Prepend))
  355. continue;
  356. // GB11
  357. if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  358. continue;
  359. auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
  360. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  361. // GB12, GB13
  362. if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  363. continue;
  364. // GB999
  365. boundaries.append(view.code_unit_offset_of(it));
  366. }
  367. }
  368. // GB2
  369. boundaries.append(view.length_in_code_units());
  370. return boundaries;
  371. #else
  372. return {};
  373. #endif
  374. }
  375. Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  376. {
  377. #if ENABLE_UNICODE_DATA
  378. using WBP = WordBreakProperty;
  379. Vector<size_t> boundaries;
  380. // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
  381. if (view.length_in_code_points() == 0)
  382. return boundaries;
  383. auto has_any_wbp = [](u32 code_point, auto&&... properties) {
  384. return (code_point_has_word_break_property(code_point, properties) || ...);
  385. };
  386. // WB1
  387. boundaries.append(0);
  388. if (view.length_in_code_points() > 1) {
  389. auto it = view.begin();
  390. auto code_point = *it;
  391. u32 next_code_point;
  392. Optional<u32> previous_code_point;
  393. auto current_ri_chain = 0;
  394. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  395. next_code_point = *it;
  396. auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
  397. auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
  398. // WB3
  399. if (code_point_is_cr && next_code_point_is_lf)
  400. continue;
  401. // WB3a, WB3b
  402. if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
  403. boundaries.append(view.code_unit_offset_of(it));
  404. continue;
  405. }
  406. // WB3c
  407. if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  408. continue;
  409. // WB3d
  410. if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
  411. continue;
  412. // WB4
  413. if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
  414. continue;
  415. auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
  416. auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
  417. auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
  418. auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
  419. // WB5
  420. if (code_point_is_ah_letter && next_code_point_is_ah_letter)
  421. continue;
  422. Optional<u32> next_next_code_point;
  423. if (it != view.end()) {
  424. auto it_copy = it;
  425. ++it_copy;
  426. if (it_copy != view.end())
  427. next_next_code_point = *it;
  428. }
  429. bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
  430. bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
  431. auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
  432. // WB6
  433. if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
  434. continue;
  435. auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
  436. auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
  437. auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
  438. // WB7
  439. if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
  440. continue;
  441. // WB7a
  442. if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
  443. continue;
  444. // WB7b
  445. if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
  446. continue;
  447. // WB7c
  448. if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
  449. continue;
  450. auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
  451. auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
  452. // WB8
  453. if (code_point_is_numeric && next_code_point_is_numeric)
  454. continue;
  455. // WB9
  456. if (code_point_is_ah_letter && next_code_point_is_numeric)
  457. continue;
  458. // WB10
  459. if (code_point_is_numeric && next_code_point_is_ah_letter)
  460. continue;
  461. auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric);
  462. // WB11
  463. if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
  464. continue;
  465. bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
  466. // WB12
  467. if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
  468. continue;
  469. auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
  470. auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
  471. // WB13
  472. if (code_point_is_katakana && next_code_point_is_katakana)
  473. continue;
  474. auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
  475. // WB13a
  476. if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
  477. continue;
  478. // WB13b
  479. if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
  480. continue;
  481. auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
  482. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  483. // WB15, WB16
  484. if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  485. continue;
  486. // WB999
  487. boundaries.append(view.code_unit_offset_of(it));
  488. }
  489. }
  490. // WB2
  491. boundaries.append(view.length_in_code_units());
  492. return boundaries;
  493. #else
  494. return {};
  495. #endif
  496. }
  497. Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  498. {
  499. #if ENABLE_UNICODE_DATA
  500. using SBP = SentenceBreakProperty;
  501. Vector<size_t> boundaries;
  502. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  503. if (view.length_in_code_points() == 0)
  504. return boundaries;
  505. auto has_any_sbp = [](u32 code_point, auto&&... properties) {
  506. return (code_point_has_sentence_break_property(code_point, properties) || ...);
  507. };
  508. // SB1
  509. boundaries.append(0);
  510. if (view.length_in_code_points() > 1) {
  511. auto it = view.begin();
  512. auto code_point = *it;
  513. u32 next_code_point;
  514. Optional<u32> previous_code_point;
  515. enum class TerminatorSequenceState {
  516. None,
  517. Term,
  518. Close,
  519. Sp
  520. } terminator_sequence_state { TerminatorSequenceState::None };
  521. auto term_was_a_term = false;
  522. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  523. next_code_point = *it;
  524. auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
  525. auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
  526. // SB3
  527. if (code_point_is_cr && next_code_point_is_lf)
  528. continue;
  529. auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
  530. // SB4
  531. if (code_point_is_para_sep) {
  532. boundaries.append(view.code_unit_offset_of(it));
  533. continue;
  534. }
  535. // SB5
  536. if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
  537. continue;
  538. auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
  539. // SB6
  540. if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
  541. continue;
  542. // SB7
  543. if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
  544. continue;
  545. if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
  546. terminator_sequence_state = TerminatorSequenceState::Term;
  547. term_was_a_term = code_point_is_a_term;
  548. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
  549. terminator_sequence_state = TerminatorSequenceState::Close;
  550. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
  551. terminator_sequence_state = TerminatorSequenceState::Sp;
  552. } else {
  553. terminator_sequence_state = TerminatorSequenceState::None;
  554. }
  555. // SB8
  556. if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
  557. auto it_copy = it;
  558. bool illegal_sequence = false;
  559. for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
  560. if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
  561. continue;
  562. illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
  563. }
  564. if (illegal_sequence)
  565. continue;
  566. }
  567. // SB8a
  568. if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
  569. continue;
  570. auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
  571. auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
  572. // SB9
  573. if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
  574. continue;
  575. // SB10
  576. if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
  577. continue;
  578. // SB11
  579. if (terminator_sequence_state >= TerminatorSequenceState::Term)
  580. boundaries.append(view.code_unit_offset_of(it));
  581. // SB998
  582. }
  583. }
  584. // SB2
  585. boundaries.append(view.length_in_code_units());
  586. return boundaries;
  587. #else
  588. return {};
  589. #endif
  590. }
  591. }