CharacterTypes.cpp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Platform.h>
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf16View.h>
  11. #include <AK/Utf8View.h>
  12. #include <LibUnicode/CharacterTypes.h>
  13. #include <LibUnicode/UnicodeUtils.h>
  14. #if ENABLE_UNICODE_DATA
  15. # include <LibUnicode/UnicodeData.h>
  16. #endif
  17. // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
  18. // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
  19. namespace Unicode {
  20. Optional<DeprecatedString> __attribute__((weak)) code_point_display_name(u32) { return {}; }
  21. Optional<StringView> __attribute__((weak)) code_point_block_display_name(u32) { return {}; }
  22. Optional<StringView> __attribute__((weak)) code_point_abbreviation(u32) { return {}; }
  23. u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
  24. Span<BlockName const> __attribute__((weak)) block_display_names() { return {}; }
  25. u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point)
  26. {
  27. return to_ascii_lowercase(code_point);
  28. }
  29. u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point)
  30. {
  31. return to_ascii_uppercase(code_point);
  32. }
  33. ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView string, Optional<StringView> const& locale)
  34. {
  35. StringBuilder builder;
  36. TRY(Detail::build_lowercase_string(Utf8View { string }, builder, locale));
  37. return builder.to_deprecated_string();
  38. }
  39. ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView string, Optional<StringView> const& locale)
  40. {
  41. StringBuilder builder;
  42. TRY(Detail::build_uppercase_string(Utf8View { string }, builder, locale));
  43. return builder.to_deprecated_string();
  44. }
  45. Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
  46. bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
  47. Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
  48. bool __attribute__((weak)) code_point_has_property(u32, Property) { return {}; }
  49. bool is_ecma262_property([[maybe_unused]] Property property)
  50. {
  51. #if ENABLE_UNICODE_DATA
  52. // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
  53. switch (property) {
  54. case Unicode::Property::ASCII:
  55. case Unicode::Property::ASCII_Hex_Digit:
  56. case Unicode::Property::Alphabetic:
  57. case Unicode::Property::Any:
  58. case Unicode::Property::Assigned:
  59. case Unicode::Property::Bidi_Control:
  60. case Unicode::Property::Bidi_Mirrored:
  61. case Unicode::Property::Case_Ignorable:
  62. case Unicode::Property::Cased:
  63. case Unicode::Property::Changes_When_Casefolded:
  64. case Unicode::Property::Changes_When_Casemapped:
  65. case Unicode::Property::Changes_When_Lowercased:
  66. case Unicode::Property::Changes_When_NFKC_Casefolded:
  67. case Unicode::Property::Changes_When_Titlecased:
  68. case Unicode::Property::Changes_When_Uppercased:
  69. case Unicode::Property::Dash:
  70. case Unicode::Property::Default_Ignorable_Code_Point:
  71. case Unicode::Property::Deprecated:
  72. case Unicode::Property::Diacritic:
  73. case Unicode::Property::Emoji:
  74. case Unicode::Property::Emoji_Component:
  75. case Unicode::Property::Emoji_Modifier:
  76. case Unicode::Property::Emoji_Modifier_Base:
  77. case Unicode::Property::Emoji_Presentation:
  78. case Unicode::Property::Extended_Pictographic:
  79. case Unicode::Property::Extender:
  80. case Unicode::Property::Grapheme_Base:
  81. case Unicode::Property::Grapheme_Extend:
  82. case Unicode::Property::Hex_Digit:
  83. case Unicode::Property::IDS_Binary_Operator:
  84. case Unicode::Property::IDS_Trinary_Operator:
  85. case Unicode::Property::ID_Continue:
  86. case Unicode::Property::ID_Start:
  87. case Unicode::Property::Ideographic:
  88. case Unicode::Property::Join_Control:
  89. case Unicode::Property::Logical_Order_Exception:
  90. case Unicode::Property::Lowercase:
  91. case Unicode::Property::Math:
  92. case Unicode::Property::Noncharacter_Code_Point:
  93. case Unicode::Property::Pattern_Syntax:
  94. case Unicode::Property::Pattern_White_Space:
  95. case Unicode::Property::Quotation_Mark:
  96. case Unicode::Property::Radical:
  97. case Unicode::Property::Regional_Indicator:
  98. case Unicode::Property::Sentence_Terminal:
  99. case Unicode::Property::Soft_Dotted:
  100. case Unicode::Property::Terminal_Punctuation:
  101. case Unicode::Property::Unified_Ideograph:
  102. case Unicode::Property::Uppercase:
  103. case Unicode::Property::Variation_Selector:
  104. case Unicode::Property::White_Space:
  105. case Unicode::Property::XID_Continue:
  106. case Unicode::Property::XID_Start:
  107. return true;
  108. default:
  109. return false;
  110. }
  111. #else
  112. return false;
  113. #endif
  114. }
  115. Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {}; }
  116. bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
  117. bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
  118. Optional<Block> __attribute__((weak)) block_from_string(StringView) { return {}; }
  119. bool __attribute__((weak)) code_point_has_block(u32, Block) { return {}; }
  120. bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; }
  121. bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
  122. bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
  123. Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  124. {
  125. #if ENABLE_UNICODE_DATA
  126. using GBP = GraphemeBreakProperty;
  127. Vector<size_t> boundaries;
  128. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  129. if (view.length_in_code_points() == 0)
  130. return boundaries;
  131. auto has_any_gbp = [](u32 code_point, auto&&... properties) {
  132. return (code_point_has_grapheme_break_property(code_point, properties) || ...);
  133. };
  134. // GB1
  135. boundaries.append(0);
  136. if (view.length_in_code_points() > 1) {
  137. auto it = view.begin();
  138. auto code_point = *it;
  139. u32 next_code_point;
  140. auto current_ri_chain = 0;
  141. auto in_emoji_sequence = false;
  142. for (++it; it != view.end(); ++it, code_point = next_code_point) {
  143. next_code_point = *it;
  144. auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
  145. auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
  146. // GB3
  147. if (code_point_is_cr && next_code_point_is_lf)
  148. continue;
  149. // GB4, GB5
  150. if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
  151. boundaries.append(view.code_unit_offset_of(it));
  152. continue;
  153. }
  154. auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
  155. auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
  156. // GB6
  157. if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
  158. continue;
  159. // GB7
  160. if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
  161. continue;
  162. // GB8
  163. if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
  164. continue;
  165. auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
  166. if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic))
  167. in_emoji_sequence = true;
  168. else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj)
  169. in_emoji_sequence = false;
  170. // GB9
  171. if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
  172. continue;
  173. // GB9a
  174. if (has_any_gbp(next_code_point, GBP::SpacingMark))
  175. continue;
  176. // GB9b
  177. if (has_any_gbp(code_point, GBP::Prepend))
  178. continue;
  179. // GB11
  180. if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  181. continue;
  182. auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
  183. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  184. // GB12, GB13
  185. if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  186. continue;
  187. // GB999
  188. boundaries.append(view.code_unit_offset_of(it));
  189. }
  190. }
  191. // GB2
  192. boundaries.append(view.length_in_code_units());
  193. return boundaries;
  194. #else
  195. return {};
  196. #endif
  197. }
  198. Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  199. {
  200. #if ENABLE_UNICODE_DATA
  201. using WBP = WordBreakProperty;
  202. Vector<size_t> boundaries;
  203. // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
  204. if (view.length_in_code_points() == 0)
  205. return boundaries;
  206. auto has_any_wbp = [](u32 code_point, auto&&... properties) {
  207. return (code_point_has_word_break_property(code_point, properties) || ...);
  208. };
  209. // WB1
  210. boundaries.append(0);
  211. if (view.length_in_code_points() > 1) {
  212. auto it = view.begin();
  213. auto code_point = *it;
  214. u32 next_code_point;
  215. Optional<u32> previous_code_point;
  216. auto current_ri_chain = 0;
  217. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  218. next_code_point = *it;
  219. auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
  220. auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
  221. // WB3
  222. if (code_point_is_cr && next_code_point_is_lf)
  223. continue;
  224. // WB3a, WB3b
  225. if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
  226. boundaries.append(view.code_unit_offset_of(it));
  227. continue;
  228. }
  229. // WB3c
  230. if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
  231. continue;
  232. // WB3d
  233. if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
  234. continue;
  235. // WB4
  236. if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
  237. continue;
  238. auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
  239. auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
  240. auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
  241. auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
  242. // WB5
  243. if (code_point_is_ah_letter && next_code_point_is_ah_letter)
  244. continue;
  245. Optional<u32> next_next_code_point;
  246. if (it != view.end()) {
  247. auto it_copy = it;
  248. ++it_copy;
  249. if (it_copy != view.end())
  250. next_next_code_point = *it;
  251. }
  252. bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
  253. bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
  254. auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
  255. // WB6
  256. if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
  257. continue;
  258. auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
  259. auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
  260. auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
  261. // WB7
  262. if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
  263. continue;
  264. // WB7a
  265. if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
  266. continue;
  267. // WB7b
  268. if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
  269. continue;
  270. // WB7c
  271. if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
  272. continue;
  273. auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
  274. auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
  275. // WB8
  276. if (code_point_is_numeric && next_code_point_is_numeric)
  277. continue;
  278. // WB9
  279. if (code_point_is_ah_letter && next_code_point_is_numeric)
  280. continue;
  281. // WB10
  282. if (code_point_is_numeric && next_code_point_is_ah_letter)
  283. continue;
  284. auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric);
  285. // WB11
  286. if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
  287. continue;
  288. bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
  289. // WB12
  290. if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
  291. continue;
  292. auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
  293. auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
  294. // WB13
  295. if (code_point_is_katakana && next_code_point_is_katakana)
  296. continue;
  297. auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
  298. // WB13a
  299. if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
  300. continue;
  301. // WB13b
  302. if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
  303. continue;
  304. auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
  305. current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
  306. // WB15, WB16
  307. if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
  308. continue;
  309. // WB999
  310. boundaries.append(view.code_unit_offset_of(it));
  311. }
  312. }
  313. // WB2
  314. boundaries.append(view.length_in_code_units());
  315. return boundaries;
  316. #else
  317. return {};
  318. #endif
  319. }
  320. Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
  321. {
  322. #if ENABLE_UNICODE_DATA
  323. using SBP = SentenceBreakProperty;
  324. Vector<size_t> boundaries;
  325. // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  326. if (view.length_in_code_points() == 0)
  327. return boundaries;
  328. auto has_any_sbp = [](u32 code_point, auto&&... properties) {
  329. return (code_point_has_sentence_break_property(code_point, properties) || ...);
  330. };
  331. // SB1
  332. boundaries.append(0);
  333. if (view.length_in_code_points() > 1) {
  334. auto it = view.begin();
  335. auto code_point = *it;
  336. u32 next_code_point;
  337. Optional<u32> previous_code_point;
  338. enum class TerminatorSequenceState {
  339. None,
  340. Term,
  341. Close,
  342. Sp
  343. } terminator_sequence_state { TerminatorSequenceState::None };
  344. auto term_was_a_term = false;
  345. for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
  346. next_code_point = *it;
  347. auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
  348. auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
  349. // SB3
  350. if (code_point_is_cr && next_code_point_is_lf)
  351. continue;
  352. auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
  353. // SB4
  354. if (code_point_is_para_sep) {
  355. boundaries.append(view.code_unit_offset_of(it));
  356. continue;
  357. }
  358. // SB5
  359. if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
  360. continue;
  361. auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
  362. // SB6
  363. if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
  364. continue;
  365. // SB7
  366. if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
  367. continue;
  368. if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
  369. terminator_sequence_state = TerminatorSequenceState::Term;
  370. term_was_a_term = code_point_is_a_term;
  371. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
  372. terminator_sequence_state = TerminatorSequenceState::Close;
  373. } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
  374. terminator_sequence_state = TerminatorSequenceState::Sp;
  375. } else {
  376. terminator_sequence_state = TerminatorSequenceState::None;
  377. }
  378. // SB8
  379. if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
  380. auto it_copy = it;
  381. bool illegal_sequence = false;
  382. for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
  383. if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
  384. continue;
  385. illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
  386. }
  387. if (illegal_sequence)
  388. continue;
  389. }
  390. // SB8a
  391. if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
  392. continue;
  393. auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
  394. auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
  395. // SB9
  396. if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
  397. continue;
  398. // SB10
  399. if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
  400. continue;
  401. // SB11
  402. if (terminator_sequence_state >= TerminatorSequenceState::Term)
  403. boundaries.append(view.code_unit_offset_of(it));
  404. // SB998
  405. }
  406. }
  407. // SB2
  408. boundaries.append(view.length_in_code_units());
  409. return boundaries;
  410. #else
  411. return {};
  412. #endif
  413. }
  414. }