123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491 |
- /*
- * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
- * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #include <AK/Utf16View.h>
- #include <AK/Utf32View.h>
- #include <AK/Utf8View.h>
- #include <LibUnicode/CharacterTypes.h>
- #include <LibUnicode/Segmentation.h>
- #if ENABLE_UNICODE_DATA
- # include <LibUnicode/UnicodeData.h>
- #endif
- namespace Unicode {
- template<typename ViewType>
- static size_t code_unit_length(ViewType const& view)
- {
- if constexpr (IsSame<ViewType, Utf8View>)
- return view.byte_length();
- else if constexpr (IsSame<ViewType, Utf16View>)
- return view.length_in_code_units();
- else if constexpr (IsSame<ViewType, Utf32View>)
- return view.length();
- else
- static_assert(DependentFalse<ViewType>);
- }
- template<typename ViewType, typename CodeUnitIterator>
- static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
- {
- if constexpr (IsSame<ViewType, Utf8View>)
- return view.byte_offset_of(it);
- else if constexpr (IsSame<ViewType, Utf16View>)
- return view.code_unit_offset_of(it);
- else if constexpr (IsSame<ViewType, Utf32View>)
- return view.iterator_offset(it);
- else
- static_assert(DependentFalse<ViewType>);
- }
- template<typename ViewType>
- static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
- {
- #if ENABLE_UNICODE_DATA
- using GBP = GraphemeBreakProperty;
- // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
- if (view.is_empty())
- return;
- auto has_any_gbp = [](u32 code_point, auto&&... properties) {
- return (code_point_has_grapheme_break_property(code_point, properties) || ...);
- };
- auto skip_incb_extend_linker_sequence = [&](auto& it) {
- while (true) {
- if (it == view.end() || !code_point_has_property(*it, Property::InCB_Extend))
- return;
- auto next_it = it;
- ++next_it;
- if (next_it == view.end() || !code_point_has_property(*next_it, Property::InCB_Linker))
- return;
- it = next_it;
- ++it;
- }
- };
- // GB1
- if (callback(0) == IterationDecision::Break)
- return;
- if (code_unit_length(view) > 1) {
- auto it = view.begin();
- auto code_point = *it;
- u32 next_code_point = 0;
- auto current_ri_chain = 0;
- for (++it; it != view.end(); ++it, code_point = next_code_point) {
- next_code_point = *it;
- // GB9c
- if (code_point_has_property(code_point, Property::InCB_Consonant)) {
- auto it_copy = it;
- skip_incb_extend_linker_sequence(it_copy);
- if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) {
- ++it_copy;
- skip_incb_extend_linker_sequence(it_copy);
- if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) {
- next_code_point = *it_copy;
- it = it_copy;
- continue;
- }
- }
- }
- // GB11
- if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
- auto it_copy = it;
- while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend))
- ++it_copy;
- if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) {
- ++it_copy;
- if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) {
- next_code_point = *it_copy;
- it = it_copy;
- continue;
- }
- }
- }
- auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
- auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
- // GB3
- if (code_point_is_cr && next_code_point_is_lf)
- continue;
- // GB4, GB5
- if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
- if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
- return;
- continue;
- }
- auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
- auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
- // GB6
- if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
- continue;
- // GB7
- if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
- continue;
- // GB8
- if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
- continue;
- // GB9
- if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
- continue;
- // GB9a
- if (has_any_gbp(next_code_point, GBP::SpacingMark))
- continue;
- // GB9b
- if (has_any_gbp(code_point, GBP::Prepend))
- continue;
- auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
- current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
- // GB12, GB13
- if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
- continue;
- // GB999
- if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
- return;
- }
- }
- // GB2
- callback(code_unit_length(view));
- #endif
- }
- void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
- {
- for_each_grapheme_segmentation_boundary_impl(view, move(callback));
- }
- void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
- {
- for_each_grapheme_segmentation_boundary_impl(view, move(callback));
- }
- void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
- {
- for_each_grapheme_segmentation_boundary_impl(view, move(callback));
- }
- template<typename ViewType>
- static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
- {
- #if ENABLE_UNICODE_DATA
- using WBP = WordBreakProperty;
- // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
- if (view.is_empty())
- return;
- auto has_any_wbp = [](u32 code_point, auto&&... properties) {
- return (code_point_has_word_break_property(code_point, properties) || ...);
- };
- // WB1
- if (callback(0) == IterationDecision::Break)
- return;
- if (code_unit_length(view) > 1) {
- auto it = view.begin();
- auto code_point = *it;
- u32 next_code_point;
- Optional<u32> previous_code_point;
- auto current_ri_chain = 0;
- for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
- next_code_point = *it;
- auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
- auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
- // WB3
- if (code_point_is_cr && next_code_point_is_lf)
- continue;
- // WB3a, WB3b
- if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
- if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
- return;
- continue;
- }
- // WB3c
- if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
- continue;
- // WB3d
- if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
- continue;
- // WB4
- if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
- continue;
- auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
- auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
- auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
- auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
- // WB5
- if (code_point_is_ah_letter && next_code_point_is_ah_letter)
- continue;
- Optional<u32> next_next_code_point;
- if (it != view.end()) {
- auto it_copy = it;
- ++it_copy;
- if (it_copy != view.end())
- next_next_code_point = *it_copy;
- }
- bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
- bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
- auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
- // WB6
- if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
- continue;
- auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
- auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
- auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
- // WB7
- if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
- continue;
- // WB7a
- if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
- continue;
- // WB7b
- if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
- continue;
- // WB7c
- if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
- continue;
- auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
- auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
- // WB8
- if (code_point_is_numeric && next_code_point_is_numeric)
- continue;
- // WB9
- if (code_point_is_ah_letter && next_code_point_is_numeric)
- continue;
- // WB10
- if (code_point_is_numeric && next_code_point_is_ah_letter)
- continue;
- auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
- // WB11
- if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
- continue;
- bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
- // WB12
- if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
- continue;
- auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
- auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
- // WB13
- if (code_point_is_katakana && next_code_point_is_katakana)
- continue;
- auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
- // WB13a
- if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
- continue;
- // WB13b
- if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
- continue;
- auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
- current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
- // WB15, WB16
- if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
- continue;
- // WB999
- if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
- return;
- }
- }
- // WB2
- callback(code_unit_length(view));
- #endif
- }
- void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
- {
- for_each_word_segmentation_boundary_impl(view, move(callback));
- }
- void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
- {
- for_each_word_segmentation_boundary_impl(view, move(callback));
- }
- void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
- {
- for_each_word_segmentation_boundary_impl(view, move(callback));
- }
- template<typename ViewType>
- static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
- {
- #if ENABLE_UNICODE_DATA
- using SBP = SentenceBreakProperty;
- // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
- if (view.is_empty())
- return;
- auto has_any_sbp = [](u32 code_point, auto&&... properties) {
- return (code_point_has_sentence_break_property(code_point, properties) || ...);
- };
- // SB1
- if (callback(0) == IterationDecision::Break)
- return;
- if (code_unit_length(view) > 1) {
- auto it = view.begin();
- auto code_point = *it;
- u32 next_code_point;
- Optional<u32> previous_code_point;
- enum class TerminatorSequenceState {
- None,
- Term,
- Close,
- Sp
- } terminator_sequence_state { TerminatorSequenceState::None };
- auto term_was_a_term = false;
- for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
- next_code_point = *it;
- auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
- auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
- // SB3
- if (code_point_is_cr && next_code_point_is_lf)
- continue;
- auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
- // SB4
- if (code_point_is_para_sep) {
- if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
- return;
- continue;
- }
- // SB5
- if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
- continue;
- auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
- // SB6
- if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
- continue;
- // SB7
- if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
- continue;
- if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
- terminator_sequence_state = TerminatorSequenceState::Term;
- term_was_a_term = code_point_is_a_term;
- } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
- terminator_sequence_state = TerminatorSequenceState::Close;
- } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
- terminator_sequence_state = TerminatorSequenceState::Sp;
- } else {
- terminator_sequence_state = TerminatorSequenceState::None;
- }
- // SB8
- if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
- auto it_copy = it;
- bool illegal_sequence = false;
- for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
- if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
- continue;
- illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
- }
- if (illegal_sequence)
- continue;
- }
- // SB8a
- if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
- continue;
- auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
- auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
- // SB9
- if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
- continue;
- // SB10
- if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
- continue;
- // SB11
- if (terminator_sequence_state >= TerminatorSequenceState::Term)
- if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
- return;
- // SB998
- }
- }
- // SB2
- callback(code_unit_length(view));
- #endif
- }
- void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
- {
- for_each_sentence_segmentation_boundary_impl(view, move(callback));
- }
- void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
- {
- for_each_sentence_segmentation_boundary_impl(view, move(callback));
- }
- void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
- {
- for_each_sentence_segmentation_boundary_impl(view, move(callback));
- }
- }
|