123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240 |
- /*
- * Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #define AK_DONT_REPLACE_STD
- #include <AK/Utf16View.h>
- #include <AK/Utf32View.h>
- #include <LibUnicode/ICU.h>
- #include <LibUnicode/Locale.h>
- #include <LibUnicode/Segmenter.h>
- #include <unicode/brkiter.h>
- #include <unicode/utext.h>
- #include <unicode/utf8.h>
- namespace Unicode {
- SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity)
- {
- if (segmenter_granularity == "grapheme"sv)
- return SegmenterGranularity::Grapheme;
- if (segmenter_granularity == "sentence"sv)
- return SegmenterGranularity::Sentence;
- if (segmenter_granularity == "word"sv)
- return SegmenterGranularity::Word;
- VERIFY_NOT_REACHED();
- }
- StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity)
- {
- switch (segmenter_granularity) {
- case SegmenterGranularity::Grapheme:
- return "grapheme"sv;
- case SegmenterGranularity::Sentence:
- return "sentence"sv;
- case SegmenterGranularity::Word:
- return "word"sv;
- }
- VERIFY_NOT_REACHED();
- }
- class SegmenterImpl : public Segmenter {
- public:
- SegmenterImpl(NonnullOwnPtr<icu::BreakIterator> segmenter, SegmenterGranularity segmenter_granularity)
- : Segmenter(segmenter_granularity)
- , m_segmenter(move(segmenter))
- {
- }
- virtual ~SegmenterImpl() override = default;
- virtual NonnullOwnPtr<Segmenter> clone() const override
- {
- return make<SegmenterImpl>(adopt_own(*m_segmenter->clone()), m_segmenter_granularity);
- }
- virtual void set_segmented_text(String text) override
- {
- UErrorCode status = U_ZERO_ERROR;
- m_segmented_text = move(text);
- auto view = m_segmented_text.get<String>().bytes_as_string_view();
- UText utext = UTEXT_INITIALIZER;
- utext_openUTF8(&utext, view.characters_without_null_termination(), static_cast<i64>(view.length()), &status);
- VERIFY(icu_success(status));
- m_segmenter->setText(&utext, status);
- VERIFY(icu_success(status));
- utext_close(&utext);
- }
- virtual void set_segmented_text(Utf16View const& text) override
- {
- m_segmented_text = icu::UnicodeString { text.data(), static_cast<i32>(text.length_in_code_units()) };
- m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
- }
- virtual size_t current_boundary() override
- {
- return m_segmenter->current();
- }
- virtual Optional<size_t> previous_boundary(size_t boundary, Inclusive inclusive) override
- {
- auto icu_boundary = align_boundary(boundary);
- if (inclusive == Inclusive::Yes) {
- if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
- return static_cast<size_t>(icu_boundary);
- }
- if (auto index = m_segmenter->preceding(icu_boundary); index != icu::BreakIterator::DONE)
- return static_cast<size_t>(index);
- return {};
- }
- virtual Optional<size_t> next_boundary(size_t boundary, Inclusive inclusive) override
- {
- auto icu_boundary = align_boundary(boundary);
- if (inclusive == Inclusive::Yes) {
- if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
- return static_cast<size_t>(icu_boundary);
- }
- if (auto index = m_segmenter->following(icu_boundary); index != icu::BreakIterator::DONE)
- return static_cast<size_t>(index);
- return {};
- }
- virtual void for_each_boundary(String text, SegmentationCallback callback) override
- {
- if (text.is_empty())
- return;
- set_segmented_text(move(text));
- for_each_boundary(move(callback));
- }
- virtual void for_each_boundary(Utf16View const& text, SegmentationCallback callback) override
- {
- if (text.is_empty())
- return;
- set_segmented_text(text);
- for_each_boundary(move(callback));
- }
- virtual void for_each_boundary(Utf32View const& text, SegmentationCallback callback) override
- {
- if (text.is_empty())
- return;
- // FIXME: We should be able to create a custom UText provider to avoid converting to UTF-8 here.
- set_segmented_text(MUST(String::formatted("{}", text)));
- auto code_points = m_segmented_text.get<String>().code_points();
- auto current = code_points.begin();
- size_t code_point_index = 0;
- for_each_boundary([&](auto index) {
- auto it = code_points.iterator_at_byte_offset(index);
- while (current != it) {
- ++code_point_index;
- ++current;
- }
- return callback(code_point_index);
- });
- }
- virtual bool is_current_boundary_word_like() const override
- {
- auto status = m_segmenter->getRuleStatus();
- if (status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)
- return true;
- if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT)
- return true;
- if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT)
- return true;
- if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT)
- return true;
- return false;
- }
- private:
- i32 align_boundary(size_t boundary)
- {
- auto icu_boundary = static_cast<i32>(boundary);
- return m_segmented_text.visit(
- [&](String const& text) {
- U8_SET_CP_START(text.bytes().data(), 0, icu_boundary);
- return icu_boundary;
- },
- [&](icu::UnicodeString const& text) {
- return text.getChar32Start(icu_boundary);
- },
- [](Empty) -> i32 { VERIFY_NOT_REACHED(); });
- }
- void for_each_boundary(SegmentationCallback callback)
- {
- if (callback(static_cast<size_t>(m_segmenter->first())) == IterationDecision::Break)
- return;
- while (true) {
- auto index = m_segmenter->next();
- if (index == icu::BreakIterator::DONE)
- return;
- if (callback(static_cast<size_t>(index)) == IterationDecision::Break)
- return;
- }
- }
- NonnullOwnPtr<icu::BreakIterator> m_segmenter;
- Variant<Empty, String, icu::UnicodeString> m_segmented_text;
- };
- NonnullOwnPtr<Segmenter> Segmenter::create(SegmenterGranularity segmenter_granularity)
- {
- return Segmenter::create(default_locale(), segmenter_granularity);
- }
- NonnullOwnPtr<Segmenter> Segmenter::create(StringView locale, SegmenterGranularity segmenter_granularity)
- {
- UErrorCode status = U_ZERO_ERROR;
- auto locale_data = LocaleData::for_locale(locale);
- VERIFY(locale_data.has_value());
- auto segmenter = adopt_own_if_nonnull([&]() {
- switch (segmenter_granularity) {
- case SegmenterGranularity::Grapheme:
- return icu::BreakIterator::createCharacterInstance(locale_data->locale(), status);
- case SegmenterGranularity::Sentence:
- return icu::BreakIterator::createSentenceInstance(locale_data->locale(), status);
- case SegmenterGranularity::Word:
- return icu::BreakIterator::createWordInstance(locale_data->locale(), status);
- }
- VERIFY_NOT_REACHED();
- }());
- VERIFY(icu_success(status));
- return make<SegmenterImpl>(segmenter.release_nonnull(), segmenter_granularity);
- }
- }
|