Segmenter.cpp 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. /*
  2. * Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #define AK_DONT_REPLACE_STD
  7. #include <AK/Utf16View.h>
  8. #include <AK/Utf32View.h>
  9. #include <LibUnicode/ICU.h>
  10. #include <LibUnicode/Locale.h>
  11. #include <LibUnicode/Segmenter.h>
  12. #include <unicode/brkiter.h>
  13. #include <unicode/utext.h>
  14. #include <unicode/utf8.h>
  15. namespace Unicode {
  16. SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity)
  17. {
  18. if (segmenter_granularity == "grapheme"sv)
  19. return SegmenterGranularity::Grapheme;
  20. if (segmenter_granularity == "sentence"sv)
  21. return SegmenterGranularity::Sentence;
  22. if (segmenter_granularity == "word"sv)
  23. return SegmenterGranularity::Word;
  24. VERIFY_NOT_REACHED();
  25. }
  26. StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity)
  27. {
  28. switch (segmenter_granularity) {
  29. case SegmenterGranularity::Grapheme:
  30. return "grapheme"sv;
  31. case SegmenterGranularity::Sentence:
  32. return "sentence"sv;
  33. case SegmenterGranularity::Word:
  34. return "word"sv;
  35. }
  36. VERIFY_NOT_REACHED();
  37. }
  38. class SegmenterImpl : public Segmenter {
  39. public:
  40. SegmenterImpl(NonnullOwnPtr<icu::BreakIterator> segmenter, SegmenterGranularity segmenter_granularity)
  41. : Segmenter(segmenter_granularity)
  42. , m_segmenter(move(segmenter))
  43. {
  44. }
  45. virtual ~SegmenterImpl() override = default;
  46. virtual NonnullOwnPtr<Segmenter> clone() const override
  47. {
  48. return make<SegmenterImpl>(adopt_own(*m_segmenter->clone()), m_segmenter_granularity);
  49. }
  50. virtual void set_segmented_text(String text) override
  51. {
  52. UErrorCode status = U_ZERO_ERROR;
  53. m_segmented_text = move(text);
  54. auto view = m_segmented_text.get<String>().bytes_as_string_view();
  55. UText utext = UTEXT_INITIALIZER;
  56. utext_openUTF8(&utext, view.characters_without_null_termination(), static_cast<i64>(view.length()), &status);
  57. VERIFY(icu_success(status));
  58. m_segmenter->setText(&utext, status);
  59. VERIFY(icu_success(status));
  60. utext_close(&utext);
  61. }
  62. virtual void set_segmented_text(Utf16View const& text) override
  63. {
  64. m_segmented_text = icu::UnicodeString { text.data(), static_cast<i32>(text.length_in_code_units()) };
  65. m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
  66. }
  67. virtual size_t current_boundary() override
  68. {
  69. return m_segmenter->current();
  70. }
  71. virtual Optional<size_t> previous_boundary(size_t boundary, Inclusive inclusive) override
  72. {
  73. auto icu_boundary = align_boundary(boundary);
  74. if (inclusive == Inclusive::Yes) {
  75. if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
  76. return static_cast<size_t>(icu_boundary);
  77. }
  78. if (auto index = m_segmenter->preceding(icu_boundary); index != icu::BreakIterator::DONE)
  79. return static_cast<size_t>(index);
  80. return {};
  81. }
  82. virtual Optional<size_t> next_boundary(size_t boundary, Inclusive inclusive) override
  83. {
  84. auto icu_boundary = align_boundary(boundary);
  85. if (inclusive == Inclusive::Yes) {
  86. if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
  87. return static_cast<size_t>(icu_boundary);
  88. }
  89. if (auto index = m_segmenter->following(icu_boundary); index != icu::BreakIterator::DONE)
  90. return static_cast<size_t>(index);
  91. return {};
  92. }
  93. virtual void for_each_boundary(String text, SegmentationCallback callback) override
  94. {
  95. if (text.is_empty())
  96. return;
  97. set_segmented_text(move(text));
  98. for_each_boundary(move(callback));
  99. }
  100. virtual void for_each_boundary(Utf16View const& text, SegmentationCallback callback) override
  101. {
  102. if (text.is_empty())
  103. return;
  104. set_segmented_text(text);
  105. for_each_boundary(move(callback));
  106. }
  107. virtual void for_each_boundary(Utf32View const& text, SegmentationCallback callback) override
  108. {
  109. if (text.is_empty())
  110. return;
  111. // FIXME: We should be able to create a custom UText provider to avoid converting to UTF-8 here.
  112. set_segmented_text(MUST(String::formatted("{}", text)));
  113. auto code_points = m_segmented_text.get<String>().code_points();
  114. auto current = code_points.begin();
  115. size_t code_point_index = 0;
  116. for_each_boundary([&](auto index) {
  117. auto it = code_points.iterator_at_byte_offset(index);
  118. while (current != it) {
  119. ++code_point_index;
  120. ++current;
  121. }
  122. return callback(code_point_index);
  123. });
  124. }
  125. virtual bool is_current_boundary_word_like() const override
  126. {
  127. auto status = m_segmenter->getRuleStatus();
  128. if (status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)
  129. return true;
  130. if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT)
  131. return true;
  132. if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT)
  133. return true;
  134. if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT)
  135. return true;
  136. return false;
  137. }
  138. private:
  139. i32 align_boundary(size_t boundary)
  140. {
  141. auto icu_boundary = static_cast<i32>(boundary);
  142. return m_segmented_text.visit(
  143. [&](String const& text) {
  144. U8_SET_CP_START(text.bytes().data(), 0, icu_boundary);
  145. return icu_boundary;
  146. },
  147. [&](icu::UnicodeString const& text) {
  148. return text.getChar32Start(icu_boundary);
  149. },
  150. [](Empty) -> i32 { VERIFY_NOT_REACHED(); });
  151. }
  152. void for_each_boundary(SegmentationCallback callback)
  153. {
  154. if (callback(static_cast<size_t>(m_segmenter->first())) == IterationDecision::Break)
  155. return;
  156. while (true) {
  157. auto index = m_segmenter->next();
  158. if (index == icu::BreakIterator::DONE)
  159. return;
  160. if (callback(static_cast<size_t>(index)) == IterationDecision::Break)
  161. return;
  162. }
  163. }
  164. NonnullOwnPtr<icu::BreakIterator> m_segmenter;
  165. Variant<Empty, String, icu::UnicodeString> m_segmented_text;
  166. };
  167. NonnullOwnPtr<Segmenter> Segmenter::create(SegmenterGranularity segmenter_granularity)
  168. {
  169. return Segmenter::create(default_locale(), segmenter_granularity);
  170. }
  171. NonnullOwnPtr<Segmenter> Segmenter::create(StringView locale, SegmenterGranularity segmenter_granularity)
  172. {
  173. UErrorCode status = U_ZERO_ERROR;
  174. auto locale_data = LocaleData::for_locale(locale);
  175. VERIFY(locale_data.has_value());
  176. auto segmenter = adopt_own_if_nonnull([&]() {
  177. switch (segmenter_granularity) {
  178. case SegmenterGranularity::Grapheme:
  179. return icu::BreakIterator::createCharacterInstance(locale_data->locale(), status);
  180. case SegmenterGranularity::Sentence:
  181. return icu::BreakIterator::createSentenceInstance(locale_data->locale(), status);
  182. case SegmenterGranularity::Word:
  183. return icu::BreakIterator::createWordInstance(locale_data->locale(), status);
  184. }
  185. VERIFY_NOT_REACHED();
  186. }());
  187. VERIFY(icu_success(status));
  188. return make<SegmenterImpl>(segmenter.release_nonnull(), segmenter_granularity);
  189. }
  190. }