Segmenter.cpp 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. /*
  2. * Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Utf16View.h>
  7. #include <AK/Utf32View.h>
  8. #include <LibUnicode/ICU.h>
  9. #include <LibUnicode/Locale.h>
  10. #include <LibUnicode/Segmenter.h>
  11. #include <unicode/brkiter.h>
  12. #include <unicode/utext.h>
  13. #include <unicode/utf8.h>
  14. namespace Unicode {
  15. SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity)
  16. {
  17. if (segmenter_granularity == "grapheme"sv)
  18. return SegmenterGranularity::Grapheme;
  19. if (segmenter_granularity == "sentence"sv)
  20. return SegmenterGranularity::Sentence;
  21. if (segmenter_granularity == "word"sv)
  22. return SegmenterGranularity::Word;
  23. VERIFY_NOT_REACHED();
  24. }
  25. StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity)
  26. {
  27. switch (segmenter_granularity) {
  28. case SegmenterGranularity::Grapheme:
  29. return "grapheme"sv;
  30. case SegmenterGranularity::Sentence:
  31. return "sentence"sv;
  32. case SegmenterGranularity::Word:
  33. return "word"sv;
  34. }
  35. VERIFY_NOT_REACHED();
  36. }
  37. class SegmenterImpl : public Segmenter {
  38. public:
  39. SegmenterImpl(NonnullOwnPtr<icu::BreakIterator> segmenter, SegmenterGranularity segmenter_granularity)
  40. : Segmenter(segmenter_granularity)
  41. , m_segmenter(move(segmenter))
  42. {
  43. }
  44. virtual ~SegmenterImpl() override = default;
  45. virtual NonnullOwnPtr<Segmenter> clone() const override
  46. {
  47. return make<SegmenterImpl>(adopt_own(*m_segmenter->clone()), m_segmenter_granularity);
  48. }
  49. virtual void set_segmented_text(String text) override
  50. {
  51. UErrorCode status = U_ZERO_ERROR;
  52. m_segmented_text = move(text);
  53. auto view = m_segmented_text.get<String>().bytes_as_string_view();
  54. UText utext = UTEXT_INITIALIZER;
  55. utext_openUTF8(&utext, view.characters_without_null_termination(), static_cast<i64>(view.length()), &status);
  56. VERIFY(icu_success(status));
  57. m_segmenter->setText(&utext, status);
  58. VERIFY(icu_success(status));
  59. utext_close(&utext);
  60. }
  61. virtual void set_segmented_text(Utf16View const& text) override
  62. {
  63. m_segmented_text = icu::UnicodeString { text.data(), static_cast<i32>(text.length_in_code_units()) };
  64. m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
  65. }
  66. virtual size_t current_boundary() override
  67. {
  68. return m_segmenter->current();
  69. }
  70. virtual Optional<size_t> previous_boundary(size_t boundary, Inclusive inclusive) override
  71. {
  72. auto icu_boundary = align_boundary(boundary);
  73. if (inclusive == Inclusive::Yes) {
  74. if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
  75. return static_cast<size_t>(icu_boundary);
  76. }
  77. if (auto index = m_segmenter->preceding(icu_boundary); index != icu::BreakIterator::DONE)
  78. return static_cast<size_t>(index);
  79. return {};
  80. }
  81. virtual Optional<size_t> next_boundary(size_t boundary, Inclusive inclusive) override
  82. {
  83. auto icu_boundary = align_boundary(boundary);
  84. if (inclusive == Inclusive::Yes) {
  85. if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
  86. return static_cast<size_t>(icu_boundary);
  87. }
  88. if (auto index = m_segmenter->following(icu_boundary); index != icu::BreakIterator::DONE)
  89. return static_cast<size_t>(index);
  90. return {};
  91. }
  92. virtual void for_each_boundary(String text, SegmentationCallback callback) override
  93. {
  94. if (text.is_empty())
  95. return;
  96. set_segmented_text(move(text));
  97. for_each_boundary(move(callback));
  98. }
  99. virtual void for_each_boundary(Utf16View const& text, SegmentationCallback callback) override
  100. {
  101. if (text.is_empty())
  102. return;
  103. set_segmented_text(text);
  104. for_each_boundary(move(callback));
  105. }
  106. virtual void for_each_boundary(Utf32View const& text, SegmentationCallback callback) override
  107. {
  108. if (text.is_empty())
  109. return;
  110. // FIXME: We should be able to create a custom UText provider to avoid converting to UTF-8 here.
  111. set_segmented_text(MUST(String::formatted("{}", text)));
  112. auto code_points = m_segmented_text.get<String>().code_points();
  113. auto current = code_points.begin();
  114. size_t code_point_index = 0;
  115. for_each_boundary([&](auto index) {
  116. auto it = code_points.iterator_at_byte_offset(index);
  117. while (current != it) {
  118. ++code_point_index;
  119. ++current;
  120. }
  121. return callback(code_point_index);
  122. });
  123. }
  124. virtual bool is_current_boundary_word_like() const override
  125. {
  126. auto status = m_segmenter->getRuleStatus();
  127. if (status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)
  128. return true;
  129. if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT)
  130. return true;
  131. if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT)
  132. return true;
  133. if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT)
  134. return true;
  135. return false;
  136. }
  137. private:
  138. i32 align_boundary(size_t boundary)
  139. {
  140. auto icu_boundary = static_cast<i32>(boundary);
  141. return m_segmented_text.visit(
  142. [&](String const& text) {
  143. U8_SET_CP_START(text.bytes().data(), 0, icu_boundary);
  144. return icu_boundary;
  145. },
  146. [&](icu::UnicodeString const& text) {
  147. return text.getChar32Start(icu_boundary);
  148. },
  149. [](Empty) -> i32 { VERIFY_NOT_REACHED(); });
  150. }
  151. void for_each_boundary(SegmentationCallback callback)
  152. {
  153. if (callback(static_cast<size_t>(m_segmenter->first())) == IterationDecision::Break)
  154. return;
  155. while (true) {
  156. auto index = m_segmenter->next();
  157. if (index == icu::BreakIterator::DONE)
  158. return;
  159. if (callback(static_cast<size_t>(index)) == IterationDecision::Break)
  160. return;
  161. }
  162. }
  163. NonnullOwnPtr<icu::BreakIterator> m_segmenter;
  164. Variant<Empty, String, icu::UnicodeString> m_segmented_text;
  165. };
  166. NonnullOwnPtr<Segmenter> Segmenter::create(SegmenterGranularity segmenter_granularity)
  167. {
  168. return Segmenter::create(default_locale(), segmenter_granularity);
  169. }
  170. NonnullOwnPtr<Segmenter> Segmenter::create(StringView locale, SegmenterGranularity segmenter_granularity)
  171. {
  172. UErrorCode status = U_ZERO_ERROR;
  173. auto locale_data = LocaleData::for_locale(locale);
  174. VERIFY(locale_data.has_value());
  175. auto segmenter = adopt_own_if_nonnull([&]() {
  176. switch (segmenter_granularity) {
  177. case SegmenterGranularity::Grapheme:
  178. return icu::BreakIterator::createCharacterInstance(locale_data->locale(), status);
  179. case SegmenterGranularity::Sentence:
  180. return icu::BreakIterator::createSentenceInstance(locale_data->locale(), status);
  181. case SegmenterGranularity::Word:
  182. return icu::BreakIterator::createWordInstance(locale_data->locale(), status);
  183. }
  184. VERIFY_NOT_REACHED();
  185. }());
  186. VERIFY(icu_success(status));
  187. return make<SegmenterImpl>(segmenter.release_nonnull(), segmenter_granularity);
  188. }
  189. }