Browse Source

LibLocale: Implement an ICU-based text segmenter

Our current segmenter implementation lives in LibUnicode, and is not
locale-aware. We will need such awareness for ECMA-402, and so LibLocale
will be the new home for text segmentation.

The tests here are ported directly from LibUnicode/TestSegmentation.cpp.
Timothy Flynn 1 year ago
parent
commit
3fe0a27fbd

+ 1 - 0
Tests/LibLocale/CMakeLists.txt

@@ -1,6 +1,7 @@
 set(TEST_SOURCES
     TestDisplayNames.cpp
     TestLocale.cpp
+    TestSegmenter.cpp
 )
 
 foreach(source IN LISTS TEST_SOURCES)

+ 128 - 0
Tests/LibLocale/TestSegmenter.cpp

@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2023-2024, Tim Flynn <trflynn89@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibTest/TestCase.h>
+
+#include <AK/Array.h>
+#include <AK/String.h>
+#include <AK/StringView.h>
+#include <AK/Vector.h>
+#include <LibLocale/Segmenter.h>
+
+// Runs a grapheme segmenter (created for the default locale) over `string` and expects the boundaries
+// it reports to match `expected_boundaries` exactly, in order.
+template<size_t N>
+static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
+{
+    auto grapheme_segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);
+    auto text = MUST(String::from_utf8(string));
+
+    // Record every boundary the segmenter reports; never stop the iteration early.
+    Vector<size_t> boundaries;
+    auto record_boundary = [&boundaries](size_t offset) {
+        boundaries.append(offset);
+        return IterationDecision::Continue;
+    };
+
+    grapheme_segmenter->for_each_boundary(move(text), move(record_boundary));
+
+    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
+}
+
+TEST_CASE(grapheme_segmentation) // Expected boundaries are UTF-8 byte offsets into the input.
+{
+    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);
+    // An empty string must not report any boundary, so the callback may never run.
+    segmenter->for_each_boundary(String {}, [&](auto i) {
+        dbgln("{}", i); // Log the unexpected boundary before aborting.
+        VERIFY_NOT_REACHED();
+        return IterationDecision::Break;
+    });
+    // Plain ASCII: every character is a cluster of its own.
+    test_grapheme_segmentation("a"sv, { 0u, 1u });
+    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
+    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });
+    // Line breaks: LF followed by CR is two clusters, while CR followed by LF stays one cluster.
+    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
+    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
+    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });
+    // Hangul: the two 3-byte code points between 'a' and 'b' are kept together as one 6-byte cluster.
+    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
+    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
+    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
+    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
+    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });
+    // Emoji: a lone emoji and multi-code-point emoji sequences are each a single cluster.
+    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
+    test_grapheme_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u });
+    test_grapheme_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u });
+}
+
+TEST_CASE(grapheme_segmentation_indic_conjunct_break) // Every escaped code point below is 3 UTF-8 bytes.
+{
+    test_grapheme_segmentation("\u0915"sv, { 0u, 3u }); // A single code point is one cluster.
+    test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
+    test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u }); // Without U+094D the two code points split.
+    // U+094D between U+0915 and U+0916 joins all three code points into one cluster.
+    test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });
+    // One U+09BC U+09CD pair before or after U+094D: still a single cluster.
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
+    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });
+    // Two such pairs, in every arrangement around U+094D.
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
+    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });
+    // Three pairs: two before and one after U+094D, then one before and two after.
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u });
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });
+    // Four pairs: two on each side of U+094D.
+    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
+}
+
+// Runs a word segmenter (created for the default locale) over `string` and expects the boundaries it
+// reports to match `expected_boundaries` exactly, in order.
+template<size_t N>
+static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
+{
+    auto word_segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);
+    auto text = MUST(String::from_utf8(string));
+
+    // Record every boundary the segmenter reports; never stop the iteration early.
+    Vector<size_t> boundaries;
+    auto record_boundary = [&boundaries](size_t offset) {
+        boundaries.append(offset);
+        return IterationDecision::Continue;
+    };
+
+    word_segmenter->for_each_boundary(move(text), move(record_boundary));
+
+    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
+}
+
+TEST_CASE(word_segmentation) // Expected boundaries are UTF-8 byte offsets into the input.
+{
+    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);
+    // An empty string must not report any boundary, so the callback may never run.
+    segmenter->for_each_boundary(String {}, [&](auto) {
+        VERIFY_NOT_REACHED();
+        return IterationDecision::Break;
+    });
+    // A run of ASCII letters is a single segment.
+    test_word_segmentation("a"sv, { 0u, 1u });
+    test_word_segmentation("ab"sv, { 0u, 2u });
+    test_word_segmentation("abc"sv, { 0u, 3u });
+    // Whitespace between words forms its own segment(s); CR LF is one segment, LF CR is two.
+    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
+    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
+    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
+    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
+    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
+    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });
+    // Emoji, including multi-code-point sequences, are kept together as one segment.
+    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
+    test_word_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u });
+    test_word_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u });
+    // Numbers and dotted names: an interior '.' does not split the segment.
+    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
+    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
+    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
+    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });
+    // An apostrophe stays inside its word; surrounding double quotes are separate segments.
+    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
+    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });
+    // A full sentence mixing curly quotes, a curly apostrophe (all multi-byte), a decimal, and punctuation.
+    test_word_segmentation(
+        "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv,
+        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
+}

+ 1 - 0
Userland/Libraries/LibLocale/CMakeLists.txt

@@ -8,6 +8,7 @@ set(SOURCES
     NumberFormat.cpp
     PluralRules.cpp
     RelativeTimeFormat.cpp
+    Segmenter.cpp
     UnicodeKeywords.cpp
 )
 

+ 1 - 0
Userland/Libraries/LibLocale/Forward.h

@@ -17,6 +17,7 @@ enum class Style;
 enum class Weekday;
 
 class NumberFormat;
+class Segmenter;
 
 struct CalendarPattern;
 struct Keyword;

+ 240 - 0
Userland/Libraries/LibLocale/Segmenter.cpp

@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#define AK_DONT_REPLACE_STD
+
+#include <AK/Utf16View.h>
+#include <AK/Utf32View.h>
+#include <LibLocale/ICU.h>
+#include <LibLocale/Locale.h>
+#include <LibLocale/Segmenter.h>
+
+#include <unicode/brkiter.h>
+#include <unicode/utext.h>
+#include <unicode/utf8.h>
+
+namespace Locale {
+
+// Parses a granularity name. The accepted spellings are exactly "grapheme", "sentence" and "word";
+// any other input is treated as a programming error and aborts.
+SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity)
+{
+    struct NamedGranularity {
+        StringView name;
+        SegmenterGranularity granularity;
+    };
+
+    static constexpr NamedGranularity known_granularities[] = {
+        { "grapheme"sv, SegmenterGranularity::Grapheme },
+        { "sentence"sv, SegmenterGranularity::Sentence },
+        { "word"sv, SegmenterGranularity::Word },
+    };
+
+    for (auto const& candidate : known_granularities) {
+        if (segmenter_granularity == candidate.name)
+            return candidate.granularity;
+    }
+
+    VERIFY_NOT_REACHED();
+}
+
+// Returns the lowercase name of a granularity ("grapheme", "sentence" or "word").
+StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity)
+{
+    // Every enumerator returns from inside the switch; falling out of it means the value is not a
+    // declared enumerator.
+    switch (segmenter_granularity) {
+    case SegmenterGranularity::Word:
+        return "word"sv;
+    case SegmenterGranularity::Sentence:
+        return "sentence"sv;
+    case SegmenterGranularity::Grapheme:
+        return "grapheme"sv;
+    }
+
+    VERIFY_NOT_REACHED();
+}
+
+// ICU-backed Segmenter. Wraps an icu::BreakIterator and owns the text being segmented, stored either
+// as UTF-8 (String) or as UTF-16 (icu::UnicodeString) depending on which setter was called last.
+// For UTF-8 text, boundaries are byte offsets into the string.
+class SegmenterImpl : public Segmenter {
+public:
+    SegmenterImpl(NonnullOwnPtr<icu::BreakIterator> segmenter, SegmenterGranularity segmenter_granularity)
+        : Segmenter(segmenter_granularity)
+        , m_segmenter(move(segmenter))
+    {
+    }
+
+    virtual ~SegmenterImpl() override = default;
+
+    // Returns a new segmenter wrapping a clone of the underlying ICU break iterator.
+    // The clone starts with an empty m_segmented_text, so text must be set on it before
+    // previous_boundary() / next_boundary() are used (align_boundary() asserts otherwise).
+    virtual NonnullOwnPtr<Segmenter> clone() const override
+    {
+        return make<SegmenterImpl>(adopt_own(*m_segmenter->clone()), m_segmenter_granularity);
+    }
+
+    // Makes `text` (UTF-8) the text that subsequent boundary queries operate on.
+    virtual void set_segmented_text(String text) override
+    {
+        UErrorCode status = U_ZERO_ERROR;
+
+        // Take ownership first so that the view handed to ICU points at storage owned by this object.
+        m_segmented_text = move(text);
+        auto view = m_segmented_text.get<String>().bytes_as_string_view();
+
+        UText utext = UTEXT_INITIALIZER;
+        utext_openUTF8(&utext, view.characters_without_null_termination(), static_cast<i64>(view.length()), &status);
+        VERIFY(icu_success(status));
+
+        m_segmenter->setText(&utext, status);
+        VERIFY(icu_success(status));
+
+        // NOTE(review): closing the stack UText here presumes the break iterator keeps its own clone
+        // of it - confirm against the ICU documentation for setText(UText*).
+        utext_close(&utext);
+    }
+
+    // Makes `text` (UTF-16) the text that subsequent boundary queries operate on. The code units are
+    // copied into an icu::UnicodeString owned by this object.
+    virtual void set_segmented_text(Utf16View const& text) override
+    {
+        m_segmented_text = icu::UnicodeString { text.data(), static_cast<i32>(text.length_in_code_units()) };
+        m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
+    }
+
+    // Returns the break iterator's current position.
+    virtual size_t current_boundary() override
+    {
+        return m_segmenter->current();
+    }
+
+    // Returns the closest boundary before `boundary`, or `boundary` itself (after alignment) when
+    // `inclusive` is Yes and it already is a boundary. Returns an empty Optional if there is none.
+    virtual Optional<size_t> previous_boundary(size_t boundary, Inclusive inclusive) override
+    {
+        auto icu_boundary = align_boundary(boundary);
+
+        if (inclusive == Inclusive::Yes) {
+            if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
+                return static_cast<size_t>(icu_boundary);
+        }
+
+        if (auto index = m_segmenter->preceding(icu_boundary); index != icu::BreakIterator::DONE)
+            return static_cast<size_t>(index);
+
+        return {};
+    }
+
+    // Returns the closest boundary after `boundary`, or `boundary` itself (after alignment) when
+    // `inclusive` is Yes and it already is a boundary. Returns an empty Optional if there is none.
+    virtual Optional<size_t> next_boundary(size_t boundary, Inclusive inclusive) override
+    {
+        auto icu_boundary = align_boundary(boundary);
+
+        if (inclusive == Inclusive::Yes) {
+            if (static_cast<bool>(m_segmenter->isBoundary(icu_boundary)))
+                return static_cast<size_t>(icu_boundary);
+        }
+
+        if (auto index = m_segmenter->following(icu_boundary); index != icu::BreakIterator::DONE)
+            return static_cast<size_t>(index);
+
+        return {};
+    }
+
+    // Sets `text` as the segmented text and reports each of its boundaries to `callback`.
+    // Empty text reports nothing and leaves the previously set text untouched.
+    virtual void for_each_boundary(String text, SegmentationCallback callback) override
+    {
+        if (text.is_empty())
+            return;
+
+        set_segmented_text(move(text));
+        for_each_boundary(move(callback));
+    }
+
+    // UTF-16 variant of the above.
+    virtual void for_each_boundary(Utf16View const& text, SegmentationCallback callback) override
+    {
+        if (text.is_empty())
+            return;
+
+        set_segmented_text(text);
+        for_each_boundary(move(callback));
+    }
+
+    // UTF-32 variant: the text is converted to UTF-8 for segmentation, and every boundary is
+    // translated to a code point index before it is passed to `callback`.
+    virtual void for_each_boundary(Utf32View const& text, SegmentationCallback callback) override
+    {
+        if (text.is_empty())
+            return;
+
+        // FIXME: We should be able to create a custom UText provider to avoid converting to UTF-8 here.
+        set_segmented_text(MUST(String::formatted("{}", text)));
+
+        auto code_points = m_segmented_text.get<String>().code_points();
+        auto current = code_points.begin();
+        size_t code_point_index = 0;
+
+        for_each_boundary([&](auto index) {
+            auto it = code_points.iterator_at_byte_offset(index);
+
+            // Walk forward from the previously reported boundary, counting code points, until the
+            // iterator reaches the byte offset of this boundary.
+            while (current != it) {
+                ++code_point_index;
+                ++current;
+            }
+
+            return callback(code_point_index);
+        });
+    }
+
+    // Returns true when the break iterator's current rule status lies in one of ICU's number,
+    // letter, kana or ideograph status ranges.
+    virtual bool is_current_boundary_word_like() const override
+    {
+        auto status = m_segmenter->getRuleStatus();
+
+        if (status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)
+            return true;
+        if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT)
+            return true;
+        if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT)
+            return true;
+        if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT)
+            return true;
+
+        return false;
+    }
+
+private:
+    // Converts a caller-supplied offset into an index that is safe to hand to ICU: offsets at or past
+    // the end of the text are clamped to the text length, and offsets that land inside a multi-unit
+    // code point are moved back to the start of that code point.
+    i32 align_boundary(size_t boundary)
+    {
+        return m_segmented_text.visit(
+            [&](String const& text) {
+                auto bytes = text.bytes();
+
+                // U8_SET_CP_START unconditionally reads the byte at the given offset, so the offset
+                // must lie strictly inside the buffer.
+                if (boundary >= bytes.size())
+                    return static_cast<i32>(bytes.size());
+
+                auto icu_boundary = static_cast<i32>(boundary);
+                U8_SET_CP_START(bytes.data(), 0, icu_boundary);
+                return icu_boundary;
+            },
+            [&](icu::UnicodeString const& text) {
+                // getChar32Start() yields 0 for an out-of-range offset, which would silently turn a
+                // query at the end of the text into a query at its start.
+                if (boundary >= static_cast<size_t>(text.length()))
+                    return text.length();
+
+                return text.getChar32Start(static_cast<i32>(boundary));
+            },
+            [](Empty) -> i32 { VERIFY_NOT_REACHED(); });
+    }
+
+    // Reports the first boundary of the current text and then each following one, until the break
+    // iterator is exhausted or `callback` returns IterationDecision::Break.
+    void for_each_boundary(SegmentationCallback callback)
+    {
+        if (callback(static_cast<size_t>(m_segmenter->first())) == IterationDecision::Break)
+            return;
+
+        while (true) {
+            auto index = m_segmenter->next();
+            if (index == icu::BreakIterator::DONE)
+                return;
+
+            if (callback(static_cast<size_t>(index)) == IterationDecision::Break)
+                return;
+        }
+    }
+
+    NonnullOwnPtr<icu::BreakIterator> m_segmenter;
+    // Owns the text the break iterator currently refers to; Empty until text has been set.
+    Variant<Empty, String, icu::UnicodeString> m_segmented_text;
+};
+
+// Convenience overload: creates a segmenter of the requested granularity for default_locale().
+NonnullOwnPtr<Segmenter> Segmenter::create(SegmenterGranularity segmenter_granularity)
+{
+    auto locale = default_locale();
+    return Segmenter::create(locale, segmenter_granularity);
+}
+
+// Creates a segmenter of the requested granularity for `locale`. Aborts if LocaleData has no entry
+// for the locale, or if ICU reports an error or returns no break iterator.
+NonnullOwnPtr<Segmenter> Segmenter::create(StringView locale, SegmenterGranularity segmenter_granularity)
+{
+    UErrorCode status = U_ZERO_ERROR;
+
+    auto locale_data = LocaleData::for_locale(locale);
+    VERIFY(locale_data.has_value());
+
+    // Picks the ICU factory that matches the requested granularity.
+    auto create_break_iterator = [&]() -> icu::BreakIterator* {
+        switch (segmenter_granularity) {
+        case SegmenterGranularity::Grapheme:
+            return icu::BreakIterator::createCharacterInstance(locale_data->locale(), status);
+        case SegmenterGranularity::Sentence:
+            return icu::BreakIterator::createSentenceInstance(locale_data->locale(), status);
+        case SegmenterGranularity::Word:
+            return icu::BreakIterator::createWordInstance(locale_data->locale(), status);
+        }
+        VERIFY_NOT_REACHED();
+    };
+
+    auto break_iterator = adopt_own_if_nonnull(create_break_iterator());
+    VERIFY(icu_success(status));
+
+    return make<SegmenterImpl>(break_iterator.release_nonnull(), segmenter_granularity);
+}
+
+}

+ 62 - 0
Userland/Libraries/LibLocale/Segmenter.h

@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Function.h>
+#include <AK/NonnullOwnPtr.h>
+#include <AK/Optional.h>
+#include <AK/StringView.h>
+
+namespace Locale {
+
+enum class SegmenterGranularity {
+    Grapheme, // Segment at grapheme cluster boundaries.
+    Sentence, // Segment at sentence boundaries.
+    Word, // Segment at word boundaries.
+};
+SegmenterGranularity segmenter_granularity_from_string(StringView); // Accepts "grapheme", "sentence", "word"; aborts otherwise.
+StringView segmenter_granularity_to_string(SegmenterGranularity); // Inverse of the function above.
+
+class Segmenter { // Abstract text segmenter; concrete instances are obtained through create().
+public:
+    static NonnullOwnPtr<Segmenter> create(SegmenterGranularity segmenter_granularity); // Uses the default locale.
+    static NonnullOwnPtr<Segmenter> create(StringView locale, SegmenterGranularity segmenter_granularity);
+    virtual ~Segmenter() = default;
+
+    SegmenterGranularity segmenter_granularity() const { return m_segmenter_granularity; }
+
+    virtual NonnullOwnPtr<Segmenter> clone() const = 0;
+    // Sets the text that current_boundary(), previous_boundary() and next_boundary() operate on.
+    virtual void set_segmented_text(String) = 0;
+    virtual void set_segmented_text(Utf16View const&) = 0;
+
+    virtual size_t current_boundary() = 0;
+    // With Inclusive::Yes, the queries below may return `index` itself when it already is a boundary.
+    enum class Inclusive {
+        No,
+        Yes,
+    };
+    virtual Optional<size_t> previous_boundary(size_t index, Inclusive = Inclusive::No) = 0; // Empty if there is none.
+    virtual Optional<size_t> next_boundary(size_t index, Inclusive = Inclusive::No) = 0; // Empty if there is none.
+    // Invoked once per boundary of the given text; returning IterationDecision::Break stops the iteration.
+    using SegmentationCallback = Function<IterationDecision(size_t)>;
+    virtual void for_each_boundary(String, SegmentationCallback) = 0;
+    virtual void for_each_boundary(Utf16View const&, SegmentationCallback) = 0;
+    virtual void for_each_boundary(Utf32View const&, SegmentationCallback) = 0;
+
+    virtual bool is_current_boundary_word_like() const = 0;
+
+protected:
+    explicit Segmenter(SegmenterGranularity segmenter_granularity)
+        : m_segmenter_granularity(segmenter_granularity)
+    {
+    }
+
+    SegmenterGranularity m_segmenter_granularity { SegmenterGranularity::Grapheme };
+};
+
+}