Ver Fonte

LibUnicode: Remove the locale-unaware text segmentation implementation

Timothy Flynn há 1 ano atrás
pai
commit
ab56b8c8dc

+ 1 - 13
Meta/CMake/unicode_data.cmake

@@ -43,15 +43,6 @@ set(EMOJI_DATA_PATH "${UCD_PATH}/${EMOJI_DATA_SOURCE}")
 set(NORM_PROPS_SOURCE "DerivedNormalizationProps.txt")
 set(NORM_PROPS_PATH "${UCD_PATH}/${NORM_PROPS_SOURCE}")
 
-set(GRAPHEME_BREAK_PROP_SOURCE "auxiliary/GraphemeBreakProperty.txt")
-set(GRAPHEME_BREAK_PROP_PATH "${UCD_PATH}/${GRAPHEME_BREAK_PROP_SOURCE}")
-
-set(WORD_BREAK_PROP_SOURCE "auxiliary/WordBreakProperty.txt")
-set(WORD_BREAK_PROP_PATH "${UCD_PATH}/${WORD_BREAK_PROP_SOURCE}")
-
-set(SENTENCE_BREAK_PROP_SOURCE "auxiliary/SentenceBreakProperty.txt")
-set(SENTENCE_BREAK_PROP_PATH "${UCD_PATH}/${SENTENCE_BREAK_PROP_SOURCE}")
-
 string(REGEX REPLACE "([0-9]+\\.[0-9]+)\\.[0-9]+" "\\1" EMOJI_VERSION "${UCD_VERSION}")
 set(EMOJI_TEST_URL "https://www.unicode.org/Public/emoji/${EMOJI_VERSION}/emoji-test.txt")
 set(EMOJI_TEST_PATH "${UCD_PATH}/emoji-test.txt")
@@ -76,9 +67,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
         extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPT_EXTENSIONS_SOURCE}" "${SCRIPT_EXTENSIONS_PATH}")
         extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${EMOJI_DATA_SOURCE}" "${EMOJI_DATA_PATH}")
         extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${NORM_PROPS_SOURCE}" "${NORM_PROPS_PATH}")
-        extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${GRAPHEME_BREAK_PROP_SOURCE}" "${GRAPHEME_BREAK_PROP_PATH}")
-        extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${WORD_BREAK_PROP_SOURCE}" "${WORD_BREAK_PROP_PATH}")
-        extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SENTENCE_BREAK_PROP_SOURCE}" "${SENTENCE_BREAK_PROP_PATH}")
 
         download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}")
     else()
@@ -103,7 +91,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
         "${UCD_VERSION_FILE}"
         "${UNICODE_DATA_HEADER}"
         "${UNICODE_DATA_IMPLEMENTATION}"
-        arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
+        arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}"
     )
     invoke_generator(
         "EmojiData"

+ 0 - 35
Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp

@@ -98,17 +98,10 @@ struct UnicodeData {
     // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
     NormalizationProps normalization_props;
 
-    PropList grapheme_break_props;
-    PropList word_break_props;
-    PropList sentence_break_props;
-
     CodePointTables<PropertyTable> general_category_tables;
     CodePointTables<PropertyTable> property_tables;
     CodePointTables<PropertyTable> script_tables;
     CodePointTables<PropertyTable> script_extension_tables;
-    CodePointTables<PropertyTable> grapheme_break_tables;
-    CodePointTables<PropertyTable> word_break_tables;
-    CodePointTables<PropertyTable> sentence_break_tables;
 
     HashTable<ByteString> bidirectional_classes;
     Vector<CodePointBidiClass> code_point_bidirectional_classes;
@@ -439,9 +432,6 @@ namespace Unicode {
     generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
     generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
     generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
-    generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys());
-    generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
-    generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
     generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
 
     generator.append(R"~~~(
@@ -560,9 +550,6 @@ static constexpr Array<@type@, @size@> @name@ { {
     TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table));
     TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table));
     TRY(append_code_point_tables("s_script_extensions"sv, unicode_data.script_extension_tables, append_property_table));
-    TRY(append_code_point_tables("s_grapheme_break_properties"sv, unicode_data.grapheme_break_tables, append_property_table));
-    TRY(append_code_point_tables("s_word_break_properties"sv, unicode_data.word_break_tables, append_property_table));
-    TRY(append_code_point_tables("s_sentence_break_properties"sv, unicode_data.sentence_break_tables, append_property_table));
 
     {
         constexpr size_t max_bidi_classes_per_row = 20;
@@ -654,10 +641,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
     TRY(append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv));
     TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases));
 
-    TRY(append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv));
-    TRY(append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv));
-    TRY(append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv));
-
     TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {}));
 
     generator.append(R"~~~(
@@ -921,18 +904,12 @@ static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
     auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list));
     auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list));
     auto script_extension_metadata = TRY(PropertyMetadata::create(unicode_data.script_extensions));
-    auto grapheme_break_metadata = TRY(PropertyMetadata::create(unicode_data.grapheme_break_props));
-    auto word_break_metadata = TRY(PropertyMetadata::create(unicode_data.word_break_props));
-    auto sentence_break_metadata = TRY(PropertyMetadata::create(unicode_data.sentence_break_props));
 
     for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) {
         TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata));
         TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata));
         TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata));
         TRY(update_property_tables(code_point, unicode_data.script_extension_tables, script_extension_metadata));
-        TRY(update_property_tables(code_point, unicode_data.grapheme_break_tables, grapheme_break_metadata));
-        TRY(update_property_tables(code_point, unicode_data.word_break_tables, word_break_metadata));
-        TRY(update_property_tables(code_point, unicode_data.sentence_break_tables, sentence_break_metadata));
     }
 
     return {};
@@ -953,9 +930,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
     StringView script_extensions_path;
     StringView emoji_data_path;
     StringView normalization_path;
-    StringView grapheme_break_path;
-    StringView word_break_path;
-    StringView sentence_break_path;
 
     Core::ArgsParser args_parser;
     args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
@@ -971,9 +945,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
     args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
     args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
     args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path");
-    args_parser.add_option(grapheme_break_path, "Path to GraphemeBreakProperty.txt file", "grapheme-break-path", 'f', "grapheme-break-path");
-    args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
-    args_parser.add_option(sentence_break_path, "Path to SentenceBreakProperty.txt file", "sentence-break-path", 'i', "sentence-break-path");
     args_parser.parse(arguments);
 
     auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
@@ -989,9 +960,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
     auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read));
     auto emoji_data_file = TRY(open_file(emoji_data_path, Core::File::OpenMode::Read));
     auto normalization_file = TRY(open_file(normalization_path, Core::File::OpenMode::Read));
-    auto grapheme_break_file = TRY(open_file(grapheme_break_path, Core::File::OpenMode::Read));
-    auto word_break_file = TRY(open_file(word_break_path, Core::File::OpenMode::Read));
-    auto sentence_break_file = TRY(open_file(sentence_break_path, Core::File::OpenMode::Read));
 
     UnicodeData unicode_data {};
     TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
@@ -1003,9 +971,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
     TRY(parse_alias_list(*prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases));
     TRY(parse_prop_list(*scripts_file, unicode_data.script_list));
     TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true));
-    TRY(parse_prop_list(*grapheme_break_file, unicode_data.grapheme_break_props));
-    TRY(parse_prop_list(*word_break_file, unicode_data.word_break_props));
-    TRY(parse_prop_list(*sentence_break_file, unicode_data.sentence_break_props));
 
     populate_general_category_unions(unicode_data.general_categories);
     TRY(parse_unicode_data(*unicode_data_file, unicode_data));

+ 0 - 1
Tests/LibUnicode/CMakeLists.txt

@@ -1,7 +1,6 @@
 set(TEST_SOURCES
     TestEmoji.cpp
     TestIDNA.cpp
-    TestSegmentation.cpp
     TestUnicodeCharacterTypes.cpp
     TestUnicodeNormalization.cpp
 )

+ 0 - 122
Tests/LibUnicode/TestSegmentation.cpp

@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
-
-#include <AK/Array.h>
-#include <AK/StringView.h>
-#include <AK/Utf8View.h>
-#include <AK/Vector.h>
-#include <LibTest/TestCase.h>
-#include <LibUnicode/Segmentation.h>
-
-template<size_t N>
-static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
-{
-    Vector<size_t> boundaries;
-    Utf8View view { string };
-
-    Unicode::for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
-        boundaries.append(boundary);
-        return IterationDecision::Continue;
-    });
-
-    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
-}
-
-TEST_CASE(grapheme_segmentation)
-{
-    Unicode::for_each_grapheme_segmentation_boundary(Utf8View {}, [&](auto) {
-        VERIFY_NOT_REACHED();
-        return IterationDecision::Break;
-    });
-
-    test_grapheme_segmentation("a"sv, { 0u, 1u });
-    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
-    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });
-
-    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
-    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
-    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });
-
-    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
-    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
-    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
-    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
-    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });
-
-    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
-    test_grapheme_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u });
-    test_grapheme_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u });
-}
-
-TEST_CASE(grapheme_segmentation_indic_conjunct_break)
-{
-    test_grapheme_segmentation("\u0915"sv, { 0u, 3u });
-    test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
-    test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u });
-
-    test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });
-
-    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
-    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });
-
-    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
-    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
-    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });
-
-    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u });
-    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });
-
-    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
-}
-
-template<size_t N>
-static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
-{
-    Vector<size_t> boundaries;
-    Utf8View view { string };
-
-    Unicode::for_each_word_segmentation_boundary(view, [&](auto boundary) {
-        boundaries.append(boundary);
-        return IterationDecision::Continue;
-    });
-
-    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
-}
-
-TEST_CASE(word_segmentation)
-{
-    Unicode::for_each_word_segmentation_boundary(Utf8View {}, [&](auto) {
-        VERIFY_NOT_REACHED();
-        return IterationDecision::Break;
-    });
-
-    test_word_segmentation("a"sv, { 0u, 1u });
-    test_word_segmentation("ab"sv, { 0u, 2u });
-    test_word_segmentation("abc"sv, { 0u, 3u });
-
-    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
-    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
-    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
-    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
-    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
-    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });
-
-    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
-    test_word_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u });
-    test_word_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u });
-
-    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
-    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
-    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
-    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });
-
-    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
-    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });
-
-    test_word_segmentation(
-        "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv,
-        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
-}

+ 0 - 1
Userland/Libraries/LibUnicode/CMakeLists.txt

@@ -6,7 +6,6 @@ set(SOURCES
     Emoji.cpp
     IDNA.cpp
     Normalize.cpp
-    Segmentation.cpp
     String.cpp
     ${UNICODE_DATA_SOURCES}
 )

+ 0 - 4
Userland/Libraries/LibUnicode/CharacterTypes.cpp

@@ -88,10 +88,6 @@ Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {
 bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
 bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
 
-bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; }
-bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
-bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
-
 Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; }
 Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class(u32) { return {}; }
 

+ 0 - 4
Userland/Libraries/LibUnicode/CharacterTypes.h

@@ -37,10 +37,6 @@ Optional<Script> script_from_string(StringView);
 bool code_point_has_script(u32 code_point, Script script);
 bool code_point_has_script_extension(u32 code_point, Script script);
 
-bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakProperty property);
-bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
-bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
-
 Optional<BidirectionalClass> bidirectional_class_from_string(StringView);
 Optional<BidirectionalClass> bidirectional_class(u32 code_point);
 

+ 0 - 3
Userland/Libraries/LibUnicode/Forward.h

@@ -13,11 +13,8 @@ namespace Unicode {
 enum class BidirectionalClass : u8;
 enum class EmojiGroup : u8;
 enum class GeneralCategory : u8;
-enum class GraphemeBreakProperty : u8;
 enum class Property : u8;
 enum class Script : u8;
-enum class SentenceBreakProperty : u8;
-enum class WordBreakProperty : u8;
 
 struct CurrencyCode;
 struct Emoji;

+ 0 - 491
Userland/Libraries/LibUnicode/Segmentation.cpp

@@ -1,491 +0,0 @@
-/*
- * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
- * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
-
-#include <AK/Utf16View.h>
-#include <AK/Utf32View.h>
-#include <AK/Utf8View.h>
-#include <LibUnicode/CharacterTypes.h>
-#include <LibUnicode/Segmentation.h>
-
-#if ENABLE_UNICODE_DATA
-#    include <LibUnicode/UnicodeData.h>
-#endif
-
-namespace Unicode {
-
-template<typename ViewType>
-static size_t code_unit_length(ViewType const& view)
-{
-    if constexpr (IsSame<ViewType, Utf8View>)
-        return view.byte_length();
-    else if constexpr (IsSame<ViewType, Utf16View>)
-        return view.length_in_code_units();
-    else if constexpr (IsSame<ViewType, Utf32View>)
-        return view.length();
-    else
-        static_assert(DependentFalse<ViewType>);
-}
-
-template<typename ViewType, typename CodeUnitIterator>
-static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
-{
-    if constexpr (IsSame<ViewType, Utf8View>)
-        return view.byte_offset_of(it);
-    else if constexpr (IsSame<ViewType, Utf16View>)
-        return view.code_unit_offset_of(it);
-    else if constexpr (IsSame<ViewType, Utf32View>)
-        return view.iterator_offset(it);
-    else
-        static_assert(DependentFalse<ViewType>);
-}
-
-template<typename ViewType>
-static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
-{
-#if ENABLE_UNICODE_DATA
-    using GBP = GraphemeBreakProperty;
-
-    // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
-    if (view.is_empty())
-        return;
-
-    auto has_any_gbp = [](u32 code_point, auto&&... properties) {
-        return (code_point_has_grapheme_break_property(code_point, properties) || ...);
-    };
-
-    auto skip_incb_extend_linker_sequence = [&](auto& it) {
-        while (true) {
-            if (it == view.end() || !code_point_has_property(*it, Property::InCB_Extend))
-                return;
-
-            auto next_it = it;
-            ++next_it;
-
-            if (next_it == view.end() || !code_point_has_property(*next_it, Property::InCB_Linker))
-                return;
-
-            it = next_it;
-            ++it;
-        }
-    };
-
-    // GB1
-    if (callback(0) == IterationDecision::Break)
-        return;
-
-    if (code_unit_length(view) > 1) {
-        auto it = view.begin();
-        auto code_point = *it;
-        u32 next_code_point = 0;
-        auto current_ri_chain = 0;
-
-        for (++it; it != view.end(); ++it, code_point = next_code_point) {
-            next_code_point = *it;
-
-            // GB9c
-            if (code_point_has_property(code_point, Property::InCB_Consonant)) {
-                auto it_copy = it;
-                skip_incb_extend_linker_sequence(it_copy);
-
-                if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) {
-                    ++it_copy;
-                    skip_incb_extend_linker_sequence(it_copy);
-
-                    if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) {
-                        next_code_point = *it_copy;
-                        it = it_copy;
-                        continue;
-                    }
-                }
-            }
-
-            // GB11
-            if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
-                auto it_copy = it;
-
-                while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend))
-                    ++it_copy;
-
-                if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) {
-                    ++it_copy;
-
-                    if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) {
-                        next_code_point = *it_copy;
-                        it = it_copy;
-                        continue;
-                    }
-                }
-            }
-
-            auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
-            auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
-
-            // GB3
-            if (code_point_is_cr && next_code_point_is_lf)
-                continue;
-            // GB4, GB5
-            if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
-                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
-                    return;
-                continue;
-            }
-
-            auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
-            auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
-
-            // GB6
-            if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
-                continue;
-            // GB7
-            if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
-                continue;
-            // GB8
-            if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
-                continue;
-
-            // GB9
-            if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
-                continue;
-            // GB9a
-            if (has_any_gbp(next_code_point, GBP::SpacingMark))
-                continue;
-            // GB9b
-            if (has_any_gbp(code_point, GBP::Prepend))
-                continue;
-
-            auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
-            current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
-
-            // GB12, GB13
-            if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
-                continue;
-
-            // GB999
-            if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
-                return;
-        }
-    }
-
-    // GB2
-    callback(code_unit_length(view));
-#endif
-}
-
-void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
-{
-    for_each_grapheme_segmentation_boundary_impl(view, move(callback));
-}
-
-void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
-{
-    for_each_grapheme_segmentation_boundary_impl(view, move(callback));
-}
-
-void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
-{
-    for_each_grapheme_segmentation_boundary_impl(view, move(callback));
-}
-
-template<typename ViewType>
-static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
-{
-#if ENABLE_UNICODE_DATA
-    using WBP = WordBreakProperty;
-
-    // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
-    if (view.is_empty())
-        return;
-
-    auto has_any_wbp = [](u32 code_point, auto&&... properties) {
-        return (code_point_has_word_break_property(code_point, properties) || ...);
-    };
-
-    // WB1
-    if (callback(0) == IterationDecision::Break)
-        return;
-
-    if (code_unit_length(view) > 1) {
-        auto it = view.begin();
-        auto code_point = *it;
-        u32 next_code_point;
-        Optional<u32> previous_code_point;
-        auto current_ri_chain = 0;
-
-        for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
-            next_code_point = *it;
-
-            auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
-            auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
-
-            // WB3
-            if (code_point_is_cr && next_code_point_is_lf)
-                continue;
-            // WB3a, WB3b
-            if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
-                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
-                    return;
-                continue;
-            }
-            // WB3c
-            if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
-                continue;
-            // WB3d
-            if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
-                continue;
-
-            // WB4
-            if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
-                continue;
-
-            auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
-            auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
-            auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
-            auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
-
-            // WB5
-            if (code_point_is_ah_letter && next_code_point_is_ah_letter)
-                continue;
-
-            Optional<u32> next_next_code_point;
-            if (it != view.end()) {
-                auto it_copy = it;
-                ++it_copy;
-                if (it_copy != view.end())
-                    next_next_code_point = *it_copy;
-            }
-            bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
-            bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
-
-            auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
-
-            // WB6
-            if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
-                continue;
-
-            auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
-            auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
-            auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
-
-            // WB7
-            if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
-                continue;
-            // WB7a
-            if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
-                continue;
-            // WB7b
-            if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
-                continue;
-            // WB7c
-            if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
-                continue;
-
-            auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
-            auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
-
-            // WB8
-            if (code_point_is_numeric && next_code_point_is_numeric)
-                continue;
-            // WB9
-            if (code_point_is_ah_letter && next_code_point_is_numeric)
-                continue;
-            // WB10
-            if (code_point_is_numeric && next_code_point_is_ah_letter)
-                continue;
-
-            auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
-
-            // WB11
-            if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
-                continue;
-
-            bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
-
-            // WB12
-            if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
-                continue;
-
-            auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
-            auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
-
-            // WB13
-            if (code_point_is_katakana && next_code_point_is_katakana)
-                continue;
-
-            auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
-
-            // WB13a
-            if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
-                continue;
-            // WB13b
-            if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
-                continue;
-
-            auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
-            current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
-
-            // WB15, WB16
-            if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
-                continue;
-
-            // WB999
-            if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
-                return;
-        }
-    }
-
-    // WB2
-    callback(code_unit_length(view));
-#endif
-}
-
-void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
-{
-    for_each_word_segmentation_boundary_impl(view, move(callback));
-}
-
-void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
-{
-    for_each_word_segmentation_boundary_impl(view, move(callback));
-}
-
-void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
-{
-    for_each_word_segmentation_boundary_impl(view, move(callback));
-}
-
-template<typename ViewType>
-static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
-{
-#if ENABLE_UNICODE_DATA
-    using SBP = SentenceBreakProperty;
-
-    // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
-    if (view.is_empty())
-        return;
-
-    auto has_any_sbp = [](u32 code_point, auto&&... properties) {
-        return (code_point_has_sentence_break_property(code_point, properties) || ...);
-    };
-
-    // SB1
-    if (callback(0) == IterationDecision::Break)
-        return;
-
-    if (code_unit_length(view) > 1) {
-        auto it = view.begin();
-        auto code_point = *it;
-        u32 next_code_point;
-        Optional<u32> previous_code_point;
-        enum class TerminatorSequenceState {
-            None,
-            Term,
-            Close,
-            Sp
-        } terminator_sequence_state { TerminatorSequenceState::None };
-        auto term_was_a_term = false;
-
-        for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
-            next_code_point = *it;
-
-            auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
-            auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
-
-            // SB3
-            if (code_point_is_cr && next_code_point_is_lf)
-                continue;
-
-            auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
-
-            // SB4
-            if (code_point_is_para_sep) {
-                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
-                    return;
-                continue;
-            }
-
-            // SB5
-            if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
-                continue;
-
-            auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
-
-            // SB6
-            if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
-                continue;
-            // SB7
-            if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
-                continue;
-
-            if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
-                terminator_sequence_state = TerminatorSequenceState::Term;
-                term_was_a_term = code_point_is_a_term;
-            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
-                terminator_sequence_state = TerminatorSequenceState::Close;
-            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
-                terminator_sequence_state = TerminatorSequenceState::Sp;
-            } else {
-                terminator_sequence_state = TerminatorSequenceState::None;
-            }
-
-            // SB8
-            if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
-                auto it_copy = it;
-                bool illegal_sequence = false;
-                for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
-                    if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
-                        continue;
-                    illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
-                }
-                if (illegal_sequence)
-                    continue;
-            }
-
-            // SB8a
-            if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
-                continue;
-
-            auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
-            auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
-
-            // SB9
-            if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
-                continue;
-
-            // SB10
-            if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
-                continue;
-
-            // SB11
-            if (terminator_sequence_state >= TerminatorSequenceState::Term)
-                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
-                    return;
-
-            // SB998
-        }
-    }
-
-    // SB2
-    callback(code_unit_length(view));
-#endif
-}
-
-void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
-{
-    for_each_sentence_segmentation_boundary_impl(view, move(callback));
-}
-
-void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
-{
-    for_each_sentence_segmentation_boundary_impl(view, move(callback));
-}
-
-void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
-{
-    for_each_sentence_segmentation_boundary_impl(view, move(callback));
-}
-
-}

+ 0 - 134
Userland/Libraries/LibUnicode/Segmentation.h

@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
- * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
-
-#pragma once
-
-#include <AK/Forward.h>
-#include <AK/Function.h>
-#include <AK/IterationDecision.h>
-#include <AK/Optional.h>
-#include <AK/Types.h>
-
-namespace Unicode {
-
-using SegmentationCallback = Function<IterationDecision(size_t)>;
-
-void for_each_grapheme_segmentation_boundary(Utf8View const&, SegmentationCallback);
-void for_each_grapheme_segmentation_boundary(Utf16View const&, SegmentationCallback);
-void for_each_grapheme_segmentation_boundary(Utf32View const&, SegmentationCallback);
-
-template<typename ViewType>
-Optional<size_t> next_grapheme_segmentation_boundary(ViewType const& view, size_t index)
-{
-    Optional<size_t> result;
-
-    for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
-        if (boundary > index) {
-            result = boundary;
-            return IterationDecision::Break;
-        }
-
-        return IterationDecision::Continue;
-    });
-
-    return result;
-}
-
-template<typename ViewType>
-Optional<size_t> previous_grapheme_segmentation_boundary(ViewType const& view, size_t index)
-{
-    Optional<size_t> result;
-
-    for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
-        if (boundary < index) {
-            result = boundary;
-            return IterationDecision::Continue;
-        }
-
-        return IterationDecision::Break;
-    });
-
-    return result;
-}
-
-void for_each_word_segmentation_boundary(Utf8View const&, SegmentationCallback);
-void for_each_word_segmentation_boundary(Utf16View const&, SegmentationCallback);
-void for_each_word_segmentation_boundary(Utf32View const&, SegmentationCallback);
-
-template<typename ViewType>
-Optional<size_t> next_word_segmentation_boundary(ViewType const& view, size_t index)
-{
-    Optional<size_t> result;
-
-    for_each_word_segmentation_boundary(view, [&](auto boundary) {
-        if (boundary > index) {
-            result = boundary;
-            return IterationDecision::Break;
-        }
-
-        return IterationDecision::Continue;
-    });
-
-    return result;
-}
-
-template<typename ViewType>
-Optional<size_t> previous_word_segmentation_boundary(ViewType const& view, size_t index)
-{
-    Optional<size_t> result;
-
-    for_each_word_segmentation_boundary(view, [&](auto boundary) {
-        if (boundary < index) {
-            result = boundary;
-            return IterationDecision::Continue;
-        }
-
-        return IterationDecision::Break;
-    });
-
-    return result;
-}
-
-void for_each_sentence_segmentation_boundary(Utf8View const&, SegmentationCallback);
-void for_each_sentence_segmentation_boundary(Utf16View const&, SegmentationCallback);
-void for_each_sentence_segmentation_boundary(Utf32View const&, SegmentationCallback);
-
-template<typename ViewType>
-Optional<size_t> next_sentence_segmentation_boundary(ViewType const& view, size_t index)
-{
-    Optional<size_t> result;
-
-    for_each_sentence_segmentation_boundary(view, [&](auto boundary) {
-        if (boundary > index) {
-            result = boundary;
-            return IterationDecision::Break;
-        }
-
-        return IterationDecision::Continue;
-    });
-
-    return result;
-}
-
-template<typename ViewType>
-Optional<size_t> previous_sentence_segmentation_boundary(ViewType const& view, size_t index)
-{
-    Optional<size_t> result;
-
-    for_each_sentence_segmentation_boundary(view, [&](auto boundary) {
-        if (boundary < index) {
-            result = boundary;
-            return IterationDecision::Continue;
-        }
-
-        return IterationDecision::Break;
-    });
-
-    return result;
-}
-
-}