diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index b0c96be9b8d..9e3c7129ba5 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -43,15 +43,6 @@ set(EMOJI_DATA_PATH "${UCD_PATH}/${EMOJI_DATA_SOURCE}") set(NORM_PROPS_SOURCE "DerivedNormalizationProps.txt") set(NORM_PROPS_PATH "${UCD_PATH}/${NORM_PROPS_SOURCE}") -set(GRAPHEME_BREAK_PROP_SOURCE "auxiliary/GraphemeBreakProperty.txt") -set(GRAPHEME_BREAK_PROP_PATH "${UCD_PATH}/${GRAPHEME_BREAK_PROP_SOURCE}") - -set(WORD_BREAK_PROP_SOURCE "auxiliary/WordBreakProperty.txt") -set(WORD_BREAK_PROP_PATH "${UCD_PATH}/${WORD_BREAK_PROP_SOURCE}") - -set(SENTENCE_BREAK_PROP_SOURCE "auxiliary/SentenceBreakProperty.txt") -set(SENTENCE_BREAK_PROP_PATH "${UCD_PATH}/${SENTENCE_BREAK_PROP_SOURCE}") - string(REGEX REPLACE "([0-9]+\\.[0-9]+)\\.[0-9]+" "\\1" EMOJI_VERSION "${UCD_VERSION}") set(EMOJI_TEST_URL "https://www.unicode.org/Public/emoji/${EMOJI_VERSION}/emoji-test.txt") set(EMOJI_TEST_PATH "${UCD_PATH}/emoji-test.txt") @@ -76,9 +67,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPT_EXTENSIONS_SOURCE}" "${SCRIPT_EXTENSIONS_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${EMOJI_DATA_SOURCE}" "${EMOJI_DATA_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${NORM_PROPS_SOURCE}" "${NORM_PROPS_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${GRAPHEME_BREAK_PROP_SOURCE}" "${GRAPHEME_BREAK_PROP_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${WORD_BREAK_PROP_SOURCE}" "${WORD_BREAK_PROP_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SENTENCE_BREAK_PROP_SOURCE}" "${SENTENCE_BREAK_PROP_PATH}") download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}") else() @@ -103,7 +91,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) "${UCD_VERSION_FILE}" "${UNICODE_DATA_HEADER}" "${UNICODE_DATA_IMPLEMENTATION}" - arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}" + arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" ) invoke_generator( "EmojiData" diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index b1d5d27e4d1..591e8e3f577 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -98,17 +98,10 @@ struct UnicodeData { // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize. NormalizationProps normalization_props; - PropList grapheme_break_props; - PropList word_break_props; - PropList sentence_break_props; - CodePointTables general_category_tables; CodePointTables property_tables; CodePointTables script_tables; CodePointTables script_extension_tables; - CodePointTables grapheme_break_tables; - CodePointTables word_break_tables; - CodePointTables sentence_break_tables; HashTable bidirectional_classes; Vector code_point_bidirectional_classes; @@ -439,9 +432,6 @@ namespace Unicode { generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases); generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases); generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases); - generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys()); - generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys()); - generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys()); generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values()); generator.append(R"~~~( @@ -560,9 +550,6 @@ static constexpr Array<@type@, @size@> @name@ { { TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table)); TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table)); TRY(append_code_point_tables("s_script_extensions"sv, unicode_data.script_extension_tables, append_property_table)); - TRY(append_code_point_tables("s_grapheme_break_properties"sv, unicode_data.grapheme_break_tables, append_property_table)); - TRY(append_code_point_tables("s_word_break_properties"sv, unicode_data.word_break_tables, append_property_table)); - TRY(append_code_point_tables("s_sentence_break_properties"sv, unicode_data.sentence_break_tables, append_property_table)); { constexpr size_t max_bidi_classes_per_row = 20; @@ -654,10 +641,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) TRY(append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv)); TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases)); - TRY(append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv)); - TRY(append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv)); - TRY(append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv)); - TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {})); generator.append(R"~~~( @@ -921,18 +904,12 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list)); auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list)); auto script_extension_metadata = TRY(PropertyMetadata::create(unicode_data.script_extensions)); - auto grapheme_break_metadata = TRY(PropertyMetadata::create(unicode_data.grapheme_break_props)); - auto word_break_metadata = TRY(PropertyMetadata::create(unicode_data.word_break_props)); - auto sentence_break_metadata = TRY(PropertyMetadata::create(unicode_data.sentence_break_props)); for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) { TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata)); TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata)); TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata)); TRY(update_property_tables(code_point, unicode_data.script_extension_tables, script_extension_metadata)); - TRY(update_property_tables(code_point, unicode_data.grapheme_break_tables, grapheme_break_metadata)); - TRY(update_property_tables(code_point, unicode_data.word_break_tables, word_break_metadata)); - TRY(update_property_tables(code_point, unicode_data.sentence_break_tables, sentence_break_metadata)); } return {}; @@ -953,9 +930,6 @@ ErrorOr serenity_main(Main::Arguments arguments) StringView script_extensions_path; StringView emoji_data_path; StringView normalization_path; - StringView grapheme_break_path; - StringView word_break_path; - StringView sentence_break_path; Core::ArgsParser args_parser; args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); @@ -971,9 +945,6 @@ ErrorOr serenity_main(Main::Arguments arguments) args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path"); args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path"); args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path"); - args_parser.add_option(grapheme_break_path, "Path to GraphemeBreakProperty.txt file", "grapheme-break-path", 'f', "grapheme-break-path"); - args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path"); - args_parser.add_option(sentence_break_path, "Path to SentenceBreakProperty.txt file", "sentence-break-path", 'i', "sentence-break-path"); args_parser.parse(arguments); auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write)); @@ -989,9 +960,6 @@ ErrorOr serenity_main(Main::Arguments arguments) auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read)); auto emoji_data_file = TRY(open_file(emoji_data_path, Core::File::OpenMode::Read)); auto normalization_file = TRY(open_file(normalization_path, Core::File::OpenMode::Read)); - auto grapheme_break_file = TRY(open_file(grapheme_break_path, Core::File::OpenMode::Read)); - auto word_break_file = TRY(open_file(word_break_path, Core::File::OpenMode::Read)); - auto sentence_break_file = TRY(open_file(sentence_break_path, Core::File::OpenMode::Read)); UnicodeData unicode_data {}; TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories)); @@ -1003,9 +971,6 @@ ErrorOr serenity_main(Main::Arguments arguments) TRY(parse_alias_list(*prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases)); TRY(parse_prop_list(*scripts_file, unicode_data.script_list)); TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true)); - TRY(parse_prop_list(*grapheme_break_file, unicode_data.grapheme_break_props)); - TRY(parse_prop_list(*word_break_file, unicode_data.word_break_props)); - TRY(parse_prop_list(*sentence_break_file, unicode_data.sentence_break_props)); populate_general_category_unions(unicode_data.general_categories); TRY(parse_unicode_data(*unicode_data_file, unicode_data)); diff --git a/Tests/LibUnicode/CMakeLists.txt b/Tests/LibUnicode/CMakeLists.txt index 51cef8c6184..df330dcb480 100644 --- a/Tests/LibUnicode/CMakeLists.txt +++ b/Tests/LibUnicode/CMakeLists.txt @@ -1,7 +1,6 @@ set(TEST_SOURCES TestEmoji.cpp TestIDNA.cpp - TestSegmentation.cpp TestUnicodeCharacterTypes.cpp TestUnicodeNormalization.cpp ) diff --git a/Tests/LibUnicode/TestSegmentation.cpp b/Tests/LibUnicode/TestSegmentation.cpp deleted file mode 100644 index 675ce060fe7..00000000000 --- a/Tests/LibUnicode/TestSegmentation.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2023, Tim Flynn - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include -#include -#include -#include -#include - -template -static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N]) -{ - Vector boundaries; - Utf8View view { string }; - - Unicode::for_each_grapheme_segmentation_boundary(view, [&](auto boundary) { - boundaries.append(boundary); - return IterationDecision::Continue; - }); - - EXPECT_EQ(boundaries, ReadonlySpan { expected_boundaries }); -} - -TEST_CASE(grapheme_segmentation) -{ - Unicode::for_each_grapheme_segmentation_boundary(Utf8View {}, [&](auto) { - VERIFY_NOT_REACHED(); - return IterationDecision::Break; - }); - - test_grapheme_segmentation("a"sv, { 0u, 1u }); - test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u }); - test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u }); - - test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u }); - test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u }); - test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u }); - - test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u }); - test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u }); - test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u }); - test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u }); - test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u }); - - test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u }); - test_grapheme_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u }); - test_grapheme_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u }); -} - -TEST_CASE(grapheme_segmentation_indic_conjunct_break) -{ - test_grapheme_segmentation("\u0915"sv, { 0u, 3u }); - test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u }); - test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u }); - - test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u }); - - test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u }); - test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u }); - - test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u }); - test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u }); - test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u }); - - test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u }); - test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u }); - - test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u }); -} - -template -static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N]) -{ - Vector boundaries; - Utf8View view { string }; - - Unicode::for_each_word_segmentation_boundary(view, [&](auto boundary) { - boundaries.append(boundary); - return IterationDecision::Continue; - }); - - EXPECT_EQ(boundaries, ReadonlySpan { expected_boundaries }); -} - -TEST_CASE(word_segmentation) -{ - Unicode::for_each_word_segmentation_boundary(Utf8View {}, [&](auto) { - VERIFY_NOT_REACHED(); - return IterationDecision::Break; - }); - - test_word_segmentation("a"sv, { 0u, 1u }); - test_word_segmentation("ab"sv, { 0u, 2u }); - test_word_segmentation("abc"sv, { 0u, 3u }); - - test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u }); - test_word_segmentation("ab cd"sv, { 0u, 2u, 4u, 6u }); - test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u }); - test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u }); - test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u }); - test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u }); - - test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u }); - test_word_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u }); - test_word_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u }); - - test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u }); - test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u }); - test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u }); - test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u }); - - test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u }); - test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u }); - - test_word_segmentation( - "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv, - { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u }); -} diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt index 3daa825f406..a8e317101ec 100644 --- a/Userland/Libraries/LibUnicode/CMakeLists.txt +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -6,7 +6,6 @@ set(SOURCES Emoji.cpp IDNA.cpp Normalize.cpp - Segmentation.cpp String.cpp ${UNICODE_DATA_SOURCES} ) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 55545ca329b..612eaacd269 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -88,10 +88,6 @@ Optional