Procházet zdrojové kódy

LibUnicode: Replace code point properties with ICU

Timothy Flynn před 1 rokem
rodič
revize
c804bda5fd

+ 1 - 25
Meta/CMake/unicode_data.cmake

@@ -16,18 +16,6 @@ set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}")
 set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt")
 set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}")
 
-set(PROP_LIST_SOURCE "PropList.txt")
-set(PROP_LIST_PATH "${UCD_PATH}/${PROP_LIST_SOURCE}")
-
-set(DERIVED_CORE_PROP_SOURCE "DerivedCoreProperties.txt")
-set(DERIVED_CORE_PROP_PATH "${UCD_PATH}/${DERIVED_CORE_PROP_SOURCE}")
-
-set(DERIVED_BINARY_PROP_SOURCE "extracted/DerivedBinaryProperties.txt")
-set(DERIVED_BINARY_PROP_PATH "${UCD_PATH}/${DERIVED_BINARY_PROP_SOURCE}")
-
-set(PROP_ALIAS_SOURCE "PropertyAliases.txt")
-set(PROP_ALIAS_PATH "${UCD_PATH}/${PROP_ALIAS_SOURCE}")
-
 set(PROP_VALUE_ALIAS_SOURCE "PropertyValueAliases.txt")
 set(PROP_VALUE_ALIAS_PATH "${UCD_PATH}/${PROP_VALUE_ALIAS_SOURCE}")
 
@@ -37,12 +25,6 @@ set(SCRIPTS_PATH "${UCD_PATH}/${SCRIPTS_SOURCE}")
 set(SCRIPT_EXTENSIONS_SOURCE "ScriptExtensions.txt")
 set(SCRIPT_EXTENSIONS_PATH "${UCD_PATH}/${SCRIPT_EXTENSIONS_SOURCE}")
 
-set(EMOJI_DATA_SOURCE "emoji/emoji-data.txt")
-set(EMOJI_DATA_PATH "${UCD_PATH}/${EMOJI_DATA_SOURCE}")
-
-set(NORM_PROPS_SOURCE "DerivedNormalizationProps.txt")
-set(NORM_PROPS_PATH "${UCD_PATH}/${NORM_PROPS_SOURCE}")
-
 string(REGEX REPLACE "([0-9]+\\.[0-9]+)\\.[0-9]+" "\\1" EMOJI_VERSION "${UCD_VERSION}")
 set(EMOJI_TEST_URL "https://www.unicode.org/Public/emoji/${EMOJI_VERSION}/emoji-test.txt")
 set(EMOJI_TEST_PATH "${UCD_PATH}/emoji-test.txt")
@@ -58,15 +40,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
         download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}")
         extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}")
         extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}")
-        extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_LIST_SOURCE}" "${PROP_LIST_PATH}")
-        extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_CORE_PROP_SOURCE}" "${DERIVED_CORE_PROP_PATH}")
-        extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_BINARY_PROP_SOURCE}" "${DERIVED_BINARY_PROP_PATH}")
-        extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_ALIAS_SOURCE}" "${PROP_ALIAS_PATH}")
         extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_VALUE_ALIAS_SOURCE}" "${PROP_VALUE_ALIAS_PATH}")
         extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPTS_SOURCE}" "${SCRIPTS_PATH}")
         extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPT_EXTENSIONS_SOURCE}" "${SCRIPT_EXTENSIONS_PATH}")
-        extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${EMOJI_DATA_SOURCE}" "${EMOJI_DATA_PATH}")
-        extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${NORM_PROPS_SOURCE}" "${NORM_PROPS_PATH}")
 
         download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}")
     else()
@@ -91,7 +67,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
         "${UCD_VERSION_FILE}"
         "${UNICODE_DATA_HEADER}"
         "${UNICODE_DATA_IMPLEMENTATION}"
-        arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}"
+        arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}"
     )
     invoke_generator(
         "EmojiData"

+ 1 - 172
Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp

@@ -24,21 +24,6 @@
 // https://www.unicode.org/reports/tr44/#PropList.txt
 using PropList = HashMap<ByteString, Vector<Unicode::CodePointRange>>;
 
-// https://www.unicode.org/reports/tr44/#DerivedNormalizationProps.txt
-enum class QuickCheck {
-    Yes,
-    No,
-    Maybe,
-};
-
-struct Normalization {
-    Unicode::CodePointRange code_point_range;
-    Vector<u32> value;
-    QuickCheck quick_check { QuickCheck::Yes };
-};
-
-using NormalizationProps = HashMap<ByteString, Vector<Normalization>>;
-
 // https://www.unicode.org/reports/tr44/#UnicodeData.txt
 struct CodePointData {
     u32 code_point { 0 };
@@ -79,27 +64,13 @@ struct UnicodeData {
     PropList general_categories;
     Vector<Alias> general_category_aliases;
 
-    // The Unicode standard defines additional properties (Any, Assigned, ASCII) which are not in
-    // any UCD file. Assigned code point ranges are derived as this generator is executed.
-    // https://unicode.org/reports/tr18/#General_Category_Property
-    PropList prop_list {
-        { "Any"sv, { { 0, 0x10ffff } } },
-        { "Assigned"sv, {} },
-        { "ASCII"sv, { { 0, 0x7f } } },
-    };
-    Vector<Alias> prop_aliases;
-
     PropList script_list {
         { "Unknown"sv, {} },
     };
     Vector<Alias> script_aliases;
     PropList script_extensions;
 
-    // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
-    NormalizationProps normalization_props;
-
     CodePointTables<PropertyTable> general_category_tables;
-    CodePointTables<PropertyTable> property_tables;
     CodePointTables<PropertyTable> script_tables;
     CodePointTables<PropertyTable> script_extension_tables;
 
@@ -172,52 +143,6 @@ static ErrorOr<void> parse_prop_list(Core::InputBufferedFile& file, PropList& pr
     return {};
 }
 
-static ErrorOr<void> parse_alias_list(Core::InputBufferedFile& file, PropList const& prop_list, Vector<Alias>& prop_aliases)
-{
-    ByteString current_property;
-    Array<u8, 1024> buffer;
-
-    auto append_alias = [&](auto alias, auto property) {
-        // Note: The alias files contain lines such as "Hyphen = Hyphen", which we should just skip.
-        if (alias == property)
-            return;
-
-        // FIXME: We will, eventually, need to find where missing properties are located and parse them.
-        if (!prop_list.contains(property))
-            return;
-
-        prop_aliases.append({ property, alias });
-    };
-
-    while (TRY(file.can_read_line())) {
-        auto line = TRY(file.read_line(buffer));
-
-        if (line.is_empty() || line.starts_with('#')) {
-            if (line.ends_with("Properties"sv))
-                current_property = line.substring_view(2);
-            continue;
-        }
-
-        // Note: For now, we only care about Binary Property aliases for Unicode property escapes.
-        if (current_property != "Binary Properties"sv)
-            continue;
-
-        auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
-        VERIFY((segments.size() == 2) || (segments.size() == 3));
-
-        auto alias = segments[0].trim_whitespace();
-        auto property = segments[1].trim_whitespace();
-        append_alias(alias, property);
-
-        if (segments.size() == 3) {
-            alias = segments[2].trim_whitespace();
-            append_alias(alias, property);
-        }
-    }
-
-    return {};
-}
-
 static ErrorOr<void> parse_value_alias_list(Core::InputBufferedFile& file, StringView desired_category, Vector<ByteString> const& value_list, Vector<Alias>& prop_aliases, bool primary_value_is_first = true, bool sanitize_alias = false)
 {
     TRY(file.seek(0, SeekMode::SetPosition));
@@ -264,57 +189,9 @@ static ErrorOr<void> parse_value_alias_list(Core::InputBufferedFile& file, Strin
     return {};
 }
 
-static ErrorOr<void> parse_normalization_props(Core::InputBufferedFile& file, UnicodeData& unicode_data)
-{
-    Array<u8, 1024> buffer;
-
-    while (TRY(file.can_read_line())) {
-        auto line = TRY(file.read_line(buffer));
-
-        if (line.is_empty() || line.starts_with('#'))
-            continue;
-
-        if (auto index = line.find('#'); index.has_value())
-            line = line.substring_view(0, *index);
-
-        auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
-        VERIFY((segments.size() == 2) || (segments.size() == 3));
-
-        auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
-        auto property = segments[1].trim_whitespace().to_byte_string();
-
-        Vector<u32> value;
-        QuickCheck quick_check = QuickCheck::Yes;
-
-        if (segments.size() == 3) {
-            auto value_or_quick_check = segments[2].trim_whitespace();
-
-            if ((value_or_quick_check == "N"sv))
-                quick_check = QuickCheck::No;
-            else if ((value_or_quick_check == "M"sv))
-                quick_check = QuickCheck::Maybe;
-            else
-                value = parse_code_point_list(value_or_quick_check);
-        }
-
-        auto& normalizations = unicode_data.normalization_props.ensure(property);
-        normalizations.append({ code_point_range, move(value), quick_check });
-
-        auto& prop_list = unicode_data.prop_list.ensure(property);
-        prop_list.append(move(code_point_range));
-    }
-
-    return {};
-}
-
 static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeData& unicode_data)
 {
     Optional<u32> code_point_range_start;
-
-    auto& assigned_code_points = unicode_data.prop_list.find("Assigned"sv)->value;
-    Optional<u32> assigned_code_point_range_start = 0;
-    u32 previous_code_point = 0;
-
     Array<u8, 1024> buffer;
 
     while (TRY(file.can_read_line())) {
@@ -337,22 +214,15 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
         data.unicode_1_name = segments[10];
         data.iso_comment = segments[11];
 
-        if (!assigned_code_point_range_start.has_value())
-            assigned_code_point_range_start = data.code_point;
-
         if (data.name.starts_with("<"sv) && data.name.ends_with(", First>"sv)) {
-            VERIFY(!code_point_range_start.has_value() && assigned_code_point_range_start.has_value());
+            VERIFY(!code_point_range_start.has_value());
             code_point_range_start = data.code_point;
 
             data.name = data.name.substring(1, data.name.length() - 9);
-
-            assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point });
-            assigned_code_point_range_start.clear();
         } else if (data.name.starts_with("<"sv) && data.name.ends_with(", Last>"sv)) {
             VERIFY(code_point_range_start.has_value());
 
             Unicode::CodePointRange code_point_range { *code_point_range_start, data.code_point };
-            assigned_code_points.append(code_point_range);
 
             data.name = data.name.substring(1, data.name.length() - 8);
             code_point_range_start.clear();
@@ -360,18 +230,9 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
             unicode_data.code_point_bidirectional_classes.append({ code_point_range, data.bidi_class });
         } else {
             unicode_data.code_point_bidirectional_classes.append({ { data.code_point, data.code_point }, data.bidi_class });
-
-            if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) {
-                VERIFY(assigned_code_point_range_start.has_value());
-
-                assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point });
-                assigned_code_point_range_start = data.code_point;
-            }
         }
 
         unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep);
-
-        previous_code_point = data.code_point;
         unicode_data.code_point_data.append(move(data));
     }
 
@@ -430,7 +291,6 @@ namespace Unicode {
 )~~~");
 
     generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
-    generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
     generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
     generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
 
@@ -460,7 +320,6 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::InputBufferedFil
 #include <AK/StringView.h>
 #include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/UnicodeData.h>
-#include <LibUnicode/Normalize.h>
 
 namespace Unicode {
 )~~~");
@@ -547,7 +406,6 @@ static constexpr Array<@type@, @size@> @name@ { {
     };
 
     TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables, append_property_table));
-    TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table));
     TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table));
     TRY(append_code_point_tables("s_script_extensions"sv, unicode_data.script_extension_tables, append_property_table));
 
@@ -634,9 +492,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
     TRY(append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv));
     TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases));
 
-    TRY(append_prop_search("Property"sv, "property"sv, "s_properties"sv));
-    TRY(append_from_string("Property"sv, "property"sv, unicode_data.prop_list, unicode_data.prop_aliases));
-
     TRY(append_prop_search("Script"sv, "script"sv, "s_scripts"sv));
     TRY(append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv));
     TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases));
@@ -901,13 +756,11 @@ static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
     };
 
     auto general_category_metadata = TRY(PropertyMetadata::create(unicode_data.general_categories));
-    auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list));
     auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list));
     auto script_extension_metadata = TRY(PropertyMetadata::create(unicode_data.script_extensions));
 
     for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) {
         TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata));
-        TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata));
         TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata));
         TRY(update_property_tables(code_point, unicode_data.script_extension_tables, script_extension_metadata));
     }
@@ -921,54 +774,30 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
     StringView generated_implementation_path;
     StringView unicode_data_path;
     StringView derived_general_category_path;
-    StringView prop_list_path;
-    StringView derived_core_prop_path;
-    StringView derived_binary_prop_path;
-    StringView prop_alias_path;
     StringView prop_value_alias_path;
     StringView scripts_path;
     StringView script_extensions_path;
-    StringView emoji_data_path;
-    StringView normalization_path;
 
     Core::ArgsParser args_parser;
     args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
     args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
     args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
     args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path");
-    args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
-    args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
-    args_parser.add_option(derived_binary_prop_path, "Path to DerivedBinaryProperties.txt file", "derived-binary-prop-path", 'b', "derived-binary-prop-path");
-    args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
     args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
     args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
     args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
-    args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
-    args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path");
     args_parser.parse(arguments);
 
     auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
     auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
     auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read));
     auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::File::OpenMode::Read));
-    auto prop_list_file = TRY(open_file(prop_list_path, Core::File::OpenMode::Read));
-    auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::File::OpenMode::Read));
-    auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::File::OpenMode::Read));
-    auto prop_alias_file = TRY(open_file(prop_alias_path, Core::File::OpenMode::Read));
     auto prop_value_alias_file = TRY(open_file(prop_value_alias_path, Core::File::OpenMode::Read));
     auto scripts_file = TRY(open_file(scripts_path, Core::File::OpenMode::Read));
     auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read));
-    auto emoji_data_file = TRY(open_file(emoji_data_path, Core::File::OpenMode::Read));
-    auto normalization_file = TRY(open_file(normalization_path, Core::File::OpenMode::Read));
 
     UnicodeData unicode_data {};
     TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
-    TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list));
-    TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list));
-    TRY(parse_prop_list(*derived_binary_prop_file, unicode_data.prop_list));
-    TRY(parse_prop_list(*emoji_data_file, unicode_data.prop_list));
-    TRY(parse_normalization_props(*normalization_file, unicode_data));
-    TRY(parse_alias_list(*prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases));
     TRY(parse_prop_list(*scripts_file, unicode_data.script_list));
     TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true));
 

+ 1 - 5
Userland/Libraries/LibGfx/TextLayout.cpp

@@ -19,16 +19,12 @@ DrawGlyphOrEmoji prepare_draw_glyph_or_emoji(FloatPoint point, Utf8CodePointIter
     auto next_code_point = it.peek(1);
 
     ScopeGuard consume_variation_selector = [&, initial_it = it] {
-        static auto const variation_selector = Unicode::property_from_string("Variation_Selector"sv);
-        if (!variation_selector.has_value())
-            return;
-
         // If we advanced the iterator to consume an emoji sequence, don't look for another variation selector.
         if (initial_it != it)
             return;
 
         // Otherwise, discard one code point if it's a variation selector.
-        if (next_code_point.has_value() && Unicode::code_point_has_property(*next_code_point, *variation_selector))
+        if (next_code_point.has_value() && Unicode::code_point_has_variation_selector_property(*next_code_point))
             ++it;
     };
 

+ 2 - 4
Userland/Libraries/LibJS/Lexer.cpp

@@ -468,8 +468,7 @@ Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
     if (is_ascii(code_point))
         return {};
 
-    static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
-    if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category))
+    if (Unicode::code_point_has_identifier_start_property(code_point))
         return code_point;
 
     return {};
@@ -503,8 +502,7 @@ Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
     if (is_ascii(code_point))
         return {};
 
-    static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
-    if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category))
+    if (Unicode::code_point_has_identifier_continue_property(code_point))
         return code_point;
 
     return {};

+ 6 - 8
Userland/Libraries/LibRegex/RegexParser.cpp

@@ -1644,7 +1644,7 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
                 compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
             property.visit(
                 [&](Unicode::Property property) {
-                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property });
+                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
                 },
                 [&](Unicode::GeneralCategory general_category) {
                     compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
@@ -1996,7 +1996,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
 
             if (atom.is_property)
-                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
+                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property.value()) });
             else if (atom.is_general_category)
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
             else if (atom.is_script)
@@ -2332,7 +2332,7 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
             compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
             compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
         property.visit(
         property.visit(
             [&](Unicode::Property property) {
             [&](Unicode::Property property) {
-                compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property });
+                compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
             },
             },
             [&](Unicode::GeneralCategory general_category) {
             [&](Unicode::GeneralCategory general_category) {
                 compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
                 compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
@@ -2419,7 +2419,7 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
                 compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
                 compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
             property.visit(
             property.visit(
                 [&](Unicode::Property property) {
                 [&](Unicode::Property property) {
-                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property });
+                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
                 },
                 },
                 [&](Unicode::GeneralCategory general_category) {
                 [&](Unicode::GeneralCategory general_category) {
                     compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
                     compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
@@ -2476,8 +2476,6 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
 
 
 DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
 DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
 {
 {
-    static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
-    static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
     static constexpr u32 const REPLACEMENT_CHARACTER = 0xFFFD;
     static constexpr u32 const REPLACEMENT_CHARACTER = 0xFFFD;
     constexpr u32 const ZERO_WIDTH_NON_JOINER { 0x200C };
     constexpr u32 const ZERO_WIDTH_NON_JOINER { 0x200C };
     constexpr u32 const ZERO_WIDTH_JOINER { 0x200D };
     constexpr u32 const ZERO_WIDTH_JOINER { 0x200D };
@@ -2526,7 +2524,7 @@ DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starti
                 set_error(Error::InvalidNameForCaptureGroup);
                 set_error(Error::InvalidNameForCaptureGroup);
                 return {};
                 return {};
             }
             }
-        } else if (id_start_category.has_value() && !Unicode::code_point_has_property(code_point, *id_start_category)) {
+        } else if (!Unicode::code_point_has_identifier_start_property(code_point)) {
             set_error(Error::InvalidNameForCaptureGroup);
             set_error(Error::InvalidNameForCaptureGroup);
             return {};
             return {};
         }
         }
@@ -2569,7 +2567,7 @@ DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starti
                 return {};
                 return {};
             }
             }
         } else if (code_point != ZERO_WIDTH_JOINER && code_point != ZERO_WIDTH_NON_JOINER) {
         } else if (code_point != ZERO_WIDTH_JOINER && code_point != ZERO_WIDTH_NON_JOINER) {
-            if (id_continue_category.has_value() && !Unicode::code_point_has_property(code_point, *id_continue_category)) {
+            if (!Unicode::code_point_has_identifier_continue_property(code_point)) {
                 set_error(Error::InvalidNameForCaptureGroup);
                 set_error(Error::InvalidNameForCaptureGroup);
                 return {};
                 return {};
             }
             }

+ 167 - 65
Userland/Libraries/LibUnicode/CharacterTypes.cpp

@@ -4,84 +4,186 @@
  * SPDX-License-Identifier: BSD-2-Clause
  * SPDX-License-Identifier: BSD-2-Clause
  */
  */
 
 
+#include <AK/Array.h>
+#include <AK/CharacterTypes.h>
+#include <AK/Find.h>
+#include <AK/Traits.h>
 #include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/CharacterTypes.h>
 
 
-#if ENABLE_UNICODE_DATA
-#    include <LibUnicode/UnicodeData.h>
-#endif
+#include <unicode/uchar.h>
+
+namespace Unicode {
+
+template<typename PropertyType>
+struct PropertyName {
+    Optional<StringView> long_name;
+    Optional<StringView> short_name;
+    Optional<StringView> additional_name;
+};
+
+// From uchar.h:
+// Unicode allows for additional names, beyond the long and short name, which would be indicated by U_LONG_PROPERTY_NAME + i
+static constexpr auto ADDITIONAL_NAME = static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + 1);
+
+}
+
+template<typename PropertyType>
+struct AK::Traits<Unicode::PropertyName<PropertyType>> {
+    static constexpr bool equals(Unicode::PropertyName<PropertyType> const& candidate, StringView property)
+    {
+        return property == candidate.long_name || property == candidate.short_name || property == candidate.additional_name;
+    }
+};
 
 
 namespace Unicode {
 namespace Unicode {
 
 
 Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
 Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
 bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
 bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
-Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
-bool __attribute__((weak)) code_point_has_property(u32, Property) { return {}; }
 
 
-bool is_ecma262_property([[maybe_unused]] Property property)
+static constexpr Property PROPERTY_ANY = UCHAR_BINARY_LIMIT + 1;
+static constexpr Property PROPERTY_ASCII = UCHAR_BINARY_LIMIT + 2;
+static constexpr Property PROPERTY_ASSIGNED = UCHAR_BINARY_LIMIT + 3;
+static constexpr Property PROPERTY_LIMIT = UCHAR_BINARY_LIMIT + 4;
+
+Optional<Property> property_from_string(StringView property)
 {
 {
-#if ENABLE_UNICODE_DATA
-    // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
-    switch (property) {
-    case Unicode::Property::ASCII:
-    case Unicode::Property::ASCII_Hex_Digit:
-    case Unicode::Property::Alphabetic:
-    case Unicode::Property::Any:
-    case Unicode::Property::Assigned:
-    case Unicode::Property::Bidi_Control:
-    case Unicode::Property::Bidi_Mirrored:
-    case Unicode::Property::Case_Ignorable:
-    case Unicode::Property::Cased:
-    case Unicode::Property::Changes_When_Casefolded:
-    case Unicode::Property::Changes_When_Casemapped:
-    case Unicode::Property::Changes_When_Lowercased:
-    case Unicode::Property::Changes_When_NFKC_Casefolded:
-    case Unicode::Property::Changes_When_Titlecased:
-    case Unicode::Property::Changes_When_Uppercased:
-    case Unicode::Property::Dash:
-    case Unicode::Property::Default_Ignorable_Code_Point:
-    case Unicode::Property::Deprecated:
-    case Unicode::Property::Diacritic:
-    case Unicode::Property::Emoji:
-    case Unicode::Property::Emoji_Component:
-    case Unicode::Property::Emoji_Modifier:
-    case Unicode::Property::Emoji_Modifier_Base:
-    case Unicode::Property::Emoji_Presentation:
-    case Unicode::Property::Extended_Pictographic:
-    case Unicode::Property::Extender:
-    case Unicode::Property::Grapheme_Base:
-    case Unicode::Property::Grapheme_Extend:
-    case Unicode::Property::Hex_Digit:
-    case Unicode::Property::IDS_Binary_Operator:
-    case Unicode::Property::IDS_Trinary_Operator:
-    case Unicode::Property::ID_Continue:
-    case Unicode::Property::ID_Start:
-    case Unicode::Property::Ideographic:
-    case Unicode::Property::Join_Control:
-    case Unicode::Property::Logical_Order_Exception:
-    case Unicode::Property::Lowercase:
-    case Unicode::Property::Math:
-    case Unicode::Property::Noncharacter_Code_Point:
-    case Unicode::Property::Pattern_Syntax:
-    case Unicode::Property::Pattern_White_Space:
-    case Unicode::Property::Quotation_Mark:
-    case Unicode::Property::Radical:
-    case Unicode::Property::Regional_Indicator:
-    case Unicode::Property::Sentence_Terminal:
-    case Unicode::Property::Soft_Dotted:
-    case Unicode::Property::Terminal_Punctuation:
-    case Unicode::Property::Unified_Ideograph:
-    case Unicode::Property::Uppercase:
-    case Unicode::Property::Variation_Selector:
-    case Unicode::Property::White_Space:
-    case Unicode::Property::XID_Continue:
-    case Unicode::Property::XID_Start:
+    static auto property_names = []() {
+        Array<PropertyName<Property>, PROPERTY_LIMIT.value()> names;
+
+        for (Property property = 0; property < UCHAR_BINARY_LIMIT; ++property) {
+            auto icu_property = static_cast<UProperty>(property.value());
+
+            if (char const* name = u_getPropertyName(icu_property, U_LONG_PROPERTY_NAME))
+                names[property.value()].long_name = StringView { name, strlen(name) };
+            if (char const* name = u_getPropertyName(icu_property, U_SHORT_PROPERTY_NAME))
+                names[property.value()].short_name = StringView { name, strlen(name) };
+            if (char const* name = u_getPropertyName(icu_property, ADDITIONAL_NAME))
+                names[property.value()].additional_name = StringView { name, strlen(name) };
+        }
+
+        names[PROPERTY_ANY.value()] = { "Any"sv, {}, {} };
+        names[PROPERTY_ASCII.value()] = { "ASCII"sv, {}, {} };
+        names[PROPERTY_ASSIGNED.value()] = { "Assigned"sv, {}, {} };
+
+        return names;
+    }();
+
+    if (auto index = find_index(property_names.begin(), property_names.end(), property); index != property_names.size())
+        return static_cast<Property>(index);
+    return {};
+}
+
+bool code_point_has_property(u32 code_point, Property property)
+{
+    auto icu_code_point = static_cast<UChar32>(code_point);
+    auto icu_property = static_cast<UProperty>(property.value());
+
+    if (property == PROPERTY_ANY)
+        return is_unicode(code_point);
+    if (property == PROPERTY_ASCII)
+        return is_ascii(code_point);
+    if (property == PROPERTY_ASSIGNED)
+        return u_isdefined(icu_code_point);
+
+    return static_cast<bool>(u_hasBinaryProperty(icu_code_point, icu_property));
+}
+
+bool code_point_has_emoji_property(u32 code_point)
+{
+    return code_point_has_property(code_point, UCHAR_EMOJI);
+}
+
+bool code_point_has_emoji_modifier_base_property(u32 code_point)
+{
+    return code_point_has_property(code_point, UCHAR_EMOJI_MODIFIER_BASE);
+}
+
+bool code_point_has_emoji_presentation_property(u32 code_point)
+{
+    return code_point_has_property(code_point, UCHAR_EMOJI_PRESENTATION);
+}
+
+bool code_point_has_identifier_start_property(u32 code_point)
+{
+    return u_isIDStart(static_cast<UChar32>(code_point));
+}
+
+bool code_point_has_identifier_continue_property(u32 code_point)
+{
+    return u_isIDPart(static_cast<UChar32>(code_point));
+}
+
+bool code_point_has_regional_indicator_property(u32 code_point)
+{
+    return code_point_has_property(code_point, UCHAR_REGIONAL_INDICATOR);
+}
+
+bool code_point_has_variation_selector_property(u32 code_point)
+{
+    return code_point_has_property(code_point, UCHAR_VARIATION_SELECTOR);
+}
+
+// https://tc39.es/ecma262/#table-binary-unicode-properties
+bool is_ecma262_property(Property property)
+{
+    if (property == PROPERTY_ANY || property == PROPERTY_ASCII || property == PROPERTY_ASSIGNED)
+        return true;
+
+    switch (property.value()) {
+    case UCHAR_ASCII_HEX_DIGIT:
+    case UCHAR_ALPHABETIC:
+    case UCHAR_BIDI_CONTROL:
+    case UCHAR_BIDI_MIRRORED:
+    case UCHAR_CASE_IGNORABLE:
+    case UCHAR_CASED:
+    case UCHAR_CHANGES_WHEN_CASEFOLDED:
+    case UCHAR_CHANGES_WHEN_CASEMAPPED:
+    case UCHAR_CHANGES_WHEN_LOWERCASED:
+    case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
+    case UCHAR_CHANGES_WHEN_TITLECASED:
+    case UCHAR_CHANGES_WHEN_UPPERCASED:
+    case UCHAR_DASH:
+    case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
+    case UCHAR_DEPRECATED:
+    case UCHAR_DIACRITIC:
+    case UCHAR_EMOJI:
+    case UCHAR_EMOJI_COMPONENT:
+    case UCHAR_EMOJI_MODIFIER:
+    case UCHAR_EMOJI_MODIFIER_BASE:
+    case UCHAR_EMOJI_PRESENTATION:
+    case UCHAR_EXTENDED_PICTOGRAPHIC:
+    case UCHAR_EXTENDER:
+    case UCHAR_GRAPHEME_BASE:
+    case UCHAR_GRAPHEME_EXTEND:
+    case UCHAR_HEX_DIGIT:
+    case UCHAR_IDS_BINARY_OPERATOR:
+    case UCHAR_IDS_TRINARY_OPERATOR:
+    case UCHAR_ID_CONTINUE:
+    case UCHAR_ID_START:
+    case UCHAR_IDEOGRAPHIC:
+    case UCHAR_JOIN_CONTROL:
+    case UCHAR_LOGICAL_ORDER_EXCEPTION:
+    case UCHAR_LOWERCASE:
+    case UCHAR_MATH:
+    case UCHAR_NONCHARACTER_CODE_POINT:
+    case UCHAR_PATTERN_SYNTAX:
+    case UCHAR_PATTERN_WHITE_SPACE:
+    case UCHAR_QUOTATION_MARK:
+    case UCHAR_RADICAL:
+    case UCHAR_REGIONAL_INDICATOR:
+    case UCHAR_S_TERM:
+    case UCHAR_SOFT_DOTTED:
+    case UCHAR_TERMINAL_PUNCTUATION:
+    case UCHAR_UNIFIED_IDEOGRAPH:
+    case UCHAR_UPPERCASE:
+    case UCHAR_VARIATION_SELECTOR:
+    case UCHAR_WHITE_SPACE:
+    case UCHAR_XID_CONTINUE:
+    case UCHAR_XID_START:
         return true;
         return true;
     default:
     default:
         return false;
         return false;
     }
     }
-#else
-    return false;
-#endif
 }
 }
 
 
 Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {}; }
 Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {}; }

+ 9 - 0
Userland/Libraries/LibUnicode/CharacterTypes.h

@@ -31,6 +31,15 @@ bool code_point_has_general_category(u32 code_point, GeneralCategory general_cat
 
 
 Optional<Property> property_from_string(StringView);
 Optional<Property> property_from_string(StringView);
 bool code_point_has_property(u32 code_point, Property property);
 bool code_point_has_property(u32 code_point, Property property);
+
+bool code_point_has_emoji_property(u32 code_point);
+bool code_point_has_emoji_modifier_base_property(u32 code_point);
+bool code_point_has_emoji_presentation_property(u32 code_point);
+bool code_point_has_identifier_start_property(u32 code_point);
+bool code_point_has_identifier_continue_property(u32 code_point);
+bool code_point_has_regional_indicator_property(u32 code_point);
+bool code_point_has_variation_selector_property(u32 code_point);
+
 bool is_ecma262_property(Property);
 bool is_ecma262_property(Property);
 
 
 Optional<Script> script_from_string(StringView);
 Optional<Script> script_from_string(StringView);

+ 6 - 18
Userland/Libraries/LibUnicode/Emoji.cpp

@@ -10,16 +10,10 @@
 #include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/Emoji.h>
 #include <LibUnicode/Emoji.h>
 
 
-#if ENABLE_UNICODE_DATA
-#    include <LibUnicode/UnicodeData.h>
-#endif
-
 namespace Unicode {
 namespace Unicode {
 
 
 Optional<Emoji> __attribute__((weak)) find_emoji_for_code_points(ReadonlySpan<u32>) { return {}; }
 Optional<Emoji> __attribute__((weak)) find_emoji_for_code_points(ReadonlySpan<u32>) { return {}; }
 
 
-#if ENABLE_UNICODE_DATA
-
 // https://unicode.org/reports/tr51/#def_emoji_core_sequence
 // https://unicode.org/reports/tr51/#def_emoji_core_sequence
 static bool could_be_start_of_emoji_core_sequence(u32 code_point, Optional<u32> const& next_code_point, SequenceType type)
 static bool could_be_start_of_emoji_core_sequence(u32 code_point, Optional<u32> const& next_code_point, SequenceType type)
 {
 {
@@ -41,13 +35,13 @@ static bool could_be_start_of_emoji_core_sequence(u32 code_point, Optional<u32>
     // https://unicode.org/reports/tr51/#def_emoji_character
     // https://unicode.org/reports/tr51/#def_emoji_character
     switch (type) {
     switch (type) {
     case SequenceType::Any:
     case SequenceType::Any:
-        if (code_point_has_property(code_point, Property::Emoji))
+        if (code_point_has_emoji_property(code_point))
             return true;
             return true;
         break;
         break;
     case SequenceType::EmojiPresentation:
     case SequenceType::EmojiPresentation:
-        if (code_point_has_property(code_point, Property::Emoji_Presentation))
+        if (code_point_has_emoji_presentation_property(code_point))
             return true;
             return true;
-        if (next_code_point == zero_width_joiner && code_point_has_property(code_point, Property::Emoji))
+        if (next_code_point == zero_width_joiner && code_point_has_emoji_property(code_point))
             return true;
             return true;
         break;
         break;
     }
     }
@@ -59,12 +53,12 @@ static bool could_be_start_of_emoji_core_sequence(u32 code_point, Optional<u32>
 
 
     // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
     // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
     // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
     // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
-    if (code_point_has_property(code_point, Property::Emoji_Modifier_Base))
+    if (code_point_has_emoji_modifier_base_property(code_point))
         return true;
         return true;
 
 
     // https://unicode.org/reports/tr51/#def_emoji_flag_sequence
     // https://unicode.org/reports/tr51/#def_emoji_flag_sequence
     // emoji_flag_sequence := regional_indicator regional_indicator
     // emoji_flag_sequence := regional_indicator regional_indicator
-    if (code_point_has_property(code_point, Property::Regional_Indicator))
+    if (code_point_has_regional_indicator_property(code_point))
         return true;
         return true;
 
 
     return false;
     return false;
@@ -78,18 +72,15 @@ static bool could_be_start_of_serenity_emoji(u32 code_point)
     return code_point >= first_custom_serenity_emoji_code_point;
     return code_point >= first_custom_serenity_emoji_code_point;
 }
 }
 
 
-#endif
-
 // https://unicode.org/reports/tr51/#def_emoji_sequence
 // https://unicode.org/reports/tr51/#def_emoji_sequence
 template<typename CodePointIterator>
 template<typename CodePointIterator>
-static bool could_be_start_of_emoji_sequence_impl(CodePointIterator const& it, [[maybe_unused]] SequenceType type)
+static bool could_be_start_of_emoji_sequence_impl(CodePointIterator const& it, SequenceType type)
 {
 {
     // emoji_sequence := emoji_core_sequence | emoji_zwj_sequence | emoji_tag_sequence
     // emoji_sequence := emoji_core_sequence | emoji_zwj_sequence | emoji_tag_sequence
 
 
     if (it.done())
     if (it.done())
         return false;
         return false;
 
 
-#if ENABLE_UNICODE_DATA
     // The purpose of this method is to quickly filter out code points that cannot be the start of
     // The purpose of this method is to quickly filter out code points that cannot be the start of
     // an emoji. The emoji_core_sequence definition alone captures the start of all possible
     // an emoji. The emoji_core_sequence definition alone captures the start of all possible
     // emoji_zwj_sequence and emoji_tag_sequence emojis, because:
     // emoji_zwj_sequence and emoji_tag_sequence emojis, because:
@@ -108,9 +99,6 @@ static bool could_be_start_of_emoji_sequence_impl(CodePointIterator const& it, [
     if (could_be_start_of_serenity_emoji(code_point))
     if (could_be_start_of_serenity_emoji(code_point))
         return true;
         return true;
     return false;
     return false;
-#else
-    return true;
-#endif
 }
 }
 
 
 bool could_be_start_of_emoji_sequence(Utf8CodePointIterator const& it, SequenceType type)
 bool could_be_start_of_emoji_sequence(Utf8CodePointIterator const& it, SequenceType type)

+ 3 - 1
Userland/Libraries/LibUnicode/Forward.h

@@ -6,6 +6,7 @@
 
 
 #pragma once
 #pragma once
 
 
+#include <AK/DistinctNumeric.h>
 #include <AK/Types.h>
 #include <AK/Types.h>
 
 
 namespace Unicode {
 namespace Unicode {
@@ -13,10 +14,11 @@ namespace Unicode {
 enum class BidirectionalClass : u8;
 enum class BidirectionalClass : u8;
 enum class EmojiGroup : u8;
 enum class EmojiGroup : u8;
 enum class GeneralCategory : u8;
 enum class GeneralCategory : u8;
-enum class Property : u8;
 enum class Script : u8;
 enum class Script : u8;
 
 
 struct CurrencyCode;
 struct CurrencyCode;
 struct Emoji;
 struct Emoji;
 
 
+AK_TYPEDEF_DISTINCT_NUMERIC_GENERAL(u32, Property, CastToUnderlying, Comparison, Increment);
+
 }
 }