3 years ago · a98d3a1a85
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -224,8 +224,6 @@ bool is_ecma262_property([[maybe_unused]] Property property)
 
															 {
														
 
															 #if ENABLE_UNICODE_DATA
														
 
															     // ECMA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
														
 
															-    // Note: Some of the properties in the above link are not yet parsed by the LibUnicode generator. They are left
														
 
															-    //       commented out here until they are parsed and can be used.
														
 
															     switch (property) {
														
 
															     case Unicode::Property::ASCII:
														
 
															     case Unicode::Property::ASCII_Hex_Digit:
														
@@ -239,7 +237,7 @@ bool is_ecma262_property([[maybe_unused]] Property property)
 
															     case Unicode::Property::Changes_When_Casefolded:
														
 
															     case Unicode::Property::Changes_When_Casemapped:
														
 
															     case Unicode::Property::Changes_When_Lowercased:
														
 
															-    // case Unicode::Property::Changes_When_NFKC_Casefolded:
														
 
															+    case Unicode::Property::Changes_When_NFKC_Casefolded:
														
 
															     case Unicode::Property::Changes_When_Titlecased:
														
 
															     case Unicode::Property::Changes_When_Uppercased:
														
 
															     case Unicode::Property::Dash:
														
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@@ -50,6 +50,22 @@ struct Alias {
 
															     String alias;
														
 
															 };
														
 
															+// Normalization source: https://www.unicode.org/Public/13.0.0/ucd/DerivedNormalizationProps.txt
														
 
															+// Normalization descriptions: https://www.unicode.org/reports/tr44/#DerivedNormalizationProps.txt
														
 
															+enum class QuickCheck {
														
 
															+    Yes,
														
 
															+    No,
														
 
															+    Maybe,
														
 
															+};
														
 
															+
														
 
															+struct Normalization {
														
 
															+    CodePointRange code_point_range;
														
 
															+    Vector<u32> value;
														
 
															+    QuickCheck quick_check { QuickCheck::Yes };
														
 
															+};
														
 
															+
														
 
															+using NormalizationProps = HashMap<String, Vector<Normalization>>;
														
 
															+
														
 
															 // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
														
 
															 // Field descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#UnicodeData.txt
														
 
															 //                     https://www.unicode.org/reports/tr44/#General_Category_Values
														
@@ -99,6 +115,9 @@ struct UnicodeData {
 
															     };
														
 
															     Vector<Alias> script_aliases;
														
 
															     PropList script_extensions;
														
 
															+
														
 
															+    // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
														
 
															+    NormalizationProps normalization_props;
														
 
															 };
														
 
															 static constexpr auto s_desired_fields = Array {
														
@@ -118,18 +137,38 @@ static void write_to_file_if_different(Core::File& file, StringView contents)
 
															     VERIFY(file.write(contents));
														
 
															 }
														
 
															-static void parse_special_casing(Core::File& file, UnicodeData& unicode_data)
														
 
															+static Vector<u32> parse_code_point_list(StringView const& list)
														
 
															 {
														
 
															-    auto parse_code_point_list = [&](auto const& line) {
														
 
															-        Vector<u32> code_points;
														
 
															+    Vector<u32> code_points;
														
 
															-        auto segments = line.split(' ');
														
 
															-        for (auto const& code_point : segments)
														
 
															-            code_points.append(AK::StringUtils::convert_to_uint_from_hex<u32>(code_point).value());
														
 
															+    auto segments = list.split_view(' ');
														
 
															+    for (auto const& code_point : segments)
														
 
															+        code_points.append(AK::StringUtils::convert_to_uint_from_hex<u32>(code_point).value());
														
 
															-        return code_points;
														
 
															-    };
														
 
															+    return code_points;
														
 
															+}
														
 
															+
														
 
															+static CodePointRange parse_code_point_range(StringView const& list)
														
 
															+{
														
 
															+    CodePointRange code_point_range {};
														
 
															+    if (list.contains(".."sv)) {
														
 
															+        auto segments = list.split_view(".."sv);
														
 
															+        VERIFY(segments.size() == 2);
														
 
															+
														
 
															+        auto begin = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
														
 
															+        auto end = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[1]).value();
														
 
															+        code_point_range = { begin, end };
														
 
															+    } else {
														
 
															+        auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(list).value();
														
 
															+        code_point_range = { code_point, code_point };
														
 
															+    }
														
 
															+
														
 
															+    return code_point_range;
														
 
															+}
														
 
															+
														
 
															+static void parse_special_casing(Core::File& file, UnicodeData& unicode_data)
														
 
															+{
														
 
															     while (file.can_read_line()) {
														
 
															         auto line = file.read_line();
														
 
															         if (line.is_empty() || line.starts_with('#'))
														
@@ -191,7 +230,7 @@ static void parse_prop_list(Core::File& file, PropList& prop_list, bool multi_va
 
															         auto segments = line.split_view(';', true);
														
 
															         VERIFY(segments.size() == 2);
														
 
															-        auto code_point_range = segments[0].trim_whitespace();
														
 
															+        auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
														
 
															         Vector<StringView> properties;
														
 
															         if (multi_value_property)
														
@@ -201,18 +240,7 @@ static void parse_prop_list(Core::File& file, PropList& prop_list, bool multi_va
 
															         for (auto const& property : properties) {
														
 
															             auto& code_points = prop_list.ensure(property.trim_whitespace());
														
 
															-
														
 
															-            if (code_point_range.contains(".."sv)) {
														
 
															-                segments = code_point_range.split_view(".."sv);
														
 
															-                VERIFY(segments.size() == 2);
														
 
															-
														
 
															-                auto begin = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
														
 
															-                auto end = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[1]).value();
														
 
															-                code_points.append({ begin, end });
														
 
															-            } else {
														
 
															-                auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(code_point_range).value();
														
 
															-                code_points.append({ code_point, code_point });
														
 
															-            }
														
 
															+            code_points.append(code_point_range);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -301,6 +329,44 @@ static void parse_value_alias_list(Core::File& file, StringView desired_category
 
															     }
														
 
															 }
														
 
															+static void parse_normalization_props(Core::File& file, UnicodeData& unicode_data)
														
 
															+{
														
 
															+    while (file.can_read_line()) {
														
 
															+        auto line = file.read_line();
														
 
															+        if (line.is_empty() || line.starts_with('#'))
														
 
															+            continue;
														
 
															+
														
 
															+        if (auto index = line.find('#'); index.has_value())
														
 
															+            line = line.substring(0, *index);
														
 
															+
														
 
															+        auto segments = line.split_view(';', true);
														
 
															+        VERIFY((segments.size() == 2) || (segments.size() == 3));
														
 
															+
														
 
															+        auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
														
 
															+        auto property = segments[1].trim_whitespace().to_string();
														
 
															+
														
 
															+        Vector<u32> value;
														
 
															+        QuickCheck quick_check = QuickCheck::Yes;
														
 
															+
														
 
															+        if (segments.size() == 3) {
														
 
															+            auto value_or_quick_check = segments[2].trim_whitespace();
														
 
															+
														
 
															+            if ((value_or_quick_check == "N"sv))
														
 
															+                quick_check = QuickCheck::No;
														
 
															+            else if ((value_or_quick_check == "M"sv))
														
 
															+                quick_check = QuickCheck::Maybe;
														
 
															+            else
														
 
															+                value = parse_code_point_list(value_or_quick_check);
														
 
															+        }
														
 
															+
														
 
															+        auto& normalizations = unicode_data.normalization_props.ensure(property);
														
 
															+        normalizations.append({ code_point_range, move(value), quick_check });
														
 
															+
														
 
															+        auto& prop_list = unicode_data.prop_list.ensure(property);
														
 
															+        prop_list.append(move(code_point_range));
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															 static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
														
 
															 {
														
 
															     Optional<u32> code_point_range_start;
														
@@ -927,6 +993,7 @@ int main(int argc, char** argv)
 
															     char const* scripts_path = nullptr;
														
 
															     char const* script_extensions_path = nullptr;
														
 
															     char const* emoji_data_path = nullptr;
														
 
															+    char const* normalization_path = nullptr;
														
 
															     Core::ArgsParser args_parser;
														
 
															     args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
														
@@ -942,6 +1009,7 @@ int main(int argc, char** argv)
 
															     args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
														
 
															     args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
														
 
															     args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
														
 
															+    args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path");
														
 
															     args_parser.parse(argc, argv);
														
 
															     auto open_file = [&](StringView path, StringView flags, Core::OpenMode mode = Core::OpenMode::ReadOnly) {
														
@@ -973,6 +1041,7 @@ int main(int argc, char** argv)
 
															     auto scripts_file = open_file(scripts_path, "-r/--scripts-path");
														
 
															     auto script_extensions_file = open_file(script_extensions_path, "-x/--script-extensions-path");
														
 
															     auto emoji_data_file = open_file(emoji_data_path, "-e/--emoji-data-path");
														
 
															+    auto normalization_file = open_file(normalization_path, "-n/--normalization-path");
														
 
															     UnicodeData unicode_data {};
														
 
															     parse_special_casing(special_casing_file, unicode_data);
														
@@ -981,6 +1050,7 @@ int main(int argc, char** argv)
 
															     parse_prop_list(derived_core_prop_file, unicode_data.prop_list);
														
 
															     parse_prop_list(derived_binary_prop_file, unicode_data.prop_list);
														
 
															     parse_prop_list(emoji_data_file, unicode_data.prop_list);
														
 
															+    parse_normalization_props(normalization_file, unicode_data);
														
 
															     parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases);
														
 
															     parse_prop_list(scripts_file, unicode_data.script_list);
														
 
															     parse_prop_list(script_extensions_file, unicode_data.script_extensions, true);
														
--- a/Userland/Libraries/LibUnicode/unicode_data.cmake
+++ b/Userland/Libraries/LibUnicode/unicode_data.cmake
@@ -33,6 +33,9 @@ set(SCRIPT_EXTENSIONS_PATH ${CMAKE_BINARY_DIR}/UCD/ScriptExtensions.txt)
 
															 set(EMOJI_DATA_URL https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt)
														
 
															 set(EMOJI_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/emoji-data.txt)
														
 
															+set(NORM_PROPS_URL https://www.unicode.org/Public/13.0.0/ucd/DerivedNormalizationProps.txt)
														
 
															+set(NORM_PROPS_PATH ${CMAKE_BINARY_DIR}/UCD/DerivedNormalizationProps.txt)
														
 
															+
														
 
															 if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
														
 
															     if (NOT EXISTS ${UNICODE_DATA_PATH})
														
 
															         message(STATUS "Downloading UCD UnicodeData.txt from ${UNICODE_DATA_URL}...")
														
@@ -78,6 +81,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
 
															         message(STATUS "Downloading UCD emoji-data.txt from ${EMOJI_DATA_URL}...")
														
 
															         file(DOWNLOAD ${EMOJI_DATA_URL} ${EMOJI_DATA_PATH} INACTIVITY_TIMEOUT 10)
														
 
															     endif()
														
 
															+    if (NOT EXISTS ${NORM_PROPS_PATH})
														
 
															+        message(STATUS "Downloading UCD DerivedNormalizationProps.txt from ${NORM_PROPS_URL}...")
														
 
															+        file(DOWNLOAD ${NORM_PROPS_URL} ${NORM_PROPS_PATH} INACTIVITY_TIMEOUT 10)
														
 
															+    endif()
														
 
															     set(UNICODE_DATA_HEADER LibUnicode/UnicodeData.h)
														
 
															     set(UNICODE_DATA_IMPLEMENTATION LibUnicode/UnicodeData.cpp)
														
@@ -89,9 +96,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
 
															     add_custom_command(
														
 
															         OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
														
 
															-        COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -g ${DERIVED_GENERAL_CATEGORY_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -b ${DERIVED_BINARY_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -e ${EMOJI_DATA_PATH}
														
 
															+        COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -g ${DERIVED_GENERAL_CATEGORY_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -b ${DERIVED_BINARY_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -e ${EMOJI_DATA_PATH} -n ${NORM_PROPS_PATH}
														
 
															         VERBATIM
														
 
															-        DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${DERIVED_GENERAL_CATEGORY_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${DERIVED_BINARY_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${EMOJI_DATA_PATH}
														
 
															+        DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${DERIVED_GENERAL_CATEGORY_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${DERIVED_BINARY_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${EMOJI_DATA_PATH} ${NORM_PROPS_PATH}
														
 
															     )
														
 
															     set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})