Selaa lähdekoodia

LibUnicode: Parse UCD Scripts.txt and generate as a Unicode property

There are a couple of minor nuances with parsing script values, compared
to other properties. In Scripts.txt, the UCD file lists the full name of
each script; other properties, like General Category, list the shorter
name in their primary files. This means that the aliases listed in
PropertyValueAliases.txt are reversed for script values.
Timothy Flynn 4 vuotta sitten
vanhempi
commit
f5c1bbc00b

+ 22 - 0
Userland/Libraries/LibUnicode/CharacterTypes.cpp

@@ -318,4 +318,26 @@ bool is_ecma262_property([[maybe_unused]] Property property)
 #endif
 }
 
+Optional<Script> script_from_string([[maybe_unused]] StringView const& script)
+{
+#if ENABLE_UNICODE_DATA
+    return Detail::script_from_string(script);
+#else
+    return {};
+#endif
+}
+
+bool code_point_has_script([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
+{
+#if ENABLE_UNICODE_DATA
+    auto unicode_data = Detail::unicode_data_for_code_point(code_point);
+    if (!unicode_data.has_value())
+        return false;
+
+    return unicode_data->script == script;
+#else
+    return false;
+#endif
+}
+
 }

+ 3 - 0
Userland/Libraries/LibUnicode/CharacterTypes.h

@@ -28,4 +28,7 @@ Optional<Property> property_from_string(StringView const&);
 bool code_point_has_property(u32 code_point, Property property);
 bool is_ecma262_property(Property);
 
+Optional<Script> script_from_string(StringView const&);
+bool code_point_has_script(u32 code_point, Script script);
+
 }

+ 77 - 27
Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp

@@ -71,6 +71,7 @@ struct CodePointData {
     Optional<u32> simple_titlecase_mapping;
     Vector<u32> special_casing_indices;
     Vector<StringView> prop_list;
+    StringView script;
     StringView word_break_property;
 };
 
@@ -112,6 +113,11 @@ struct UnicodeData {
     };
     Vector<Alias> prop_aliases;
 
+    PropList script_list {
+        { "Unknown"sv, {} },
+    };
+    Vector<Alias> script_aliases;
+
     PropList word_break_prop_list;
 };
 
@@ -267,9 +273,15 @@ static void parse_alias_list(Core::File& file, PropList const& prop_list, Vector
     }
 }
 
-static void parse_value_alias_list(Core::File& file, StringView desired_category, Vector<String> const& value_list, Vector<Alias>& prop_unions, Vector<Alias>& prop_aliases)
+static void parse_value_alias_list(Core::File& file, StringView desired_category, Vector<String> const& value_list, Vector<Alias> const& prop_unions, Vector<Alias>& prop_aliases, bool primary_value_is_first = true)
 {
+    VERIFY(file.seek(0));
+
     auto append_alias = [&](auto alias, auto value) {
+        // Note: The value alias file contains lines such as "Ahom = Ahom", which we should just skip.
+        if (alias == value)
+            return;
+
         // FIXME: We will, eventually, need to find where missing properties are located and parse them.
         if (!value_list.contains_slow(value) && !any_of(prop_unions, [&](auto const& u) { return value == u.alias; }))
             return;
@@ -292,8 +304,8 @@ static void parse_value_alias_list(Core::File& file, StringView desired_category
             continue;
 
         VERIFY((segments.size() == 3) || (segments.size() == 4));
-        auto value = segments[1].trim_whitespace();
-        auto alias = segments[2].trim_whitespace();
+        auto value = primary_value_is_first ? segments[1].trim_whitespace() : segments[2].trim_whitespace();
+        auto alias = primary_value_is_first ? segments[2].trim_whitespace() : segments[1].trim_whitespace();
         append_alias(alias, value);
 
         if (segments.size() == 4) {
@@ -307,6 +319,34 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
 {
     Optional<u32> code_point_range_start;
 
+    auto assign_code_point_property = [&](u32 code_point, auto const& list, auto& property, StringView default_) {
+        using PropertyType = RemoveCVReference<decltype(property)>;
+        constexpr bool is_single_item = IsSame<PropertyType, StringView>;
+
+        auto assign_property = [&](auto const& item) {
+            if constexpr (is_single_item)
+                property = item;
+            else
+                property.append(item);
+        };
+
+        for (auto const& item : list) {
+            for (auto const& range : item.value) {
+                if ((range.first <= code_point) && (code_point <= range.last)) {
+                    assign_property(item.key);
+                    break;
+                }
+            }
+            if constexpr (is_single_item) {
+                if (!property.is_empty())
+                    break;
+            }
+        }
+
+        if (property.is_empty())
+            assign_property(default_);
+    };
+
     while (file.can_read_line()) {
         auto line = file.read_line();
         if (line.is_empty())
@@ -351,29 +391,9 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
                 data.special_casing_indices.append(casing.index);
         }
 
-        for (auto const& property : unicode_data.prop_list) {
-            for (auto const& range : property.value) {
-                if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
-                    data.prop_list.append(property.key);
-                    break;
-                }
-            }
-        }
-        if (data.prop_list.is_empty())
-            data.prop_list.append("Assigned"sv);
-
-        for (auto const& property : unicode_data.word_break_prop_list) {
-            for (auto const& range : property.value) {
-                if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
-                    data.word_break_property = property.key;
-                    break;
-                }
-            }
-            if (!data.word_break_property.is_empty())
-                break;
-        }
-        if (data.word_break_property.is_empty())
-            data.word_break_property = "Other"sv;
+        assign_code_point_property(data.code_point, unicode_data.prop_list, data.prop_list, "Assigned"sv);
+        assign_code_point_property(data.code_point, unicode_data.script_list, data.script, "Unknown"sv);
+        assign_code_point_property(data.code_point, unicode_data.word_break_prop_list, data.word_break_property, "Other"sv);
 
         unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
 
@@ -392,7 +412,7 @@ static void generate_unicode_data_header(Core::File& file, UnicodeData& unicode_
     generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size));
 
     auto generate_enum = [&](StringView name, StringView default_, Vector<String> values, Vector<Alias> unions = {}, Vector<Alias> aliases = {}, bool as_bitmask = false) {
-        VERIFY((values.size() + !default_.is_empty()) <= 64);
+        VERIFY(!as_bitmask || (values.size() <= 64));
         quick_sort(values);
         quick_sort(unions, [](auto& union1, auto& union2) { return union1.alias < union2.alias; });
         quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
@@ -476,6 +496,7 @@ namespace Unicode {
     generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
     generate_enum("GeneralCategory"sv, "None"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases, true);
     generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), {}, unicode_data.prop_aliases, true);
+    generate_enum("Script"sv, {}, unicode_data.script_list.keys(), {}, unicode_data.script_aliases);
     generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
 
     generator.append(R"~~~(
@@ -530,6 +551,7 @@ struct UnicodeData {
     u32 special_casing_size { 0 };
 
     Property properties { Property::Assigned };
+    Script script { Script::Unknown };
     WordBreakProperty word_break_property { WordBreakProperty::Other };
 };
 
@@ -538,6 +560,7 @@ namespace Detail {
 Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
 Optional<Property> property_from_string(StringView const& property);
 Optional<GeneralCategory> general_category_from_string(StringView const& general_category);
+Optional<Script> script_from_string(StringView const& script);
 
 }
 
@@ -644,6 +667,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
             first = false;
         }
 
+        generator.append(String::formatted(", Script::{}", data.script));
         generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
         generator.append(" },");
     }
@@ -746,6 +770,26 @@ Optional<GeneralCategory> general_category_from_string(StringView const& general
     return {};
 }
 
+Optional<Script> script_from_string(StringView const& script)
+{)~~~");
+
+    for (auto const& script : unicode_data.script_list) {
+        generator.set("script", script.key);
+        generator.append(R"~~~(
+    if (script == "@script@"sv)
+        return Script::@script@;)~~~");
+    }
+    for (auto const& alias : unicode_data.script_aliases) {
+        generator.set("script", alias.alias);
+        generator.append(R"~~~(
+    if (script == "@script@"sv)
+        return Script::@script@;)~~~");
+    }
+
+    generator.append(R"~~~(
+    return {};
+}
+
 }
 
 }
@@ -764,6 +808,7 @@ int main(int argc, char** argv)
     char const* derived_core_prop_path = nullptr;
     char const* prop_alias_path = nullptr;
     char const* prop_value_alias_path = nullptr;
+    char const* scripts_path = nullptr;
     char const* word_break_path = nullptr;
 
     Core::ArgsParser args_parser;
@@ -775,6 +820,7 @@ int main(int argc, char** argv)
     args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
     args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
     args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
+    args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
     args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
     args_parser.parse(argc, argv);
 
@@ -802,6 +848,7 @@ int main(int argc, char** argv)
     auto derived_core_prop_file = open_file(derived_core_prop_path, "-d/--derived-core-prop-path");
     auto prop_alias_file = open_file(prop_alias_path, "-a/--prop-alias-path");
     auto prop_value_alias_file = open_file(prop_value_alias_path, "-v/--prop-value-alias-path");
+    auto scripts_file = open_file(scripts_path, "-r/--scripts-path");
     auto word_break_file = open_file(word_break_path, "-w/--word-break-path");
 
     UnicodeData unicode_data {};
@@ -809,9 +856,12 @@ int main(int argc, char** argv)
     parse_prop_list(prop_list_file, unicode_data.prop_list);
     parse_prop_list(derived_core_prop_file, unicode_data.prop_list);
     parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases);
+    parse_prop_list(scripts_file, unicode_data.script_list);
     parse_prop_list(word_break_file, unicode_data.word_break_prop_list);
+
     parse_unicode_data(unicode_data_file, unicode_data);
     parse_value_alias_list(prop_value_alias_file, "gc"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases);
+    parse_value_alias_list(prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), {}, unicode_data.script_aliases, false);
 
     generate_unicode_data_header(generated_header_file, unicode_data);
     generate_unicode_data_implementation(generated_implementation_file, unicode_data);

+ 1 - 0
Userland/Libraries/LibUnicode/Forward.h

@@ -14,6 +14,7 @@ enum class Condition;
 enum class GeneralCategory : u64;
 enum class Locale;
 enum class Property : u64;
+enum class Script;
 enum class WordBreakProperty;
 
 struct SpecialCasing;

+ 9 - 2
Userland/Libraries/LibUnicode/unicode_data.cmake

@@ -18,6 +18,9 @@ set(PROP_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyAliases.txt)
 set(PROP_VALUE_ALIAS_URL https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt)
 set(PROP_VALUE_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyValueAliases.txt)
 
+set(SCRIPTS_URL https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt)
+set(SCRIPTS_PATH ${CMAKE_BINARY_DIR}/UCD/Scripts.txt)
+
 set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
 set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
 
@@ -46,6 +49,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
         message(STATUS "Downloading UCD PropertyValueAliases.txt from ${PROP_VALUE_ALIAS_URL}...")
         file(DOWNLOAD ${PROP_VALUE_ALIAS_URL} ${PROP_VALUE_ALIAS_PATH} INACTIVITY_TIMEOUT 10)
     endif()
+    if (NOT EXISTS ${SCRIPTS_PATH})
+        message(STATUS "Downloading UCD Scripts.txt from ${SCRIPTS_URL}...")
+        file(DOWNLOAD ${SCRIPTS_URL} ${SCRIPTS_PATH} INACTIVITY_TIMEOUT 10)
+    endif()
     if (NOT EXISTS ${WORD_BREAK_PATH})
         message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
         file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
@@ -61,9 +68,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
 
     add_custom_command(
         OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
-        COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -w ${WORD_BREAK_PATH}
+        COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -w ${WORD_BREAK_PATH}
         VERBATIM
-        DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${WORD_BREAK_PATH}
+        DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${WORD_BREAK_PATH}
     )
 
     set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})