4 éve · 5ac23d244d
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -305,3 +305,73 @@ TEST_CASE(general_category)
 
				         EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
			
 
				     }
			
 
				 }
			
 
				+
			
 
				+TEST_CASE(property)
			
 
				+{
			
 
				+    auto property = [](StringView name) {
			
 
				+        auto property = Unicode::property_from_string(name);
			
 
				+        VERIFY(property.has_value());
			
 
				+        return *property;
			
 
				+    };
			
 
				+
			
 
				+    auto property_any = property("Any"sv);
			
 
				+    auto property_assigned = property("Assigned"sv);
			
 
				+    auto property_ascii = property("ASCII"sv);
			
 
				+
			
 
				+    auto property_white_space = property("White_Space"sv);
			
 
				+    auto property_wspace = property("WSpace"sv);
			
 
				+    auto property_space = property("space"sv);
			
 
				+    EXPECT_EQ(property_white_space, property_wspace);
			
 
				+    EXPECT_EQ(property_white_space, property_space);
			
 
				+
			
 
				+    auto property_emoji_presentation = property("Emoji_Presentation"sv);
			
 
				+    auto property_epres = property("EPres"sv);
			
 
				+    EXPECT_EQ(property_emoji_presentation, property_epres);
			
 
				+
			
 
				+    for (u32 code_point = 0; code_point <= 0x10ffff; code_point += 1000)
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_any));
			
 
				+
			
 
				+    for (u32 code_point = 0x101d0; code_point <= 0x101fd; ++code_point) {
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_any));
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
			
 
				+
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
			
 
				+    }
			
 
				+
			
 
				+    for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_any));
			
 
				+
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_assigned));
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
			
 
				+    }
			
 
				+
			
 
				+    for (u32 code_point = 0; code_point <= 0x7f; ++code_point) {
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_any));
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
			
 
				+
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
			
 
				+    }
			
 
				+
			
 
				+    for (u32 code_point = 0x9; code_point <= 0xd; ++code_point) {
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_any));
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_white_space));
			
 
				+
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
			
 
				+    }
			
 
				+
			
 
				+    for (u32 code_point = 0x1f3e5; code_point <= 0x1f3f0; ++code_point) {
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_any));
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
			
 
				+        EXPECT(Unicode::code_point_has_property(code_point, property_emoji_presentation));
			
 
				+
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
			
 
				+        EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
			
 
				+    }
			
 
				+}
			
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -22,11 +22,6 @@ namespace Unicode {
 
				 
			
 
				 #if ENABLE_UNICODE_DATA
			
 
				 
			
 
				-static bool has_property(UnicodeData const& unicode_data, Property property)
			
 
				-{
			
 
				-    return (unicode_data.properties & property) == property;
			
 
				-}
			
 
				-
			
 
				 static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
			
 
				 {
			
 
				     // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
			
@@ -40,12 +35,8 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
 
				     size_t cased_letter_count = 0;
			
 
				 
			
 
				     for (auto code_point : preceding_view) {
			
 
				-        auto unicode_data = Detail::unicode_data_for_code_point(code_point);
			
 
				-        if (!unicode_data.has_value())
			
 
				-            return false;
			
 
				-
			
 
				-        bool is_cased = has_property(*unicode_data, Property::Cased);
			
 
				-        bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable);
			
 
				+        bool is_cased = code_point_has_property(code_point, Property::Cased);
			
 
				+        bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
			
 
				 
			
 
				         if (is_cased && !is_case_ignorable)
			
 
				             ++cased_letter_count;
			
@@ -57,12 +48,8 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
 
				         return false;
			
 
				 
			
 
				     for (auto code_point : following_view) {
			
 
				-        auto unicode_data = Detail::unicode_data_for_code_point(code_point);
			
 
				-        if (!unicode_data.has_value())
			
 
				-            return false;
			
 
				-
			
 
				-        bool is_cased = has_property(*unicode_data, Property::Cased);
			
 
				-        bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable);
			
 
				+        bool is_cased = code_point_has_property(code_point, Property::Cased);
			
 
				+        bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
			
 
				 
			
 
				         if (is_case_ignorable)
			
 
				             continue;
			
@@ -227,14 +214,7 @@ Optional<Property> property_from_string([[maybe_unused]] StringView const& prope
 
				 bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
			
 
				 {
			
 
				 #if ENABLE_UNICODE_DATA
			
 
				-    if (property == Property::Any)
			
 
				-        return is_unicode(code_point);
			
 
				-
			
 
				-    auto unicode_data = Detail::unicode_data_for_code_point(code_point);
			
 
				-    if (!unicode_data.has_value())
			
 
				-        return false;
			
 
				-
			
 
				-    return has_property(*unicode_data, property);
			
 
				+    return Detail::code_point_has_property(code_point, property);
			
 
				 #else
			
 
				     return false;
			
 
				 #endif
			
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@@ -68,7 +68,6 @@ struct CodePointData {
 
				     Optional<u32> simple_lowercase_mapping;
			
 
				     Optional<u32> simple_titlecase_mapping;
			
 
				     Vector<u32> special_casing_indices;
			
 
				-    Vector<StringView> prop_list;
			
 
				     StringView script;
			
 
				     Vector<StringView> script_extensions;
			
 
				 };
			
@@ -87,12 +86,11 @@ struct UnicodeData {
 
				     Vector<Alias> general_category_aliases;
			
 
				 
			
 
				     // The Unicode standard defines additional properties (Any, Assigned, ASCII) which are not in
			
 
				-    // any UCD file. Assigned is set as the default enum value 0 so "property & Assigned == Assigned"
			
 
				-    // is always true. Any is not assigned code points here because this file only parses assigned
			
 
				-    // code points, whereas Any will include unassigned code points.
			
 
				+    // any UCD file. Assigned code point ranges are derived as this generator is executed.
			
 
				     // https://unicode.org/reports/tr18/#General_Category_Property
			
 
				     PropList prop_list {
			
 
				-        { "Any"sv, {} },
			
 
				+        { "Any"sv, { { 0, 0x10ffff } } },
			
 
				+        { "Assigned"sv, {} },
			
 
				         { "ASCII"sv, { { 0, 0x7f } } },
			
 
				     };
			
 
				     Vector<Alias> prop_aliases;
			
@@ -309,6 +307,10 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
 
				 {
			
 
				     Optional<u32> code_point_range_start;
			
 
				 
			
 
				+    auto& assigned_code_points = unicode_data.prop_list.find("Assigned"sv)->value;
			
 
				+    Optional<u32> assigned_code_point_range_start = 0;
			
 
				+    u32 previous_code_point = 0;
			
 
				+
			
 
				     auto assign_code_point_property = [&](u32 code_point, auto const& list, auto& property, StringView default_) {
			
 
				         using PropertyType = RemoveCVReference<decltype(property)>;
			
 
				         constexpr bool is_single_item = IsSame<PropertyType, StringView>;
			
@@ -361,18 +363,31 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
 
				         data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
			
 
				         data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
			
 
				 
			
 
				+        if (!assigned_code_point_range_start.has_value())
			
 
				+            assigned_code_point_range_start = data.code_point;
			
 
				+
			
 
				         if (data.name.starts_with("<"sv) && data.name.ends_with(", First>")) {
			
 
				-            VERIFY(!code_point_range_start.has_value());
			
 
				+            VERIFY(!code_point_range_start.has_value() && assigned_code_point_range_start.has_value());
			
 
				             code_point_range_start = data.code_point;
			
 
				 
			
 
				             data.name = data.name.substring(1, data.name.length() - 9);
			
 
				+
			
 
				+            assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point });
			
 
				+            assigned_code_point_range_start.clear();
			
 
				         } else if (data.name.starts_with("<"sv) && data.name.ends_with(", Last>")) {
			
 
				             VERIFY(code_point_range_start.has_value());
			
 
				 
			
 
				-            unicode_data.code_point_ranges.append({ *code_point_range_start, data.code_point });
			
 
				-            data.name = data.name.substring(1, data.name.length() - 8);
			
 
				+            CodePointRange code_point_range { *code_point_range_start, data.code_point };
			
 
				+            unicode_data.code_point_ranges.append(code_point_range);
			
 
				+            assigned_code_points.append(code_point_range);
			
 
				 
			
 
				+            data.name = data.name.substring(1, data.name.length() - 8);
			
 
				             code_point_range_start.clear();
			
 
				+        } else if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) {
			
 
				+            VERIFY(assigned_code_point_range_start.has_value());
			
 
				+
			
 
				+            assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point });
			
 
				+            assigned_code_point_range_start = data.code_point;
			
 
				         }
			
 
				 
			
 
				         for (auto const& casing : unicode_data.special_casing) {
			
@@ -380,12 +395,13 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
 
				                 data.special_casing_indices.append(casing.index);
			
 
				         }
			
 
				 
			
 
				-        assign_code_point_property(data.code_point, unicode_data.prop_list, data.prop_list, "Assigned"sv);
			
 
				         assign_code_point_property(data.code_point, unicode_data.script_list, data.script, "Unknown"sv);
			
 
				         assign_code_point_property(data.code_point, unicode_data.script_extensions, data.script_extensions, {});
			
 
				 
			
 
				         unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
			
 
				         unicode_data.largest_script_extensions_size = max(unicode_data.largest_script_extensions_size, data.script_extensions.size());
			
 
				+
			
 
				+        previous_code_point = data.code_point;
			
 
				         unicode_data.code_point_data.append(move(data));
			
 
				     }
			
 
				 }
			
@@ -398,17 +414,15 @@ static void generate_unicode_data_header(Core::File& file, UnicodeData& unicode_
 
				     generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size));
			
 
				     generator.set("script_extensions_size", String::number(unicode_data.largest_script_extensions_size));
			
 
				 
			
 
				-    auto generate_enum = [&](StringView name, StringView default_, Vector<String> values, Vector<Alias> aliases = {}, bool as_bitmask = false) {
			
 
				-        VERIFY(!as_bitmask || (values.size() <= 64));
			
 
				+    auto generate_enum = [&](StringView name, StringView default_, Vector<String> values, Vector<Alias> aliases = {}) {
			
 
				         quick_sort(values);
			
 
				         quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
			
 
				 
			
 
				         generator.set("name", name);
			
 
				         generator.set("underlying", String::formatted("{}UnderlyingType", name));
			
 
				-        generator.set("underlying_type", as_bitmask ? "u64"sv : "u8"sv);
			
 
				 
			
 
				         generator.append(R"~~~(
			
 
				-using @underlying@ = @underlying_type@;
			
 
				+using @underlying@ = u8;
			
 
				 
			
 
				 enum class @name@ : @underlying@ {)~~~");
			
 
				 
			
@@ -418,18 +432,10 @@ enum class @name@ : @underlying@ {)~~~");
 
				     @default@,)~~~");
			
 
				         }
			
 
				 
			
 
				-        u8 index = 0;
			
 
				         for (auto const& value : values) {
			
 
				             generator.set("value", value);
			
 
				-
			
 
				-            if (as_bitmask) {
			
 
				-                generator.set("index", String::number(index++));
			
 
				-                generator.append(R"~~~(
			
 
				-    @value@ = static_cast<@underlying@>(1) << @index@,)~~~");
			
 
				-            } else {
			
 
				-                generator.append(R"~~~(
			
 
				+            generator.append(R"~~~(
			
 
				     @value@,)~~~");
			
 
				-            }
			
 
				         }
			
 
				 
			
 
				         for (auto const& alias : aliases) {
			
@@ -442,20 +448,6 @@ enum class @name@ : @underlying@ {)~~~");
 
				         generator.append(R"~~~(
			
 
				 };
			
 
				 )~~~");
			
 
				-
			
 
				-        if (as_bitmask) {
			
 
				-            generator.append(R"~~~(
			
 
				-constexpr @name@ operator&(@name@ value1, @name@ value2)
			
 
				-{
			
 
				-    return static_cast<@name@>(static_cast<@underlying@>(value1) & static_cast<@underlying@>(value2));
			
 
				-}
			
 
				-
			
 
				-constexpr @name@ operator|(@name@ value1, @name@ value2)
			
 
				-{
			
 
				-    return static_cast<@name@>(static_cast<@underlying@>(value1) | static_cast<@underlying@>(value2));
			
 
				-}
			
 
				-)~~~");
			
 
				-        }
			
 
				     };
			
 
				 
			
 
				     generator.append(R"~~~(
			
@@ -471,7 +463,7 @@ namespace Unicode {
 
				     generate_enum("Locale"sv, "None"sv, move(unicode_data.locales));
			
 
				     generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
			
 
				     generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
			
 
				-    generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), unicode_data.prop_aliases, true);
			
 
				+    generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
			
 
				     generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
			
 
				 
			
 
				     generator.append(R"~~~(
			
@@ -524,8 +516,6 @@ struct UnicodeData {
 
				     SpecialCasing const* special_casing[@special_casing_size@] {};
			
 
				     u32 special_casing_size { 0 };
			
 
				 
			
 
				-    Property properties { Property::Assigned };
			
 
				-
			
 
				     Script script { Script::Unknown };
			
 
				     Script script_extensions[@script_extensions_size@];
			
 
				     u32 script_extensions_size { 0 };
			
@@ -538,6 +528,7 @@ Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
 
				 bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
			
 
				 Optional<GeneralCategory> general_category_from_string(StringView const& general_category);
			
 
				 
			
 
				+bool code_point_has_property(u32 code_point, Property property);
			
 
				 Optional<Property> property_from_string(StringView const& property);
			
 
				 
			
 
				 Optional<Script> script_from_string(StringView const& script);
			
@@ -639,14 +630,6 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
 
				         append_field("simple_lowercase_mapping", String::formatted("{:#x}", data.simple_lowercase_mapping.value_or(data.code_point)));
			
 
				         append_field("simple_titlecase_mapping", String::formatted("{:#x}", data.simple_titlecase_mapping.value_or(data.code_point)));
			
 
				         append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
			
 
				-
			
 
				-        bool first = true;
			
 
				-        for (auto const& property : data.prop_list) {
			
 
				-            generator.append(first ? ", " : " | ");
			
 
				-            generator.append(String::formatted("Property::{}", property));
			
 
				-            first = false;
			
 
				-        }
			
 
				-
			
 
				         generator.append(String::formatted(", Script::{}", data.script));
			
 
				         append_list_and_size(data.script_extensions, "Script::{}"sv);
			
 
				         generator.append(" },");
			
@@ -724,6 +707,7 @@ static constexpr Array<Span<CodePointRange const>, @size@> @name@ { {)~~~");
 
				     };
			
 
				 
			
 
				     append_prop_list("s_general_categories"sv, "s_general_category_{}"sv, unicode_data.general_categories);
			
 
				+    append_prop_list("s_properties"sv, "s_property_{}"sv, unicode_data.prop_list);
			
 
				 
			
 
				     generator.append(R"~~~(
			
 
				 static HashMap<u32, UnicodeData const*> const& ensure_code_point_map()
			
@@ -812,7 +796,11 @@ Optional<GeneralCategory> general_category_from_string(StringView const& general
 
				     generator.append(R"~~~(
			
 
				     return {};
			
 
				 }
			
 
				+)~~~");
			
 
				+
			
 
				+    append_prop_search("Property"sv, "property"sv, "s_properties"sv);
			
 
				 
			
 
				+    generator.append(R"~~~(
			
 
				 Optional<Property> property_from_string(StringView const& property)
			
 
				 {
			
 
				     if (property == "Assigned"sv)
			
--- a/Userland/Libraries/LibUnicode/Forward.h
+++ b/Userland/Libraries/LibUnicode/Forward.h
@@ -13,7 +13,7 @@ namespace Unicode {
 
				 enum class Condition : u8;
			
 
				 enum class GeneralCategory : u8;
			
 
				 enum class Locale : u8;
			
 
				-enum class Property : u64;
			
 
				+enum class Property : u8;
			
 
				 enum class Script : u8;
			
 
				 enum class WordBreakProperty : u8;