Explorar el Código

LibUnicode: Generate code point abbreviations

Timothy Flynn hace 3 años
padre
commit
701b7810ba

+ 30 - 6
Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp

@@ -72,6 +72,7 @@ struct CodePointName {
 struct CodePointData {
     u32 code_point { 0 };
     String name;
+    Optional<StringView> abbreviation;
     u8 canonical_combining_class { 0 };
     String bidi_class;
     String decomposition_type;
@@ -101,6 +102,7 @@ struct UnicodeData {
 
     Vector<CodePointData> code_point_data;
 
+    HashMap<u32, String> code_point_abbreviations;
     HashMap<u32, String> code_point_display_name_aliases;
     Vector<CodePointName> code_point_display_names;
 
@@ -302,11 +304,12 @@ static void parse_name_aliases(Core::File& file, UnicodeData& unicode_data)
         auto alias = segments[1].trim_whitespace();
         auto reason = segments[2].trim_whitespace();
 
-        if (!reason.is_one_of("correction"sv, "control"sv))
-            continue;
-
-        if (!unicode_data.code_point_display_name_aliases.contains(*code_point))
-            unicode_data.code_point_display_name_aliases.set(*code_point, alias);
+        if (reason == "abbreviation"sv) {
+            unicode_data.code_point_abbreviations.set(*code_point, alias);
+        } else if (reason.is_one_of("correction"sv, "control"sv)) {
+            if (!unicode_data.code_point_display_name_aliases.contains(*code_point))
+                unicode_data.code_point_display_name_aliases.set(*code_point, alias);
+        }
     }
 }
 
@@ -476,6 +479,9 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
         data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
         data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
 
+        if (auto abbreviation = unicode_data.code_point_abbreviations.get(data.code_point); abbreviation.has_value())
+            data.abbreviation = *abbreviation;
+
         if (!assigned_code_point_range_start.has_value())
             assigned_code_point_range_start = data.code_point;
 
@@ -683,6 +689,11 @@ struct SpecialCaseMapping {
     u32 special_casing_size { 0 };
 };
 
+struct CodePointAbbreviation {
+    u32 code_point { 0 };
+    StringView abbreviation {};
+};
+
 template<typename MappingType>
 struct CodePointComparator {
     constexpr int operator()(u32 code_point, MappingType const& mapping)
@@ -707,7 +718,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
         for (auto const& data : unicode_data.code_point_data) {
             auto mapping = mapping_getter(data);
 
-            if constexpr (IsSame<decltype(mapping), Optional<u32>>) {
+            if constexpr (requires { mapping.has_value(); }) {
                 if (!mapping.has_value())
                     continue;
             } else {
@@ -724,6 +735,9 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
             if constexpr (IsSame<decltype(mapping), Optional<u32>>) {
                 generator.set("mapping", String::formatted("{:#x}", *mapping));
                 generator.append(", @mapping@ },");
+            } else if constexpr (IsSame<decltype(mapping), Optional<StringView>>) {
+                generator.set("mapping", String::formatted("{}", *mapping));
+                generator.append(", \"@mapping@\"sv },");
             } else {
                 append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
                 generator.append(" },");
@@ -748,6 +762,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
     append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; });
     append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
     append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
+    append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
 
     generator.append(R"~~~(
 struct CodePointRange {
@@ -891,6 +906,15 @@ Span<SpecialCasing const* const> special_case_mapping(u32 code_point)
 
     return mapping->special_casing.span().slice(0, mapping->special_casing_size);
 }
+
+Optional<StringView> code_point_abbreviation(u32 code_point)
+{
+    auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
+    if (mapping == nullptr)
+        return {};
+
+    return mapping->abbreviation;
+}
 )~~~");
 
     auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {

+ 1 - 0
Userland/Libraries/LibUnicode/CharacterTypes.cpp

@@ -22,6 +22,7 @@
 namespace Unicode {
 
 Optional<String> __attribute__((weak)) code_point_display_name(u32) { return {}; }
+Optional<StringView> __attribute__((weak)) code_point_abbreviation(u32) { return {}; }
 u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
 Span<SpecialCasing const* const> __attribute__((weak)) special_case_mapping(u32) { return {}; }
 

+ 2 - 0
Userland/Libraries/LibUnicode/CharacterTypes.h

@@ -16,6 +16,8 @@
 namespace Unicode {
 
 Optional<String> code_point_display_name(u32 code_point);
+Optional<StringView> code_point_abbreviation(u32 code_point);
+
 u32 canonical_combining_class(u32 code_point);
 Span<SpecialCasing const* const> special_case_mapping(u32 code_point);