Browse Source

LibUnicode: Generate code point display names with run-length encoding

Similar to commit becec35, our code point display name data was a large
list of StringViews. RLE can be used here as well to move about 32 MB
from the initialized data section to the read-only section.

Some of the refactoring to store strings as indices into an RLE array
also lets us clean up some of the code point name generators.
Timothy Flynn 2 years ago
parent
commit
ca92e37ae0
1 changed file with 114 additions and 81 deletions
  1. 114 81
      Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp

+ 114 - 81
Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp

@@ -20,6 +20,9 @@
 #include <LibCore/ArgsParser.h>
 #include <LibCore/ArgsParser.h>
 #include <LibCore/Stream.h>
 #include <LibCore/Stream.h>
 
 
+using StringIndexType = u16;
+constexpr auto s_string_index_type = "u16"sv;
+
 // Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code
 // Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code
 // points, as indicated by the "name" field. For example:
 // points, as indicated by the "name" field. For example:
 //     3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 //     3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
@@ -63,7 +66,7 @@ using NormalizationProps = HashMap<String, Vector<Normalization>>;
 
 
 struct CodePointName {
 struct CodePointName {
     CodePointRange code_point_range;
     CodePointRange code_point_range;
-    StringView name;
+    StringIndexType name { 0 };
 };
 };
 
 
 // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
 // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
@@ -72,7 +75,7 @@ struct CodePointName {
 struct CodePointData {
 struct CodePointData {
     u32 code_point { 0 };
     u32 code_point { 0 };
     String name;
     String name;
-    Optional<StringView> abbreviation;
+    Optional<StringIndexType> abbreviation;
     u8 canonical_combining_class { 0 };
     u8 canonical_combining_class { 0 };
     String bidi_class;
     String bidi_class;
     String decomposition_type;
     String decomposition_type;
@@ -90,10 +93,12 @@ struct CodePointData {
 
 
 struct BlockName {
 struct BlockName {
     CodePointRange code_point_range;
     CodePointRange code_point_range;
-    String name;
+    StringIndexType name { 0 };
 };
 };
 
 
 struct UnicodeData {
 struct UnicodeData {
+    UniqueStringStorage<StringIndexType> unique_strings;
+
     u32 code_points_with_non_zero_combining_class { 0 };
     u32 code_points_with_non_zero_combining_class { 0 };
 
 
     u32 simple_uppercase_mapping_size { 0 };
     u32 simple_uppercase_mapping_size { 0 };
@@ -107,8 +112,8 @@ struct UnicodeData {
 
 
     Vector<CodePointData> code_point_data;
     Vector<CodePointData> code_point_data;
 
 
-    HashMap<u32, String> code_point_abbreviations;
-    HashMap<u32, String> code_point_display_name_aliases;
+    HashMap<u32, StringIndexType> code_point_abbreviations;
+    HashMap<u32, StringIndexType> code_point_display_name_aliases;
     Vector<CodePointName> code_point_display_names;
     Vector<CodePointName> code_point_display_names;
 
 
     PropList general_categories;
     PropList general_categories;
@@ -355,10 +360,13 @@ static ErrorOr<void> parse_name_aliases(Core::Stream::BufferedFile& file, Unicod
         auto reason = segments[2].trim_whitespace();
         auto reason = segments[2].trim_whitespace();
 
 
         if (reason == "abbreviation"sv) {
         if (reason == "abbreviation"sv) {
-            unicode_data.code_point_abbreviations.set(*code_point, alias);
+            auto index = unicode_data.unique_strings.ensure(alias);
+            unicode_data.code_point_abbreviations.set(*code_point, index);
         } else if (reason.is_one_of("correction"sv, "control"sv)) {
         } else if (reason.is_one_of("correction"sv, "control"sv)) {
-            if (!unicode_data.code_point_display_name_aliases.contains(*code_point))
-                unicode_data.code_point_display_name_aliases.set(*code_point, alias);
+            if (!unicode_data.code_point_display_name_aliases.contains(*code_point)) {
+                auto index = unicode_data.unique_strings.ensure(alias);
+                unicode_data.code_point_display_name_aliases.set(*code_point, index);
+            }
         }
         }
     }
     }
 
 
@@ -459,8 +467,13 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name,
     // https://www.unicode.org/versions/Unicode14.0.0/ch04.pdf#G142981
     // https://www.unicode.org/versions/Unicode14.0.0/ch04.pdf#G142981
     // FIXME: Implement the NR1 rules for Hangul syllables.
     // FIXME: Implement the NR1 rules for Hangul syllables.
 
 
+    struct CodePointNameFormat {
+        CodePointRange code_point_range;
+        StringView name;
+    };
+
     // These code point ranges are the NR2 set of name replacements defined by Table 4-8.
     // These code point ranges are the NR2 set of name replacements defined by Table 4-8.
-    constexpr Array<CodePointName, 15> s_ideographic_replacements { {
+    constexpr Array<CodePointNameFormat, 15> s_ideographic_replacements { {
         { { 0x3400, 0x4DBF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
         { { 0x3400, 0x4DBF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
         { { 0x4E00, 0x9FFC }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
         { { 0x4E00, 0x9FFC }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
         { { 0xF900, 0xFA6D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
         { { 0xF900, 0xFA6D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
@@ -484,7 +497,8 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name,
         });
         });
 
 
     if (it != s_ideographic_replacements.end()) {
     if (it != s_ideographic_replacements.end()) {
-        unicode_data.code_point_display_names.append(*it);
+        auto index = unicode_data.unique_strings.ensure(it->name);
+        unicode_data.code_point_display_names.append({ it->code_point_range, index });
         return;
         return;
     }
     }
 
 
@@ -505,7 +519,8 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name,
         return;
         return;
     }
     }
 
 
-    unicode_data.code_point_display_names.append({ range, name });
+    auto index = unicode_data.unique_strings.ensure(name);
+    unicode_data.code_point_display_names.append({ range, index });
 }
 }
 
 
 static ErrorOr<void> parse_block_display_names(Core::Stream::BufferedFile& file, UnicodeData& unicode_data)
 static ErrorOr<void> parse_block_display_names(Core::Stream::BufferedFile& file, UnicodeData& unicode_data)
@@ -521,7 +536,9 @@ static ErrorOr<void> parse_block_display_names(Core::Stream::BufferedFile& file,
 
 
         auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
         auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
         auto display_name = segments[1].trim_whitespace();
         auto display_name = segments[1].trim_whitespace();
-        unicode_data.block_display_names.append({ code_point_range, display_name });
+
+        auto index = unicode_data.unique_strings.ensure(display_name);
+        unicode_data.block_display_names.append({ code_point_range, index });
     }
     }
 
 
     TRY(file.seek(0, Core::Stream::SeekMode::SetPosition));
     TRY(file.seek(0, Core::Stream::SeekMode::SetPosition));
@@ -713,6 +730,7 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
     StringBuilder builder;
     StringBuilder builder;
     SourceGenerator generator { builder };
     SourceGenerator generator { builder };
 
 
+    generator.set("string_index_type"sv, s_string_index_type);
     generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size));
     generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size));
     generator.set("special_casing_size", String::number(unicode_data.special_casing.size()));
     generator.set("special_casing_size", String::number(unicode_data.special_casing.size()));
 
 
@@ -730,6 +748,8 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
 namespace Unicode {
 namespace Unicode {
 )~~~");
 )~~~");
 
 
+    unicode_data.unique_strings.generate(generator);
+
     auto append_list_and_size = [&](auto const& list, StringView format) {
     auto append_list_and_size = [&](auto const& list, StringView format) {
         if (list.is_empty()) {
         if (list.is_empty()) {
             generator.append(", {}, 0");
             generator.append(", {}, 0");
@@ -784,7 +804,7 @@ struct SpecialCaseMapping {
 
 
 struct CodePointAbbreviation {
 struct CodePointAbbreviation {
     u32 code_point { 0 };
     u32 code_point { 0 };
-    StringView abbreviation {};
+    @string_index_type@ abbreviation { 0 };
 };
 };
 
 
 template<typename MappingType>
 template<typename MappingType>
@@ -794,6 +814,37 @@ struct CodePointComparator {
         return code_point - mapping.code_point;
         return code_point - mapping.code_point;
     }
     }
 };
 };
+
+struct CodePointRangeComparator {
+    constexpr int operator()(u32 code_point, CodePointRange const& range)
+    {
+        return (code_point > range.last) - (code_point < range.first);
+    }
+};
+
+struct BlockNameData {
+    CodePointRange code_point_range {};
+    @string_index_type@ display_name { 0 };
+};
+
+struct BlockNameComparator : public CodePointRangeComparator {
+    constexpr int operator()(u32 code_point, BlockNameData const& name)
+    {
+        return CodePointRangeComparator::operator()(code_point, name.code_point_range);
+    }
+};
+
+struct CodePointName {
+    CodePointRange code_point_range {};
+    @string_index_type@ display_name { 0 };
+};
+
+struct CodePointNameComparator : public CodePointRangeComparator {
+    constexpr int operator()(u32 code_point, CodePointName const& name)
+    {
+        return CodePointRangeComparator::operator()(code_point, name.code_point_range);
+    }
+};
 )~~~");
 )~~~");
 
 
     auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) {
     auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) {
@@ -825,12 +876,9 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
             generator.set("code_point", String::formatted("{:#x}", data.code_point));
             generator.set("code_point", String::formatted("{:#x}", data.code_point));
             generator.append("{ @code_point@");
             generator.append("{ @code_point@");
 
 
-            if constexpr (IsSame<decltype(mapping), Optional<u32>>) {
+            if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<StringIndexType>>) {
                 generator.set("mapping", String::formatted("{:#x}", *mapping));
                 generator.set("mapping", String::formatted("{:#x}", *mapping));
                 generator.append(", @mapping@ },");
                 generator.append(", @mapping@ },");
-            } else if constexpr (IsSame<decltype(mapping), Optional<StringView>>) {
-                generator.set("mapping", String::formatted("{}", *mapping));
-                generator.append(", \"@mapping@\"sv },");
             } else {
             } else {
                 append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
                 append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
                 generator.append(" },");
                 generator.append(" },");
@@ -857,16 +905,6 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
     append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
     append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
     append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
     append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
 
 
-    generator.append(R"~~~(
-struct CodePointRangeComparator {
-    constexpr int operator()(u32 code_point, CodePointRange const& range)
-    {
-        return (code_point > range.last) - (code_point < range.first);
-    }
-};
-
-)~~~");
-
     auto append_code_point_range_list = [&](String name, Vector<CodePointRange> const& ranges) {
     auto append_code_point_range_list = [&](String name, Vector<CodePointRange> const& ranges) {
         generator.set("name", name);
         generator.set("name", name);
         generator.set("size", String::number(ranges.size()));
         generator.set("size", String::number(ranges.size()));
@@ -930,80 +968,73 @@ static constexpr Array<Span<CodePointRange const>, @size@> @name@ { {)~~~");
     append_prop_list("s_word_break_properties"sv, "s_word_break_property_{}"sv, unicode_data.word_break_props);
     append_prop_list("s_word_break_properties"sv, "s_word_break_property_{}"sv, unicode_data.word_break_props);
     append_prop_list("s_sentence_break_properties"sv, "s_sentence_break_property_{}"sv, unicode_data.sentence_break_props);
     append_prop_list("s_sentence_break_properties"sv, "s_sentence_break_property_{}"sv, unicode_data.sentence_break_props);
 
 
-    generator.append(R"~~~(
-struct BlockNameComparator : public CodePointRangeComparator {
-    constexpr int operator()(u32 code_point, BlockName const& name)
-    {
-        return CodePointRangeComparator::operator()(code_point, name.code_point_range);
-    }
-};
-)~~~");
+    auto append_code_point_display_names = [&](StringView type, StringView name, auto const& display_names) {
+        constexpr size_t max_values_per_row = 30;
+        size_t values_in_current_row = 0;
 
 
-    generator.set("block_display_names_size", String::number(unicode_data.block_display_names.size()));
-    generator.append(R"~~~(
-static constexpr Array<BlockName, @block_display_names_size@> s_block_display_names { {
-)~~~");
-    for (auto const& block_name : unicode_data.block_display_names) {
-        generator.set("first", String::formatted("{:#x}", block_name.code_point_range.first));
-        generator.set("last", String::formatted("{:#x}", block_name.code_point_range.last));
-        generator.set("name", block_name.name);
-        generator.append(R"~~~(    { { @first@, @last@ }, "@name@"sv },
-)~~~");
-    }
-    generator.append(R"~~~(} };
+        generator.set("type", type);
+        generator.set("name", name);
+        generator.set("size", String::number(display_names.size()));
+
+        generator.append(R"~~~(
+static constexpr Array<@type@, @size@> @name@ { {
+    )~~~");
+        for (auto const& display_name : display_names) {
+            if (values_in_current_row++ > 0)
+                generator.append(", ");
+
+            generator.set("first", String::formatted("{:#x}", display_name.code_point_range.first));
+            generator.set("last", String::formatted("{:#x}", display_name.code_point_range.last));
+            generator.set("name", String::number(display_name.name));
+            generator.append("{ { @first@, @last@ }, @name@ }");
+
+            if (values_in_current_row == max_values_per_row) {
+                values_in_current_row = 0;
+                generator.append(",\n    ");
+            }
+        }
+        generator.append(R"~~~(
+} };
 )~~~");
 )~~~");
+    };
+
+    append_code_point_display_names("BlockNameData"sv, "s_block_display_names"sv, unicode_data.block_display_names);
+    append_code_point_display_names("CodePointName"sv, "s_code_point_display_names"sv, unicode_data.code_point_display_names);
 
 
     generator.append(R"~~~(
     generator.append(R"~~~(
 Optional<StringView> code_point_block_display_name(u32 code_point)
 Optional<StringView> code_point_block_display_name(u32 code_point)
 {
 {
     if (auto const* entry = binary_search(s_block_display_names, code_point, nullptr, BlockNameComparator {}))
     if (auto const* entry = binary_search(s_block_display_names, code_point, nullptr, BlockNameComparator {}))
-        return entry->display_name;
+        return decode_string(entry->display_name);
 
 
     return {};
     return {};
 }
 }
 
 
 Span<BlockName const> block_display_names()
 Span<BlockName const> block_display_names()
 {
 {
-    return s_block_display_names;
-}
-)~~~");
+    static auto display_names = []() {
+        Array<BlockName, s_block_display_names.size()> display_names;
 
 
-    generator.append(R"~~~(
-struct CodePointName {
-    CodePointRange code_point_range {};
-    StringView display_name;
-};
+        for (size_t i = 0; i < s_block_display_names.size(); ++i) {
+            auto const& display_name = s_block_display_names[i];
+            display_names[i] = { display_name.code_point_range, decode_string(display_name.display_name) };
+        }
 
 
-struct CodePointNameComparator : public CodePointRangeComparator {
-    constexpr int operator()(u32 code_point, CodePointName const& name)
-    {
-        return CodePointRangeComparator::operator()(code_point, name.code_point_range);
-    }
-};
-)~~~");
+        return display_names;
+    }();
 
 
-    generator.set("code_point_display_names_size", String::number(unicode_data.code_point_display_names.size()));
-    generator.append(R"~~~(
-static constexpr Array<CodePointName, @code_point_display_names_size@> s_code_point_display_names { {
-)~~~");
-    for (auto const& code_point_name : unicode_data.code_point_display_names) {
-        generator.set("first", String::formatted("{:#x}", code_point_name.code_point_range.first));
-        generator.set("last", String::formatted("{:#x}", code_point_name.code_point_range.last));
-        generator.set("name", code_point_name.name);
-        generator.append(R"~~~(    { { @first@, @last@ }, "@name@"sv },
-)~~~");
-    }
-    generator.append(R"~~~(} };
-)~~~");
+    return display_names.span();
+}
 
 
-    generator.append(R"~~~(
 Optional<String> code_point_display_name(u32 code_point)
 Optional<String> code_point_display_name(u32 code_point)
 {
 {
     if (auto const* entry = binary_search(s_code_point_display_names, code_point, nullptr, CodePointNameComparator {})) {
     if (auto const* entry = binary_search(s_code_point_display_names, code_point, nullptr, CodePointNameComparator {})) {
-        if (entry->display_name.ends_with("{:X}"sv))
-            return String::formatted(entry->display_name, code_point);
+        auto display_name = decode_string(entry->display_name);
 
 
-        return entry->display_name;
+        if (display_name.ends_with("{:X}"sv))
+            return String::formatted(display_name, code_point);
+
+        return display_name;
     }
     }
 
 
     return {};
     return {};
@@ -1042,8 +1073,10 @@ Optional<StringView> code_point_abbreviation(u32 code_point)
     auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
     auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
     if (mapping == nullptr)
     if (mapping == nullptr)
         return {};
         return {};
+    if (mapping->abbreviation == 0)
+        return {};
 
 
-    return mapping->abbreviation;
+    return decode_string(mapping->abbreviation);
 }
 }
 )~~~");
 )~~~");