From ca92e37ae0b851ccda2c8c52a379fa5ca0c376b8 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 17 Aug 2022 09:52:26 -0400 Subject: [PATCH] LibUnicode: Generate code point display names with run-length encoding Similar to commit becec35, our code point display name data was a large list of StringViews. RLE can be used here as well to move about 32 MB from the initialized data section to the read-only section. Some of the refactoring to store strings as indices into an RLE array also lets us clean up some of the code point name generators. --- .../LibUnicode/GenerateUnicodeData.cpp | 199 ++++++++++-------- 1 file changed, 116 insertions(+), 83 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 29067bb499f..4330628d869 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -20,6 +20,9 @@ #include #include +using StringIndexType = u16; +constexpr auto s_string_index_type = "u16"sv; + // Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code // points, as indicated by the "name" field. 
For example: // 3400;;Lo;0;L;;;;;N;;;;; @@ -63,7 +66,7 @@ using NormalizationProps = HashMap>; struct CodePointName { CodePointRange code_point_range; - StringView name; + StringIndexType name { 0 }; }; // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt @@ -72,7 +75,7 @@ struct CodePointName { struct CodePointData { u32 code_point { 0 }; String name; - Optional abbreviation; + Optional abbreviation; u8 canonical_combining_class { 0 }; String bidi_class; String decomposition_type; @@ -90,10 +93,12 @@ struct CodePointData { struct BlockName { CodePointRange code_point_range; - String name; + StringIndexType name { 0 }; }; struct UnicodeData { + UniqueStringStorage unique_strings; + u32 code_points_with_non_zero_combining_class { 0 }; u32 simple_uppercase_mapping_size { 0 }; @@ -107,8 +112,8 @@ struct UnicodeData { Vector code_point_data; - HashMap code_point_abbreviations; - HashMap code_point_display_name_aliases; + HashMap code_point_abbreviations; + HashMap code_point_display_name_aliases; Vector code_point_display_names; PropList general_categories; @@ -355,10 +360,13 @@ static ErrorOr parse_name_aliases(Core::Stream::BufferedFile& file, Unicod auto reason = segments[2].trim_whitespace(); if (reason == "abbreviation"sv) { - unicode_data.code_point_abbreviations.set(*code_point, alias); + auto index = unicode_data.unique_strings.ensure(alias); + unicode_data.code_point_abbreviations.set(*code_point, index); } else if (reason.is_one_of("correction"sv, "control"sv)) { - if (!unicode_data.code_point_display_name_aliases.contains(*code_point)) - unicode_data.code_point_display_name_aliases.set(*code_point, alias); + if (!unicode_data.code_point_display_name_aliases.contains(*code_point)) { + auto index = unicode_data.unique_strings.ensure(alias); + unicode_data.code_point_display_name_aliases.set(*code_point, index); + } } } @@ -459,8 +467,13 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name, // 
https://www.unicode.org/versions/Unicode14.0.0/ch04.pdf#G142981 // FIXME: Implement the NR1 rules for Hangul syllables. + struct CodePointNameFormat { + CodePointRange code_point_range; + StringView name; + }; + // These code point ranges are the NR2 set of name replacements defined by Table 4-8. - constexpr Array s_ideographic_replacements { { + constexpr Array s_ideographic_replacements { { { { 0x3400, 0x4DBF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv }, { { 0x4E00, 0x9FFC }, "CJK UNIFIED IDEOGRAPH-{:X}"sv }, { { 0xF900, 0xFA6D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv }, @@ -484,7 +497,8 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name, }); if (it != s_ideographic_replacements.end()) { - unicode_data.code_point_display_names.append(*it); + auto index = unicode_data.unique_strings.ensure(it->name); + unicode_data.code_point_display_names.append({ it->code_point_range, index }); return; } @@ -505,7 +519,8 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name, return; } - unicode_data.code_point_display_names.append({ range, name }); + auto index = unicode_data.unique_strings.ensure(name); + unicode_data.code_point_display_names.append({ range, index }); } static ErrorOr parse_block_display_names(Core::Stream::BufferedFile& file, UnicodeData& unicode_data) @@ -521,7 +536,9 @@ static ErrorOr parse_block_display_names(Core::Stream::BufferedFile& file, auto code_point_range = parse_code_point_range(segments[0].trim_whitespace()); auto display_name = segments[1].trim_whitespace(); - unicode_data.block_display_names.append({ code_point_range, display_name }); + + auto index = unicode_data.unique_strings.ensure(display_name); + unicode_data.block_display_names.append({ code_point_range, index }); } TRY(file.seek(0, Core::Stream::SeekMode::SetPosition)); @@ -713,6 +730,7 @@ static ErrorOr generate_unicode_data_implementation(Core::Stream::Buffered StringBuilder builder; SourceGenerator generator { builder }; + 
generator.set("string_index_type"sv, s_string_index_type); generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size)); generator.set("special_casing_size", String::number(unicode_data.special_casing.size())); @@ -730,6 +748,8 @@ static ErrorOr generate_unicode_data_implementation(Core::Stream::Buffered namespace Unicode { )~~~"); + unicode_data.unique_strings.generate(generator); + auto append_list_and_size = [&](auto const& list, StringView format) { if (list.is_empty()) { generator.append(", {}, 0"); @@ -784,7 +804,7 @@ struct SpecialCaseMapping { struct CodePointAbbreviation { u32 code_point { 0 }; - StringView abbreviation {}; + @string_index_type@ abbreviation { 0 }; }; template @@ -794,6 +814,37 @@ struct CodePointComparator { return code_point - mapping.code_point; } }; + +struct CodePointRangeComparator { + constexpr int operator()(u32 code_point, CodePointRange const& range) + { + return (code_point > range.last) - (code_point < range.first); + } +}; + +struct BlockNameData { + CodePointRange code_point_range {}; + @string_index_type@ display_name { 0 }; +}; + +struct BlockNameComparator : public CodePointRangeComparator { + constexpr int operator()(u32 code_point, BlockNameData const& name) + { + return CodePointRangeComparator::operator()(code_point, name.code_point_range); + } +}; + +struct CodePointName { + CodePointRange code_point_range {}; + @string_index_type@ display_name { 0 }; +}; + +struct CodePointNameComparator : public CodePointRangeComparator { + constexpr int operator()(u32 code_point, CodePointName const& name) + { + return CodePointRangeComparator::operator()(code_point, name.code_point_range); + } +}; )~~~"); auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) { @@ -825,12 +876,9 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { generator.set("code_point", String::formatted("{:#x}", data.code_point)); 
generator.append("{ @code_point@"); - if constexpr (IsSame>) { + if constexpr (IsSame> || IsSame>) { generator.set("mapping", String::formatted("{:#x}", *mapping)); generator.append(", @mapping@ },"); - } else if constexpr (IsSame>) { - generator.set("mapping", String::formatted("{}", *mapping)); - generator.append(", \"@mapping@\"sv },"); } else { append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv); generator.append(" },"); @@ -857,16 +905,6 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; }); append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); - generator.append(R"~~~( -struct CodePointRangeComparator { - constexpr int operator()(u32 code_point, CodePointRange const& range) - { - return (code_point > range.last) - (code_point < range.first); - } -}; - -)~~~"); - auto append_code_point_range_list = [&](String name, Vector const& ranges) { generator.set("name", name); generator.set("size", String::number(ranges.size())); @@ -930,80 +968,73 @@ static constexpr Array, @size@> @name@ { {)~~~"); append_prop_list("s_word_break_properties"sv, "s_word_break_property_{}"sv, unicode_data.word_break_props); append_prop_list("s_sentence_break_properties"sv, "s_sentence_break_property_{}"sv, unicode_data.sentence_break_props); - generator.append(R"~~~( -struct BlockNameComparator : public CodePointRangeComparator { - constexpr int operator()(u32 code_point, BlockName const& name) - { - return CodePointRangeComparator::operator()(code_point, name.code_point_range); - } -}; -)~~~"); + auto append_code_point_display_names = [&](StringView type, StringView name, auto const& display_names) { + constexpr size_t max_values_per_row = 30; + size_t 
values_in_current_row = 0; - generator.set("block_display_names_size", String::number(unicode_data.block_display_names.size())); - generator.append(R"~~~( -static constexpr Array s_block_display_names { { -)~~~"); - for (auto const& block_name : unicode_data.block_display_names) { - generator.set("first", String::formatted("{:#x}", block_name.code_point_range.first)); - generator.set("last", String::formatted("{:#x}", block_name.code_point_range.last)); - generator.set("name", block_name.name); - generator.append(R"~~~( { { @first@, @last@ }, "@name@"sv }, -)~~~"); - } - generator.append(R"~~~(} }; + generator.set("type", type); + generator.set("name", name); + generator.set("size", String::number(display_names.size())); + + generator.append(R"~~~( +static constexpr Array<@type@, @size@> @name@ { { + )~~~"); + for (auto const& display_name : display_names) { + if (values_in_current_row++ > 0) + generator.append(", "); + + generator.set("first", String::formatted("{:#x}", display_name.code_point_range.first)); + generator.set("last", String::formatted("{:#x}", display_name.code_point_range.last)); + generator.set("name", String::number(display_name.name)); + generator.append("{ { @first@, @last@ }, @name@ }"); + + if (values_in_current_row == max_values_per_row) { + values_in_current_row = 0; + generator.append(",\n "); + } + } + generator.append(R"~~~( +} }; )~~~"); + }; + + append_code_point_display_names("BlockNameData"sv, "s_block_display_names"sv, unicode_data.block_display_names); + append_code_point_display_names("CodePointName"sv, "s_code_point_display_names"sv, unicode_data.code_point_display_names); generator.append(R"~~~( Optional code_point_block_display_name(u32 code_point) { if (auto const* entry = binary_search(s_block_display_names, code_point, nullptr, BlockNameComparator {})) - return entry->display_name; + return decode_string(entry->display_name); return {}; } Span block_display_names() { - return s_block_display_names; + static auto 
display_names = []() { + Array display_names; + + for (size_t i = 0; i < s_block_display_names.size(); ++i) { + auto const& display_name = s_block_display_names[i]; + display_names[i] = { display_name.code_point_range, decode_string(display_name.display_name) }; + } + + return display_names; + }(); + + return display_names.span(); } -)~~~"); - generator.append(R"~~~( -struct CodePointName { - CodePointRange code_point_range {}; - StringView display_name; -}; - -struct CodePointNameComparator : public CodePointRangeComparator { - constexpr int operator()(u32 code_point, CodePointName const& name) - { - return CodePointRangeComparator::operator()(code_point, name.code_point_range); - } -}; -)~~~"); - - generator.set("code_point_display_names_size", String::number(unicode_data.code_point_display_names.size())); - generator.append(R"~~~( -static constexpr Array s_code_point_display_names { { -)~~~"); - for (auto const& code_point_name : unicode_data.code_point_display_names) { - generator.set("first", String::formatted("{:#x}", code_point_name.code_point_range.first)); - generator.set("last", String::formatted("{:#x}", code_point_name.code_point_range.last)); - generator.set("name", code_point_name.name); - generator.append(R"~~~( { { @first@, @last@ }, "@name@"sv }, -)~~~"); - } - generator.append(R"~~~(} }; -)~~~"); - - generator.append(R"~~~( Optional code_point_display_name(u32 code_point) { if (auto const* entry = binary_search(s_code_point_display_names, code_point, nullptr, CodePointNameComparator {})) { - if (entry->display_name.ends_with("{:X}"sv)) - return String::formatted(entry->display_name, code_point); + auto display_name = decode_string(entry->display_name); - return entry->display_name; + if (display_name.ends_with("{:X}"sv)) + return String::formatted(display_name, code_point); + + return display_name; } return {}; @@ -1042,8 +1073,10 @@ Optional code_point_abbreviation(u32 code_point) auto const* mapping = binary_search(s_abbreviation_mappings, 
code_point, nullptr, CodePointComparator {}); if (mapping == nullptr) return {}; + if (mapping->abbreviation == 0) + return {}; - return mapping->abbreviation; + return decode_string(mapping->abbreviation); } )~~~");