From f9e605397c13730bd1382b17069790e398bf428a Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sun, 10 Oct 2021 11:41:13 -0400 Subject: [PATCH] LibUnicode: Generate and use a set of unique locale-related strings In the generated UnicodeLocale.cpp file, there are 296,408 strings for localizations of languages, territories, scripts, currencies & keywords. Of these, only 43,848 (14.8%) are actually unique, so there are quite a large number of duplicated strings. This generates a single compile-time array to store these strings. The arrays for the localizations now store an index into this single array rather than duplicating any strings. --- .../LibUnicode/GenerateUnicodeLocale.cpp | 97 +++++++++++++++---- 1 file changed, 76 insertions(+), 21 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp index ee4bbb8d869..e2854b4a108 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp @@ -34,11 +34,11 @@ struct Locale { String language; Optional territory; Optional variant; - HashMap languages; - HashMap territories; - HashMap scripts; - HashMap currencies; - HashMap keywords; + HashMap languages; + HashMap territories; + HashMap scripts; + HashMap currencies; + HashMap keywords; Vector list_patterns; }; @@ -55,6 +55,8 @@ struct LanguageMapping { }; struct UnicodeLocaleData { + Vector unique_strings; + HashMap unique_string_indices; HashMap locales; Vector languages; Vector territories; @@ -74,6 +76,25 @@ struct UnicodeLocaleData { size_t max_variant_size { 0 }; }; +static size_t ensure_unique_string(UnicodeLocaleData& locale_data, String string) +{ + // We maintain the set of unique strings in two structures: a vector which owns each unique string, + // and a hash map which maps that string to its 1-based position in the vector. 
The vector is to ensure the + // strings are generated in an easily known order, and the map is to allow quickly deciding if a + // string has already been stored (otherwise, we'd have to linear-search the vector for each string). + // + // Also note that index 0 is reserved for the empty string, so the index stored in the map and + // returned from this function is one more than the string's real index in the vector. + if (auto index = locale_data.unique_string_indices.get(string); index.has_value()) + return *index; + + locale_data.unique_strings.append(move(string)); + size_t index = locale_data.unique_strings.size(); + + locale_data.unique_string_indices.set(locale_data.unique_strings.last(), index); + return index; +} + static Optional parse_language(StringView language) { CanonicalLanguageID language_id {}; @@ -248,7 +269,8 @@ static void parse_locale_languages(String locale_path, UnicodeLocaleData& locale if (!locale_data.languages.contains_slow(key)) return; - locale.languages.set(key, value.as_string()); + size_t index = ensure_unique_string(locale_data, value.as_string()); + locale.languages.set(key, index); }); } @@ -273,7 +295,8 @@ static void parse_locale_territories(String locale_path, UnicodeLocaleData& loca if (!locale_data.territories.contains_slow(key)) return; - locale.territories.set(key, value.as_string()); + size_t index = ensure_unique_string(locale_data, value.as_string()); + locale.territories.set(key, index); }); } @@ -295,7 +318,9 @@ static void parse_locale_scripts(String locale_path, UnicodeLocaleData& locale_d auto const& scripts_object = locale_display_names_object.as_object().get("scripts"sv); scripts_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) { - locale.scripts.set(key, value.as_string()); + size_t index = ensure_unique_string(locale_data, value.as_string()); + locale.scripts.set(key, index); + if (!locale_data.scripts.contains_slow(key)) locale_data.scripts.append(key); }); @@ -372,7 +397,10 @@ static void 
parse_locale_currencies(String numbers_path, UnicodeLocaleData& loca currencies_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) { auto const& display_name = value.as_object().get("displayName"sv); - locale.currencies.set(key, display_name.as_string()); + + size_t index = ensure_unique_string(locale_data, display_name.as_string()); + locale.currencies.set(key, index); + if (!locale_data.currencies.contains_slow(key)) locale_data.currencies.append(key); }); @@ -409,7 +437,9 @@ static void parse_numeric_keywords(String locale_numbers_path, UnicodeLocaleData StringBuilder builder; builder.join(',', keyword_values); - locale.keywords.set(key, builder.build()); + + auto index = ensure_unique_string(locale_data, builder.build()); + locale.keywords.set(key, index); if (!locale_data.keywords.contains_slow(key)) locale_data.keywords.append(key); @@ -620,6 +650,7 @@ static void generate_unicode_locale_implementation(Core::File& file, UnicodeLoca { StringBuilder builder; SourceGenerator generator { builder }; + generator.set("strings_size"sv, String::number(locale_data.unique_strings.size())); generator.set("locales_size"sv, String::number(locale_data.locales.size())); generator.set("territories_size", String::number(locale_data.territories.size())); generator.set("variants_size", String::number(locale_data.max_variant_size)); @@ -641,6 +672,29 @@ struct Patterns { StringView end; StringView pair; }; +)~~~"); + + generator.append(R"~~~( +static constexpr Array s_string_list { { + {})~~~"); + + constexpr size_t max_strings_per_row = 30; + size_t strings_in_current_row = 1; + + for (auto const& string : locale_data.unique_strings) { + if (strings_in_current_row++ > 0) + generator.append(", "); + + generator.append(String::formatted("\"{}\"sv", string)); + + if (strings_in_current_row == max_strings_per_row) { + strings_in_current_row = 0; + generator.append(",\n "); + } + } + + generator.append(R"~~~( +} }; )~~~"); auto format_mapping_name = 
[](StringView format, StringView name) { @@ -671,15 +725,15 @@ struct Patterns { generator.append(String::formatted(" }}, {}", list.size())); }; - auto append_string_list = [&](String name, auto const& keys, auto const& mappings) { + auto append_string_index_list = [&](String name, auto const& keys, auto const& mappings) { generator.set("name", name); generator.set("size", String::number(keys.size())); generator.append(R"~~~( -static constexpr Array @name@ { { +static constexpr Array @name@ { { )~~~"); - constexpr size_t max_values_per_row = 10; + constexpr size_t max_values_per_row = 30; size_t values_in_current_row = 0; for (auto const& key : keys) { @@ -687,9 +741,9 @@ static constexpr Array @name@ { { generator.append(" "); if (auto it = mappings.find(key); it != mappings.end()) - generator.set("mapping"sv, String::formatted("\"{}\"sv", it->value)); + generator.set("mapping"sv, String::number(it->value)); else - generator.set("mapping"sv, "{}"sv); + generator.set("mapping"sv, "0"sv); generator.append("@mapping@,"); if (values_in_current_row == max_values_per_row) { @@ -766,11 +820,11 @@ static constexpr Array, @size@> @name@ { { )~~~"); }; - append_mapping("StringView"sv, "s_languages"sv, "s_languages_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.languages, value.languages); }); - append_mapping("StringView"sv, "s_territories"sv, "s_territories_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.territories, value.territories); }); - append_mapping("StringView"sv, "s_scripts"sv, "s_scripts_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.scripts, value.scripts); }); - append_mapping("StringView"sv, "s_currencies"sv, "s_currencies_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.currencies, value.currencies); }); - append_mapping("StringView"sv, "s_keywords"sv, "s_keywords_{}", [&](auto const& name, auto const& value) { 
append_string_list(name, locale_data.keywords, value.keywords); }); + append_mapping("size_t"sv, "s_languages"sv, "s_languages_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.languages, value.languages); }); + append_mapping("size_t"sv, "s_territories"sv, "s_territories_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.territories, value.territories); }); + append_mapping("size_t"sv, "s_scripts"sv, "s_scripts_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.scripts, value.scripts); }); + append_mapping("size_t"sv, "s_currencies"sv, "s_currencies_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.currencies, value.currencies); }); + append_mapping("size_t"sv, "s_keywords"sv, "s_keywords_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.keywords, value.keywords); }); append_mapping("Patterns"sv, "s_list_patterns"sv, "s_list_patterns_{}", [&](auto const& name, auto const& value) { append_list_patterns(name, value.list_patterns); }); generator.append(R"~~~( @@ -965,7 +1019,8 @@ Optional get_locale_@enum_snake@_mapping(StringView locale, StringVi auto @enum_snake@_index = to_underlying(*@enum_snake@_value); auto const& mappings = @collection_name@.at(locale_index); - auto @enum_snake@_mapping = mappings.at(@enum_snake@_index); + auto @enum_snake@_string_index = mappings.at(@enum_snake@_index); + auto @enum_snake@_mapping = s_string_list.at(@enum_snake@_string_index); if (@enum_snake@_mapping.is_empty()) return {};