LibUnicode: Generate and use a set of unique locale-related strings
In the generated UnicodeLocale.cpp file, there are 296,408 strings for localizations of languages, territories, scripts, currencies & keywords. Of these, only 43,848 (14.8%) are actually unique, so there are quite a large number of duplicated strings. This commit generates a single compile-time array to store the unique strings. The arrays for the localizations now store an index into this single array rather than duplicating any strings.
This commit is contained in:
parent
3f0095b57a
commit
f9e605397c
Notes:
sideshowbarker
2024-07-18 02:50:53 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/f9e605397c1 Pull-request: https://github.com/SerenityOS/serenity/pull/10423
1 changed file with 76 additions and 21 deletions
|
@ -34,11 +34,11 @@ struct Locale {
|
||||||
String language;
|
String language;
|
||||||
Optional<String> territory;
|
Optional<String> territory;
|
||||||
Optional<String> variant;
|
Optional<String> variant;
|
||||||
HashMap<String, String> languages;
|
HashMap<String, size_t> languages;
|
||||||
HashMap<String, String> territories;
|
HashMap<String, size_t> territories;
|
||||||
HashMap<String, String> scripts;
|
HashMap<String, size_t> scripts;
|
||||||
HashMap<String, String> currencies;
|
HashMap<String, size_t> currencies;
|
||||||
HashMap<String, String> keywords;
|
HashMap<String, size_t> keywords;
|
||||||
Vector<ListPatterns> list_patterns;
|
Vector<ListPatterns> list_patterns;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -55,6 +55,8 @@ struct LanguageMapping {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct UnicodeLocaleData {
|
struct UnicodeLocaleData {
|
||||||
|
Vector<String> unique_strings;
|
||||||
|
HashMap<StringView, size_t> unique_string_indices;
|
||||||
HashMap<String, Locale> locales;
|
HashMap<String, Locale> locales;
|
||||||
Vector<String> languages;
|
Vector<String> languages;
|
||||||
Vector<String> territories;
|
Vector<String> territories;
|
||||||
|
@ -74,6 +76,25 @@ struct UnicodeLocaleData {
|
||||||
size_t max_variant_size { 0 };
|
size_t max_variant_size { 0 };
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Interns `string` into the set of unique strings held by `locale_data` and returns its index.
//
// The unique strings live in two structures: `unique_strings`, a vector which owns each string
// and fixes the order in which the strings are generated, and `unique_string_indices`, a hash
// map from a string to its index, which avoids a linear search of the vector for every lookup.
//
// Index 0 is reserved for the empty string, so the value returned here is the string's real
// position in the vector + 1.
static size_t ensure_unique_string(UnicodeLocaleData& locale_data, String string)
{
    auto known_index = locale_data.unique_string_indices.get(string);
    if (known_index.has_value())
        return *known_index;

    // Not seen before: hand ownership to the vector first, then key the map off of the stored
    // copy. Taking the size after appending yields the 1-based index directly.
    locale_data.unique_strings.append(move(string));
    size_t one_based_index = locale_data.unique_strings.size();

    locale_data.unique_string_indices.set(locale_data.unique_strings.last(), one_based_index);
    return one_based_index;
}
|
||||||
|
|
||||||
static Optional<CanonicalLanguageID> parse_language(StringView language)
|
static Optional<CanonicalLanguageID> parse_language(StringView language)
|
||||||
{
|
{
|
||||||
CanonicalLanguageID language_id {};
|
CanonicalLanguageID language_id {};
|
||||||
|
@ -248,7 +269,8 @@ static void parse_locale_languages(String locale_path, UnicodeLocaleData& locale
|
||||||
if (!locale_data.languages.contains_slow(key))
|
if (!locale_data.languages.contains_slow(key))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
locale.languages.set(key, value.as_string());
|
size_t index = ensure_unique_string(locale_data, value.as_string());
|
||||||
|
locale.languages.set(key, index);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -273,7 +295,8 @@ static void parse_locale_territories(String locale_path, UnicodeLocaleData& loca
|
||||||
if (!locale_data.territories.contains_slow(key))
|
if (!locale_data.territories.contains_slow(key))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
locale.territories.set(key, value.as_string());
|
size_t index = ensure_unique_string(locale_data, value.as_string());
|
||||||
|
locale.territories.set(key, index);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -295,7 +318,9 @@ static void parse_locale_scripts(String locale_path, UnicodeLocaleData& locale_d
|
||||||
auto const& scripts_object = locale_display_names_object.as_object().get("scripts"sv);
|
auto const& scripts_object = locale_display_names_object.as_object().get("scripts"sv);
|
||||||
|
|
||||||
scripts_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
|
scripts_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
|
||||||
locale.scripts.set(key, value.as_string());
|
size_t index = ensure_unique_string(locale_data, value.as_string());
|
||||||
|
locale.scripts.set(key, index);
|
||||||
|
|
||||||
if (!locale_data.scripts.contains_slow(key))
|
if (!locale_data.scripts.contains_slow(key))
|
||||||
locale_data.scripts.append(key);
|
locale_data.scripts.append(key);
|
||||||
});
|
});
|
||||||
|
@ -372,7 +397,10 @@ static void parse_locale_currencies(String numbers_path, UnicodeLocaleData& loca
|
||||||
|
|
||||||
currencies_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
|
currencies_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
|
||||||
auto const& display_name = value.as_object().get("displayName"sv);
|
auto const& display_name = value.as_object().get("displayName"sv);
|
||||||
locale.currencies.set(key, display_name.as_string());
|
|
||||||
|
size_t index = ensure_unique_string(locale_data, display_name.as_string());
|
||||||
|
locale.currencies.set(key, index);
|
||||||
|
|
||||||
if (!locale_data.currencies.contains_slow(key))
|
if (!locale_data.currencies.contains_slow(key))
|
||||||
locale_data.currencies.append(key);
|
locale_data.currencies.append(key);
|
||||||
});
|
});
|
||||||
|
@ -409,7 +437,9 @@ static void parse_numeric_keywords(String locale_numbers_path, UnicodeLocaleData
|
||||||
|
|
||||||
StringBuilder builder;
|
StringBuilder builder;
|
||||||
builder.join(',', keyword_values);
|
builder.join(',', keyword_values);
|
||||||
locale.keywords.set(key, builder.build());
|
|
||||||
|
auto index = ensure_unique_string(locale_data, builder.build());
|
||||||
|
locale.keywords.set(key, index);
|
||||||
|
|
||||||
if (!locale_data.keywords.contains_slow(key))
|
if (!locale_data.keywords.contains_slow(key))
|
||||||
locale_data.keywords.append(key);
|
locale_data.keywords.append(key);
|
||||||
|
@ -620,6 +650,7 @@ static void generate_unicode_locale_implementation(Core::File& file, UnicodeLoca
|
||||||
{
|
{
|
||||||
StringBuilder builder;
|
StringBuilder builder;
|
||||||
SourceGenerator generator { builder };
|
SourceGenerator generator { builder };
|
||||||
|
generator.set("strings_size"sv, String::number(locale_data.unique_strings.size()));
|
||||||
generator.set("locales_size"sv, String::number(locale_data.locales.size()));
|
generator.set("locales_size"sv, String::number(locale_data.locales.size()));
|
||||||
generator.set("territories_size", String::number(locale_data.territories.size()));
|
generator.set("territories_size", String::number(locale_data.territories.size()));
|
||||||
generator.set("variants_size", String::number(locale_data.max_variant_size));
|
generator.set("variants_size", String::number(locale_data.max_variant_size));
|
||||||
|
@ -641,6 +672,29 @@ struct Patterns {
|
||||||
StringView end;
|
StringView end;
|
||||||
StringView pair;
|
StringView pair;
|
||||||
};
|
};
|
||||||
|
)~~~");
|
||||||
|
|
||||||
|
generator.append(R"~~~(
|
||||||
|
static constexpr Array<StringView, @strings_size@ + 1> s_string_list { {
|
||||||
|
{})~~~");
|
||||||
|
|
||||||
|
constexpr size_t max_strings_per_row = 30;
|
||||||
|
size_t strings_in_current_row = 1;
|
||||||
|
|
||||||
|
for (auto const& string : locale_data.unique_strings) {
|
||||||
|
if (strings_in_current_row++ > 0)
|
||||||
|
generator.append(", ");
|
||||||
|
|
||||||
|
generator.append(String::formatted("\"{}\"sv", string));
|
||||||
|
|
||||||
|
if (strings_in_current_row == max_strings_per_row) {
|
||||||
|
strings_in_current_row = 0;
|
||||||
|
generator.append(",\n ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
generator.append(R"~~~(
|
||||||
|
} };
|
||||||
)~~~");
|
)~~~");
|
||||||
|
|
||||||
auto format_mapping_name = [](StringView format, StringView name) {
|
auto format_mapping_name = [](StringView format, StringView name) {
|
||||||
|
@ -671,15 +725,15 @@ struct Patterns {
|
||||||
generator.append(String::formatted(" }}, {}", list.size()));
|
generator.append(String::formatted(" }}, {}", list.size()));
|
||||||
};
|
};
|
||||||
|
|
||||||
auto append_string_list = [&](String name, auto const& keys, auto const& mappings) {
|
auto append_string_index_list = [&](String name, auto const& keys, auto const& mappings) {
|
||||||
generator.set("name", name);
|
generator.set("name", name);
|
||||||
generator.set("size", String::number(keys.size()));
|
generator.set("size", String::number(keys.size()));
|
||||||
|
|
||||||
generator.append(R"~~~(
|
generator.append(R"~~~(
|
||||||
static constexpr Array<StringView, @size@> @name@ { {
|
static constexpr Array<size_t, @size@> @name@ { {
|
||||||
)~~~");
|
)~~~");
|
||||||
|
|
||||||
constexpr size_t max_values_per_row = 10;
|
constexpr size_t max_values_per_row = 30;
|
||||||
size_t values_in_current_row = 0;
|
size_t values_in_current_row = 0;
|
||||||
|
|
||||||
for (auto const& key : keys) {
|
for (auto const& key : keys) {
|
||||||
|
@ -687,9 +741,9 @@ static constexpr Array<StringView, @size@> @name@ { {
|
||||||
generator.append(" ");
|
generator.append(" ");
|
||||||
|
|
||||||
if (auto it = mappings.find(key); it != mappings.end())
|
if (auto it = mappings.find(key); it != mappings.end())
|
||||||
generator.set("mapping"sv, String::formatted("\"{}\"sv", it->value));
|
generator.set("mapping"sv, String::number(it->value));
|
||||||
else
|
else
|
||||||
generator.set("mapping"sv, "{}"sv);
|
generator.set("mapping"sv, "0"sv);
|
||||||
generator.append("@mapping@,");
|
generator.append("@mapping@,");
|
||||||
|
|
||||||
if (values_in_current_row == max_values_per_row) {
|
if (values_in_current_row == max_values_per_row) {
|
||||||
|
@ -766,11 +820,11 @@ static constexpr Array<Span<@type@ const>, @size@> @name@ { {
|
||||||
)~~~");
|
)~~~");
|
||||||
};
|
};
|
||||||
|
|
||||||
append_mapping("StringView"sv, "s_languages"sv, "s_languages_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.languages, value.languages); });
|
append_mapping("size_t"sv, "s_languages"sv, "s_languages_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.languages, value.languages); });
|
||||||
append_mapping("StringView"sv, "s_territories"sv, "s_territories_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.territories, value.territories); });
|
append_mapping("size_t"sv, "s_territories"sv, "s_territories_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.territories, value.territories); });
|
||||||
append_mapping("StringView"sv, "s_scripts"sv, "s_scripts_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.scripts, value.scripts); });
|
append_mapping("size_t"sv, "s_scripts"sv, "s_scripts_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.scripts, value.scripts); });
|
||||||
append_mapping("StringView"sv, "s_currencies"sv, "s_currencies_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.currencies, value.currencies); });
|
append_mapping("size_t"sv, "s_currencies"sv, "s_currencies_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.currencies, value.currencies); });
|
||||||
append_mapping("StringView"sv, "s_keywords"sv, "s_keywords_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.keywords, value.keywords); });
|
append_mapping("size_t"sv, "s_keywords"sv, "s_keywords_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.keywords, value.keywords); });
|
||||||
append_mapping("Patterns"sv, "s_list_patterns"sv, "s_list_patterns_{}", [&](auto const& name, auto const& value) { append_list_patterns(name, value.list_patterns); });
|
append_mapping("Patterns"sv, "s_list_patterns"sv, "s_list_patterns_{}", [&](auto const& name, auto const& value) { append_list_patterns(name, value.list_patterns); });
|
||||||
|
|
||||||
generator.append(R"~~~(
|
generator.append(R"~~~(
|
||||||
|
@ -965,7 +1019,8 @@ Optional<StringView> get_locale_@enum_snake@_mapping(StringView locale, StringVi
|
||||||
auto @enum_snake@_index = to_underlying(*@enum_snake@_value);
|
auto @enum_snake@_index = to_underlying(*@enum_snake@_value);
|
||||||
|
|
||||||
auto const& mappings = @collection_name@.at(locale_index);
|
auto const& mappings = @collection_name@.at(locale_index);
|
||||||
auto @enum_snake@_mapping = mappings.at(@enum_snake@_index);
|
auto @enum_snake@_string_index = mappings.at(@enum_snake@_index);
|
||||||
|
auto @enum_snake@_mapping = s_string_list.at(@enum_snake@_string_index);
|
||||||
|
|
||||||
if (@enum_snake@_mapping.is_empty())
|
if (@enum_snake@_mapping.is_empty())
|
||||||
return {};
|
return {};
|
||||||
|
|
Loading…
Add table
Reference in a new issue