mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-22 15:40:19 +00:00
LibLocale: Replace some slow CLDR vector lookups with a hash map
The LocaleData generator currently stores vectors of unique instances of CLDR data (e.g. languages, currencies, etc.). For each CLDR file that we parse, we linearly search through those vectors to decide if the current field being parsed is unique. Given the size of the CLDR, this adds up to quite a bit of time. Augment these vectors with a hash map to store the index of each unique instance in those vectors. This allows for quickly checking if a field is unique, and to later look up those indices. We do not apply this technique to every bit of CLDR data here. For example, CLDR::character_orders contains only 2 entries. In that case, it is quicker to search the vector than it is to hash a string key. This reduces the runtime of GenerateLocaleData from to 2.03s to 1.09s.
This commit is contained in:
parent
e86769c068
commit
bbd80d2e4d
Notes:
sideshowbarker
2024-07-17 03:19:14 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/bbd80d2e4d Pull-request: https://github.com/SerenityOS/serenity/pull/20239 Reviewed-by: https://github.com/gmta ✅
1 changed files with 52 additions and 17 deletions
|
@ -199,11 +199,23 @@ struct CLDR {
|
|||
Vector<Alias> locale_aliases;
|
||||
|
||||
Vector<DeprecatedString> languages;
|
||||
HashMap<StringView, size_t> language_indices;
|
||||
|
||||
Vector<DeprecatedString> territories;
|
||||
HashMap<StringView, size_t> territory_indices;
|
||||
|
||||
Vector<DeprecatedString> scripts;
|
||||
HashMap<StringView, size_t> script_indices;
|
||||
|
||||
Vector<DeprecatedString> variants;
|
||||
HashMap<StringView, size_t> variant_indices;
|
||||
|
||||
Vector<DeprecatedString> currencies;
|
||||
HashMap<StringView, size_t> currency_indices;
|
||||
|
||||
Vector<DeprecatedString> date_fields;
|
||||
HashMap<StringView, size_t> date_fields_indices;
|
||||
|
||||
Vector<Alias> date_field_aliases {
|
||||
// ECMA-402 and the CLDR refer to some date fields with different names. Defining these aliases
|
||||
// means we can remain agnostic about the naming differences elsewhere.
|
||||
|
@ -332,20 +344,29 @@ static ErrorOr<void> parse_identity(DeprecatedString locale_path, CLDR& cldr, Lo
|
|||
|
||||
if (territory_string.has_value()) {
|
||||
locale.territory = territory_string.value();
|
||||
if (!cldr.territories.contains_slow(*locale.territory))
|
||||
|
||||
if (!cldr.territory_indices.contains(*locale.territory)) {
|
||||
cldr.territory_indices.set(*locale.territory, 0);
|
||||
cldr.territories.append(*locale.territory);
|
||||
}
|
||||
}
|
||||
|
||||
if (script_string.has_value()) {
|
||||
auto script = script_string.value();
|
||||
if (!cldr.scripts.contains_slow(script))
|
||||
auto const& script = script_string.value();
|
||||
|
||||
if (!cldr.script_indices.contains(script)) {
|
||||
cldr.script_indices.set(script, 0);
|
||||
cldr.scripts.append(script);
|
||||
}
|
||||
}
|
||||
|
||||
if (variant_string.has_value()) {
|
||||
locale.variant = variant_string.value();
|
||||
if (!cldr.variants.contains_slow(*locale.variant))
|
||||
|
||||
if (!cldr.variant_indices.contains(*locale.variant)) {
|
||||
cldr.variant_indices.set(*locale.variant, 0);
|
||||
cldr.variants.append(*locale.variant);
|
||||
}
|
||||
}
|
||||
|
||||
return {};
|
||||
|
@ -387,8 +408,10 @@ static ErrorOr<void> preprocess_languages(DeprecatedString locale_path, CLDR& cl
|
|||
auto const& languages_object = locale_display_names_object.get_object("languages"sv).value();
|
||||
|
||||
languages_object.for_each_member([&](auto const& key, auto const&) {
|
||||
if (!key.contains("-alt-"sv) && !cldr.languages.contains_slow(key))
|
||||
if (!key.contains("-alt-"sv) && !cldr.language_indices.contains(key)) {
|
||||
cldr.language_indices.set(key, 0);
|
||||
cldr.languages.append(key);
|
||||
}
|
||||
});
|
||||
|
||||
return {};
|
||||
|
@ -406,8 +429,10 @@ static ErrorOr<void> preprocess_currencies(DeprecatedString numbers_path, CLDR&
|
|||
auto const& currencies_object = locale_numbers_object.get_object("currencies"sv).value();
|
||||
|
||||
currencies_object.for_each_member([&](auto const& key, JsonValue const&) {
|
||||
if (!cldr.currencies.contains_slow(key))
|
||||
if (!cldr.currency_indices.contains(key)) {
|
||||
cldr.currency_indices.set(key, 0);
|
||||
cldr.currencies.append(key);
|
||||
}
|
||||
});
|
||||
|
||||
return {};
|
||||
|
@ -435,8 +460,10 @@ static ErrorOr<void> preprocess_date_fields(DeprecatedString dates_path, CLDR& c
|
|||
if (!is_sanctioned_date_field(key))
|
||||
return;
|
||||
|
||||
if (!cldr.date_fields.contains_slow(key))
|
||||
if (!cldr.date_fields_indices.contains(key)) {
|
||||
cldr.date_fields_indices.set(key, 0);
|
||||
cldr.date_fields.append(key);
|
||||
}
|
||||
});
|
||||
|
||||
return {};
|
||||
|
@ -532,7 +559,7 @@ static ErrorOr<void> parse_locale_languages(DeprecatedString locale_path, CLDR&
|
|||
if (key.contains("-alt-"sv))
|
||||
return;
|
||||
|
||||
auto index = cldr.languages.find_first_index(key).value();
|
||||
auto index = cldr.language_indices.get(key).value();
|
||||
languages[index] = cldr.unique_strings.ensure(value.as_string());
|
||||
});
|
||||
|
||||
|
@ -563,7 +590,7 @@ static ErrorOr<void> parse_locale_territories(DeprecatedString locale_path, CLDR
|
|||
auto const& territories_object = locale_display_names_object.get_object("territories"sv).value();
|
||||
|
||||
territories_object.for_each_member([&](auto const& key, JsonValue const& value) {
|
||||
if (auto index = cldr.territories.find_first_index(key); index.has_value())
|
||||
if (auto index = cldr.territory_indices.get(key); index.has_value())
|
||||
territories[*index] = cldr.unique_strings.ensure(value.as_string());
|
||||
});
|
||||
|
||||
|
@ -594,7 +621,7 @@ static ErrorOr<void> parse_locale_scripts(DeprecatedString locale_path, CLDR& cl
|
|||
auto const& scripts_object = locale_display_names_object.get_object("scripts"sv).value();
|
||||
|
||||
scripts_object.for_each_member([&](auto const& key, JsonValue const& value) {
|
||||
if (auto index = cldr.scripts.find_first_index(key); index.has_value())
|
||||
if (auto index = cldr.script_indices.get(key); index.has_value())
|
||||
scripts[*index] = cldr.unique_strings.ensure(value.as_string());
|
||||
});
|
||||
|
||||
|
@ -713,7 +740,7 @@ static ErrorOr<void> parse_locale_currencies(DeprecatedString numbers_path, CLDR
|
|||
auto narrow_name = value.as_object().get_deprecated_string("symbol-alt-narrow"sv);
|
||||
auto numeric_name = value.as_object().get_deprecated_string("displayName-count-other"sv);
|
||||
|
||||
auto index = cldr.currencies.find_first_index(key).value();
|
||||
auto index = cldr.currency_indices.get(key).value();
|
||||
long_currencies[index] = cldr.unique_strings.ensure(move(long_name));
|
||||
short_currencies[index] = cldr.unique_strings.ensure(move(short_name));
|
||||
narrow_currencies[index] = narrow_name.has_value() ? cldr.unique_strings.ensure(narrow_name.release_value()) : 0;
|
||||
|
@ -790,7 +817,7 @@ static ErrorOr<void> parse_locale_date_fields(DeprecatedString dates_path, CLDR&
|
|||
auto const& short_name = fields_object.get_object(DeprecatedString::formatted("{}-short", key))->get_deprecated_string("displayName"sv).value();
|
||||
auto const& narrow_name = fields_object.get_object(DeprecatedString::formatted("{}-narrow", key))->get_deprecated_string("displayName"sv).value();
|
||||
|
||||
auto index = cldr.date_fields.find_first_index(key).value();
|
||||
auto index = cldr.date_fields_indices.get(key).value();
|
||||
long_date_fields[index] = cldr.unique_strings.ensure(long_name);
|
||||
short_date_fields[index] = cldr.unique_strings.ensure(short_name);
|
||||
narrow_date_fields[index] = cldr.unique_strings.ensure(narrow_name);
|
||||
|
@ -1030,11 +1057,19 @@ static ErrorOr<void> parse_all_locales(DeprecatedString bcp47_path, DeprecatedSt
|
|||
return IterationDecision::Continue;
|
||||
}));
|
||||
|
||||
quick_sort(cldr.languages);
|
||||
quick_sort(cldr.territories);
|
||||
quick_sort(cldr.scripts);
|
||||
quick_sort(cldr.currencies);
|
||||
quick_sort(cldr.date_fields);
|
||||
auto update_indices = [](auto& keys, auto& indices) {
|
||||
quick_sort(keys);
|
||||
|
||||
for (size_t i = 0; i < keys.size(); ++i)
|
||||
indices.set(keys[i], i);
|
||||
};
|
||||
|
||||
update_indices(cldr.languages, cldr.language_indices);
|
||||
update_indices(cldr.territories, cldr.territory_indices);
|
||||
update_indices(cldr.scripts, cldr.script_indices);
|
||||
update_indices(cldr.variants, cldr.variant_indices);
|
||||
update_indices(cldr.currencies, cldr.currency_indices);
|
||||
update_indices(cldr.date_fields, cldr.date_fields_indices);
|
||||
|
||||
TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/bcp47", bcp47_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr<IterationDecision> {
|
||||
auto bcp47_path = LexicalPath::join(directory.path().string(), entry.name).string();
|
||||
|
|
Loading…
Reference in a new issue