LibLocale: Replace some slow CLDR vector lookups with a hash map

The LocaleData generator currently stores vectors of unique instances of
CLDR data (e.g. languages, currencies, etc.). For each CLDR file that we
parse, we linearly search through those vectors to decide if the current
field being parsed is unique. Given the size of the CLDR, this adds up
to quite a bit of time.

Augment these vectors with a hash map to store the index of each unique
instance in those vectors. This allows for quickly checking if a field
is unique, and to later look up those indices.

We do not apply this technique to every bit of CLDR data here. For
example, CLDR::character_orders contains only 2 entries. In that case,
it is quicker to search the vector than it is to hash a string key.

This reduces the runtime of GenerateLocaleData from to 2.03s to 1.09s.
This commit is contained in:
Timothy Flynn 2023-07-28 10:59:05 -04:00 committed by Jelle Raaijmakers
parent e86769c068
commit bbd80d2e4d
Notes: sideshowbarker 2024-07-17 03:19:14 +09:00

View file

@ -199,11 +199,23 @@ struct CLDR {
Vector<Alias> locale_aliases;
Vector<DeprecatedString> languages;
HashMap<StringView, size_t> language_indices;
Vector<DeprecatedString> territories;
HashMap<StringView, size_t> territory_indices;
Vector<DeprecatedString> scripts;
HashMap<StringView, size_t> script_indices;
Vector<DeprecatedString> variants;
HashMap<StringView, size_t> variant_indices;
Vector<DeprecatedString> currencies;
HashMap<StringView, size_t> currency_indices;
Vector<DeprecatedString> date_fields;
HashMap<StringView, size_t> date_fields_indices;
Vector<Alias> date_field_aliases {
// ECMA-402 and the CLDR refer to some date fields with different names. Defining these aliases
// means we can remain agnostic about the naming differences elsewhere.
@ -332,20 +344,29 @@ static ErrorOr<void> parse_identity(DeprecatedString locale_path, CLDR& cldr, Lo
if (territory_string.has_value()) {
locale.territory = territory_string.value();
if (!cldr.territories.contains_slow(*locale.territory))
if (!cldr.territory_indices.contains(*locale.territory)) {
cldr.territory_indices.set(*locale.territory, 0);
cldr.territories.append(*locale.territory);
}
}
if (script_string.has_value()) {
auto script = script_string.value();
if (!cldr.scripts.contains_slow(script))
auto const& script = script_string.value();
if (!cldr.script_indices.contains(script)) {
cldr.script_indices.set(script, 0);
cldr.scripts.append(script);
}
}
if (variant_string.has_value()) {
locale.variant = variant_string.value();
if (!cldr.variants.contains_slow(*locale.variant))
if (!cldr.variant_indices.contains(*locale.variant)) {
cldr.variant_indices.set(*locale.variant, 0);
cldr.variants.append(*locale.variant);
}
}
return {};
@ -387,8 +408,10 @@ static ErrorOr<void> preprocess_languages(DeprecatedString locale_path, CLDR& cl
auto const& languages_object = locale_display_names_object.get_object("languages"sv).value();
languages_object.for_each_member([&](auto const& key, auto const&) {
if (!key.contains("-alt-"sv) && !cldr.languages.contains_slow(key))
if (!key.contains("-alt-"sv) && !cldr.language_indices.contains(key)) {
cldr.language_indices.set(key, 0);
cldr.languages.append(key);
}
});
return {};
@ -406,8 +429,10 @@ static ErrorOr<void> preprocess_currencies(DeprecatedString numbers_path, CLDR&
auto const& currencies_object = locale_numbers_object.get_object("currencies"sv).value();
currencies_object.for_each_member([&](auto const& key, JsonValue const&) {
if (!cldr.currencies.contains_slow(key))
if (!cldr.currency_indices.contains(key)) {
cldr.currency_indices.set(key, 0);
cldr.currencies.append(key);
}
});
return {};
@ -435,8 +460,10 @@ static ErrorOr<void> preprocess_date_fields(DeprecatedString dates_path, CLDR& c
if (!is_sanctioned_date_field(key))
return;
if (!cldr.date_fields.contains_slow(key))
if (!cldr.date_fields_indices.contains(key)) {
cldr.date_fields_indices.set(key, 0);
cldr.date_fields.append(key);
}
});
return {};
@ -532,7 +559,7 @@ static ErrorOr<void> parse_locale_languages(DeprecatedString locale_path, CLDR&
if (key.contains("-alt-"sv))
return;
auto index = cldr.languages.find_first_index(key).value();
auto index = cldr.language_indices.get(key).value();
languages[index] = cldr.unique_strings.ensure(value.as_string());
});
@ -563,7 +590,7 @@ static ErrorOr<void> parse_locale_territories(DeprecatedString locale_path, CLDR
auto const& territories_object = locale_display_names_object.get_object("territories"sv).value();
territories_object.for_each_member([&](auto const& key, JsonValue const& value) {
if (auto index = cldr.territories.find_first_index(key); index.has_value())
if (auto index = cldr.territory_indices.get(key); index.has_value())
territories[*index] = cldr.unique_strings.ensure(value.as_string());
});
@ -594,7 +621,7 @@ static ErrorOr<void> parse_locale_scripts(DeprecatedString locale_path, CLDR& cl
auto const& scripts_object = locale_display_names_object.get_object("scripts"sv).value();
scripts_object.for_each_member([&](auto const& key, JsonValue const& value) {
if (auto index = cldr.scripts.find_first_index(key); index.has_value())
if (auto index = cldr.script_indices.get(key); index.has_value())
scripts[*index] = cldr.unique_strings.ensure(value.as_string());
});
@ -713,7 +740,7 @@ static ErrorOr<void> parse_locale_currencies(DeprecatedString numbers_path, CLDR
auto narrow_name = value.as_object().get_deprecated_string("symbol-alt-narrow"sv);
auto numeric_name = value.as_object().get_deprecated_string("displayName-count-other"sv);
auto index = cldr.currencies.find_first_index(key).value();
auto index = cldr.currency_indices.get(key).value();
long_currencies[index] = cldr.unique_strings.ensure(move(long_name));
short_currencies[index] = cldr.unique_strings.ensure(move(short_name));
narrow_currencies[index] = narrow_name.has_value() ? cldr.unique_strings.ensure(narrow_name.release_value()) : 0;
@ -790,7 +817,7 @@ static ErrorOr<void> parse_locale_date_fields(DeprecatedString dates_path, CLDR&
auto const& short_name = fields_object.get_object(DeprecatedString::formatted("{}-short", key))->get_deprecated_string("displayName"sv).value();
auto const& narrow_name = fields_object.get_object(DeprecatedString::formatted("{}-narrow", key))->get_deprecated_string("displayName"sv).value();
auto index = cldr.date_fields.find_first_index(key).value();
auto index = cldr.date_fields_indices.get(key).value();
long_date_fields[index] = cldr.unique_strings.ensure(long_name);
short_date_fields[index] = cldr.unique_strings.ensure(short_name);
narrow_date_fields[index] = cldr.unique_strings.ensure(narrow_name);
@ -1030,11 +1057,19 @@ static ErrorOr<void> parse_all_locales(DeprecatedString bcp47_path, DeprecatedSt
return IterationDecision::Continue;
}));
quick_sort(cldr.languages);
quick_sort(cldr.territories);
quick_sort(cldr.scripts);
quick_sort(cldr.currencies);
quick_sort(cldr.date_fields);
auto update_indices = [](auto& keys, auto& indices) {
quick_sort(keys);
for (size_t i = 0; i < keys.size(); ++i)
indices.set(keys[i], i);
};
update_indices(cldr.languages, cldr.language_indices);
update_indices(cldr.territories, cldr.territory_indices);
update_indices(cldr.scripts, cldr.script_indices);
update_indices(cldr.variants, cldr.variant_indices);
update_indices(cldr.currencies, cldr.currency_indices);
update_indices(cldr.date_fields, cldr.date_fields_indices);
TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/bcp47", bcp47_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr<IterationDecision> {
auto bcp47_path = LexicalPath::join(directory.path().string(), entry.name).string();