From bbd80d2e4d5086eb493d5edbc0df967094f52932 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Fri, 28 Jul 2023 10:59:05 -0400 Subject: [PATCH] LibLocale: Replace some slow CLDR vector lookups with a hash map The LocaleData generator currently stores vectors of unique instances of CLDR data (e.g. languages, currencies, etc.). For each CLDR file that we parse, we linearly search through those vectors to decide if the current field being parsed is unique. Given the size of the CLDR, this adds up to quite a bit of time. Augment these vectors with a hash map to store the index of each unique instance in those vectors. This allows for quickly checking if a field is unique, and to later look up those indices. We do not apply this technique to every bit of CLDR data here. For example, CLDR::character_orders contains only 2 entries. In that case, it is quicker to search the vector than it is to hash a string key. This reduces the runtime of GenerateLocaleData from to 2.03s to 1.09s. --- .../LibLocale/GenerateLocaleData.cpp | 69 ++++++++++++++----- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibLocale/GenerateLocaleData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibLocale/GenerateLocaleData.cpp index e30d4cba3cd..aa6f39964ec 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibLocale/GenerateLocaleData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibLocale/GenerateLocaleData.cpp @@ -199,11 +199,23 @@ struct CLDR { Vector locale_aliases; Vector languages; + HashMap language_indices; + Vector territories; + HashMap territory_indices; + Vector scripts; + HashMap script_indices; + Vector variants; + HashMap variant_indices; + Vector currencies; + HashMap currency_indices; + Vector date_fields; + HashMap date_fields_indices; + Vector date_field_aliases { // ECMA-402 and the CLDR refer to some date fields with different names. Defining these aliases // means we can remain agnostic about the naming differences elsewhere. @@ -332,20 +344,29 @@ static ErrorOr parse_identity(DeprecatedString locale_path, CLDR& cldr, Lo if (territory_string.has_value()) { locale.territory = territory_string.value(); - if (!cldr.territories.contains_slow(*locale.territory)) + + if (!cldr.territory_indices.contains(*locale.territory)) { + cldr.territory_indices.set(*locale.territory, 0); cldr.territories.append(*locale.territory); + } } if (script_string.has_value()) { - auto script = script_string.value(); - if (!cldr.scripts.contains_slow(script)) + auto const& script = script_string.value(); + + if (!cldr.script_indices.contains(script)) { + cldr.script_indices.set(script, 0); cldr.scripts.append(script); + } } if (variant_string.has_value()) { locale.variant = variant_string.value(); - if (!cldr.variants.contains_slow(*locale.variant)) + + if (!cldr.variant_indices.contains(*locale.variant)) { + cldr.variant_indices.set(*locale.variant, 0); cldr.variants.append(*locale.variant); + } } return {}; @@ -387,8 +408,10 @@ static ErrorOr preprocess_languages(DeprecatedString locale_path, CLDR& cl auto const& languages_object = locale_display_names_object.get_object("languages"sv).value(); languages_object.for_each_member([&](auto const& key, auto const&) { - if (!key.contains("-alt-"sv) && !cldr.languages.contains_slow(key)) + if (!key.contains("-alt-"sv) && !cldr.language_indices.contains(key)) { + cldr.language_indices.set(key, 0); cldr.languages.append(key); + } }); return {}; @@ -406,8 +429,10 @@ static ErrorOr preprocess_currencies(DeprecatedString numbers_path, CLDR& auto const& currencies_object = locale_numbers_object.get_object("currencies"sv).value(); currencies_object.for_each_member([&](auto const& key, JsonValue const&) { - if (!cldr.currencies.contains_slow(key)) + if (!cldr.currency_indices.contains(key)) { + cldr.currency_indices.set(key, 0); cldr.currencies.append(key); + } }); return {}; @@ -435,8 +460,10 @@ static ErrorOr preprocess_date_fields(DeprecatedString dates_path, CLDR& c if (!is_sanctioned_date_field(key)) return; - if (!cldr.date_fields.contains_slow(key)) + if (!cldr.date_fields_indices.contains(key)) { + cldr.date_fields_indices.set(key, 0); cldr.date_fields.append(key); + } }); return {}; @@ -532,7 +559,7 @@ static ErrorOr parse_locale_languages(DeprecatedString locale_path, CLDR& if (key.contains("-alt-"sv)) return; - auto index = cldr.languages.find_first_index(key).value(); + auto index = cldr.language_indices.get(key).value(); languages[index] = cldr.unique_strings.ensure(value.as_string()); }); @@ -563,7 +590,7 @@ static ErrorOr parse_locale_territories(DeprecatedString locale_path, CLDR auto const& territories_object = locale_display_names_object.get_object("territories"sv).value(); territories_object.for_each_member([&](auto const& key, JsonValue const& value) { - if (auto index = cldr.territories.find_first_index(key); index.has_value()) + if (auto index = cldr.territory_indices.get(key); index.has_value()) territories[*index] = cldr.unique_strings.ensure(value.as_string()); }); @@ -594,7 +621,7 @@ static ErrorOr parse_locale_scripts(DeprecatedString locale_path, CLDR& cl auto const& scripts_object = locale_display_names_object.get_object("scripts"sv).value(); scripts_object.for_each_member([&](auto const& key, JsonValue const& value) { - if (auto index = cldr.scripts.find_first_index(key); index.has_value()) + if (auto index = cldr.script_indices.get(key); index.has_value()) scripts[*index] = cldr.unique_strings.ensure(value.as_string()); }); @@ -713,7 +740,7 @@ static ErrorOr parse_locale_currencies(DeprecatedString numbers_path, CLDR auto narrow_name = value.as_object().get_deprecated_string("symbol-alt-narrow"sv); auto numeric_name = value.as_object().get_deprecated_string("displayName-count-other"sv); - auto index = cldr.currencies.find_first_index(key).value(); + auto index = cldr.currency_indices.get(key).value(); long_currencies[index] = cldr.unique_strings.ensure(move(long_name)); short_currencies[index] = cldr.unique_strings.ensure(move(short_name)); narrow_currencies[index] = narrow_name.has_value() ? cldr.unique_strings.ensure(narrow_name.release_value()) : 0; @@ -790,7 +817,7 @@ static ErrorOr parse_locale_date_fields(DeprecatedString dates_path, CLDR& auto const& short_name = fields_object.get_object(DeprecatedString::formatted("{}-short", key))->get_deprecated_string("displayName"sv).value(); auto const& narrow_name = fields_object.get_object(DeprecatedString::formatted("{}-narrow", key))->get_deprecated_string("displayName"sv).value(); - auto index = cldr.date_fields.find_first_index(key).value(); + auto index = cldr.date_fields_indices.get(key).value(); long_date_fields[index] = cldr.unique_strings.ensure(long_name); short_date_fields[index] = cldr.unique_strings.ensure(short_name); narrow_date_fields[index] = cldr.unique_strings.ensure(narrow_name); @@ -1030,11 +1057,19 @@ static ErrorOr parse_all_locales(DeprecatedString bcp47_path, DeprecatedSt return IterationDecision::Continue; })); - quick_sort(cldr.languages); - quick_sort(cldr.territories); - quick_sort(cldr.scripts); - quick_sort(cldr.currencies); - quick_sort(cldr.date_fields); + auto update_indices = [](auto& keys, auto& indices) { + quick_sort(keys); + + for (size_t i = 0; i < keys.size(); ++i) + indices.set(keys[i], i); + }; + + update_indices(cldr.languages, cldr.language_indices); + update_indices(cldr.territories, cldr.territory_indices); + update_indices(cldr.scripts, cldr.script_indices); + update_indices(cldr.variants, cldr.variant_indices); + update_indices(cldr.currencies, cldr.currency_indices); + update_indices(cldr.date_fields, cldr.date_fields_indices); TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/bcp47", bcp47_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr { auto bcp47_path = LexicalPath::join(directory.path().string(), entry.name).string();