From 1fbc5dba08062e6d5702e49cf24fc869a7de304d Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 31 Aug 2021 09:40:24 -0400 Subject: [PATCH] LibUnicode: Generate Unicode locale likely subtag data CLDR contains a set of likely subtag data where, given a locale, you can resolve what is the most likely language, script, or territory of that locale. This data is needed for resolving territory aliases. These aliases might contain multiple territories, and we need to resolve which of those territories is most likely correct for a locale. Note that the likely subtag data is quite huge (a few thousand entries). As an optimization encouraged by the spec, we only generate the smallest subset of this data that we actually need (about 150 entries). --- .../LibUnicode/GenerateUnicodeLocale.cpp | 134 +++++++++++++++++- Userland/Libraries/LibUnicode/Locale.cpp | 15 ++ Userland/Libraries/LibUnicode/Locale.h | 2 + 3 files changed, 149 insertions(+), 2 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp index 93d2cd5263a..256f4f429d6 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp @@ -43,6 +43,8 @@ struct UnicodeLocaleData { HashMap variant_aliases; HashMap subdivision_aliases; HashMap complex_mappings; + HashMap likely_subtags; + Vector likely_territory_subtags; }; static void write_to_file_if_different(Core::File& file, StringView contents) @@ -73,10 +75,13 @@ static void parse_core_aliases(String core_supplemental_path, UnicodeLocaleData& auto const& metadata_object = supplemental_object.as_object().get("metadata"sv); auto const& alias_object = metadata_object.as_object().get("alias"sv); - auto append_aliases = [&](auto& alias_object, auto& alias_map) { + auto append_aliases = [&](auto& alias_object, auto& alias_map, Vector* likely_subtags_list = nullptr) { alias_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) { auto alias = value.as_object().get("_replacement"sv).as_string(); + if (auto aliases = alias.split(' '); likely_subtags_list && (aliases.size() > 1)) + likely_subtags_list->extend(move(aliases)); + if (key.contains('-')) locale_data.complex_mappings.set(key, move(alias)); else @@ -85,12 +90,46 @@ static void parse_core_aliases(String core_supplemental_path, UnicodeLocaleData& }; append_aliases(alias_object.as_object().get("languageAlias"sv), locale_data.language_aliases); - append_aliases(alias_object.as_object().get("territoryAlias"sv), locale_data.territory_aliases); + append_aliases(alias_object.as_object().get("territoryAlias"sv), locale_data.territory_aliases, &locale_data.likely_territory_subtags); append_aliases(alias_object.as_object().get("scriptAlias"sv), locale_data.script_aliases); append_aliases(alias_object.as_object().get("variantAlias"sv), locale_data.variant_aliases); append_aliases(alias_object.as_object().get("subdivisionAlias"sv), locale_data.subdivision_aliases); } +static void parse_likely_subtags(String core_supplemental_path, UnicodeLocaleData& locale_data) +{ + LexicalPath likely_subtags_path(move(core_supplemental_path)); + likely_subtags_path = likely_subtags_path.append("likelySubtags.json"sv); + VERIFY(Core::File::exists(likely_subtags_path.string())); + + auto likely_subtags_file_or_error = Core::File::open(likely_subtags_path.string(), Core::OpenMode::ReadOnly); + VERIFY(!likely_subtags_file_or_error.is_error()); + + auto likely_subtags = JsonParser(likely_subtags_file_or_error.value()->read_all()).parse(); + VERIFY(likely_subtags.has_value()); + + auto const& supplemental_object = likely_subtags->as_object().get("supplemental"sv); + auto const& likely_subtags_object = supplemental_object.as_object().get("likelySubtags"sv); + + likely_subtags_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) { + auto likely_subtag = value.as_string(); + + auto regions = likely_subtag.split('-'); + VERIFY(regions.size() == 3); + + // Unicode TR35 has the following footnote in section 3.2.1 Canonical Unicode Locale Identifiers + // + // Formally, replacement of multiple territories uses Section 4.3 Likely Subtags. However, there are a small + // number of cases of multiple territories, so the mappings can be precomputed. This results in a faster + // lookup with a very small subset of the likely subtags data. + // + // Since the likely subtags data is quite large, and resolving likely territory subtags is our only use case for + // this data, we only generate likely subtags that contain one of the above multiple territories. + if (locale_data.likely_territory_subtags.contains_slow(regions[2])) + locale_data.likely_subtags.set(key, move(likely_subtag)); + }); +} + static void parse_identity(String locale_path, UnicodeLocaleData& locale_data, Locale& locale) { LexicalPath languages_path(move(locale_path)); // Note: Every JSON file defines identity data, so we can use any of them. @@ -245,6 +284,7 @@ static void parse_all_locales(String core_path, String locale_names_path, String VERIFY(Core::File::is_directory(core_supplemental_path.string())); parse_core_aliases(core_supplemental_path.string(), locale_data); + parse_likely_subtags(core_supplemental_path.string(), locale_data); while (locale_names_iterator.has_next()) { auto locale_path = locale_names_iterator.next_full_path(); @@ -349,6 +389,7 @@ Optional resolve_variant_alias(StringView const& variant); Optional resolve_subdivision_alias(StringView const& subdivision); void resolve_complex_language_aliases(Unicode::LanguageID& language_id); +Optional resolve_most_likely_territory(Unicode::LanguageID const& language_id); } @@ -503,8 +544,90 @@ static auto const& ensure_@name@_map() }; append_complex_mapping("complex_alias"sv, locale_data.complex_mappings); + append_complex_mapping("likely_subtags"sv, locale_data.likely_subtags); generator.append(R"~~~( +static Unicode::LanguageID const* resolve_likely_subtag(Unicode::LanguageID const& language_id) +{ + // https://unicode.org/reports/tr35/#Likely_Subtags + static auto const& likely_subtags_map = ensure_likely_subtags_map(); + + enum class State { + LanguageScriptRegion, + LanguageRegion, + LanguageScript, + Language, + UndScript, + Done, + }; + + auto state = State::LanguageScriptRegion; + + while (state != State::Done) { + Unicode::LanguageID search_key; + + switch (state) { + case State::LanguageScriptRegion: + state = State::LanguageRegion; + if (!language_id.script.has_value() || !language_id.region.has_value()) + continue; + + search_key.language = language_id.language; + search_key.script = language_id.script; + search_key.region = language_id.region; + break; + + case State::LanguageRegion: + state = State::LanguageScript; + if (!language_id.region.has_value()) + continue; + + search_key.language = language_id.language; + search_key.region = language_id.region; + break; + + case State::LanguageScript: + state = State::Language; + if (!language_id.script.has_value()) + continue; + + search_key.language = language_id.language; + search_key.script = language_id.script; + break; + + case State::Language: + state = State::UndScript; + search_key.language = language_id.language; + break; + + case State::UndScript: + state = State::Done; + if (!language_id.script.has_value()) + continue; + + search_key.language = "und"sv; + search_key.script = language_id.script; + break; + + default: + VERIFY_NOT_REACHED(); + } + + for (auto const& map : likely_subtags_map) { + if (map.key.language != search_key.language) + continue; + if (map.key.script != search_key.script) + continue; + if (map.key.region != search_key.region) + continue; + + return &map.alias; + } + } + + return nullptr; +} + namespace Detail { )~~~"); @@ -649,6 +772,13 @@ void resolve_complex_language_aliases(Unicode::LanguageID& language_id) } } +Optional resolve_most_likely_territory(Unicode::LanguageID const& language_id) +{ + if (auto const* likely_subtag = resolve_likely_subtag(language_id); likely_subtag != nullptr) + return likely_subtag->region; + return {}; +} + } } diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp index 291b6f07825..88e86ce7f64 100644 --- a/Userland/Libraries/LibUnicode/Locale.cpp +++ b/Userland/Libraries/LibUnicode/Locale.cpp @@ -860,4 +860,19 @@ Optional resolve_subdivision_alias(StringView subdivision) #endif } +String resolve_most_likely_territory([[maybe_unused]] LanguageID const& language_id, StringView territory_alias) +{ + auto aliases = territory_alias.split_view(' '); + +#if ENABLE_UNICODE_DATA + if (aliases.size() > 1) { + auto territory = Detail::resolve_most_likely_territory(language_id); + if (territory.has_value() && aliases.contains_slow(*territory)) + return territory.release_value(); + } +#endif + + return aliases[0].to_string(); +} + } diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h index de5bcb0b8b7..7cff3ad5f48 100644 --- a/Userland/Libraries/LibUnicode/Locale.h +++ b/Userland/Libraries/LibUnicode/Locale.h @@ -81,4 +81,6 @@ Optional resolve_script_tag_alias(StringView script_tag); Optional resolve_variant_alias(StringView variant); Optional resolve_subdivision_alias(StringView subdivision); +String resolve_most_likely_territory(LanguageID const& language_id, StringView territory_alias); + }