mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-22 15:40:19 +00:00
LibUnicode: Generate Unicode locale likely subtag data
CLDR contains a set of likely subtag data which, given a locale, lets you resolve the most likely language, script, or territory of that locale. This data is needed for resolving territory aliases: an alias may map to multiple territories, and we need to resolve which of those territories is most likely correct for a locale. Note that the likely subtag data is quite large (a few thousand entries). As an optimization encouraged by the spec, we only generate the smallest subset of this data that we actually need (about 150 entries).
This commit is contained in:
parent
72f49e42b4
commit
1fbc5dba08
Notes:
sideshowbarker
2024-07-18 04:57:08 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/1fbc5dba080 Pull-request: https://github.com/SerenityOS/serenity/pull/9705 Reviewed-by: https://github.com/linusg ✅
3 changed files with 149 additions and 2 deletions
|
@ -43,6 +43,8 @@ struct UnicodeLocaleData {
|
||||||
HashMap<String, String> variant_aliases;
|
HashMap<String, String> variant_aliases;
|
||||||
HashMap<String, String> subdivision_aliases;
|
HashMap<String, String> subdivision_aliases;
|
||||||
HashMap<String, String> complex_mappings;
|
HashMap<String, String> complex_mappings;
|
||||||
|
HashMap<String, String> likely_subtags;
|
||||||
|
Vector<String> likely_territory_subtags;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void write_to_file_if_different(Core::File& file, StringView contents)
|
static void write_to_file_if_different(Core::File& file, StringView contents)
|
||||||
|
@ -73,10 +75,13 @@ static void parse_core_aliases(String core_supplemental_path, UnicodeLocaleData&
|
||||||
auto const& metadata_object = supplemental_object.as_object().get("metadata"sv);
|
auto const& metadata_object = supplemental_object.as_object().get("metadata"sv);
|
||||||
auto const& alias_object = metadata_object.as_object().get("alias"sv);
|
auto const& alias_object = metadata_object.as_object().get("alias"sv);
|
||||||
|
|
||||||
auto append_aliases = [&](auto& alias_object, auto& alias_map) {
|
auto append_aliases = [&](auto& alias_object, auto& alias_map, Vector<String>* likely_subtags_list = nullptr) {
|
||||||
alias_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
|
alias_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
|
||||||
auto alias = value.as_object().get("_replacement"sv).as_string();
|
auto alias = value.as_object().get("_replacement"sv).as_string();
|
||||||
|
|
||||||
|
if (auto aliases = alias.split(' '); likely_subtags_list && (aliases.size() > 1))
|
||||||
|
likely_subtags_list->extend(move(aliases));
|
||||||
|
|
||||||
if (key.contains('-'))
|
if (key.contains('-'))
|
||||||
locale_data.complex_mappings.set(key, move(alias));
|
locale_data.complex_mappings.set(key, move(alias));
|
||||||
else
|
else
|
||||||
|
@ -85,12 +90,46 @@ static void parse_core_aliases(String core_supplemental_path, UnicodeLocaleData&
|
||||||
};
|
};
|
||||||
|
|
||||||
append_aliases(alias_object.as_object().get("languageAlias"sv), locale_data.language_aliases);
|
append_aliases(alias_object.as_object().get("languageAlias"sv), locale_data.language_aliases);
|
||||||
append_aliases(alias_object.as_object().get("territoryAlias"sv), locale_data.territory_aliases);
|
append_aliases(alias_object.as_object().get("territoryAlias"sv), locale_data.territory_aliases, &locale_data.likely_territory_subtags);
|
||||||
append_aliases(alias_object.as_object().get("scriptAlias"sv), locale_data.script_aliases);
|
append_aliases(alias_object.as_object().get("scriptAlias"sv), locale_data.script_aliases);
|
||||||
append_aliases(alias_object.as_object().get("variantAlias"sv), locale_data.variant_aliases);
|
append_aliases(alias_object.as_object().get("variantAlias"sv), locale_data.variant_aliases);
|
||||||
append_aliases(alias_object.as_object().get("subdivisionAlias"sv), locale_data.subdivision_aliases);
|
append_aliases(alias_object.as_object().get("subdivisionAlias"sv), locale_data.subdivision_aliases);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Loads "likelySubtags.json" from the directory given by core_supplemental_path and stores a filtered
// subset of its "supplemental" -> "likelySubtags" members into locale_data.likely_subtags.
//
// An entry is kept only when the third '-'-separated component of its value is present in
// locale_data.likely_territory_subtags, so that list must already be populated when this runs
// (presumably by parse_core_aliases, which is handed a pointer to it - confirm against the caller).
//
// Aborts via VERIFY if the file is missing, cannot be opened, is not valid JSON, or if any value does
// not split into exactly three components.
static void parse_likely_subtags(String core_supplemental_path, UnicodeLocaleData& locale_data)
|
||||||
|
{
|
||||||
|
LexicalPath likely_subtags_path(move(core_supplemental_path));
|
||||||
|
likely_subtags_path = likely_subtags_path.append("likelySubtags.json"sv);
|
||||||
|
VERIFY(Core::File::exists(likely_subtags_path.string()));
|
||||||
|

||||||
|
auto likely_subtags_file_or_error = Core::File::open(likely_subtags_path.string(), Core::OpenMode::ReadOnly);
|
||||||
|
VERIFY(!likely_subtags_file_or_error.is_error());
|
||||||
|

||||||
|
auto likely_subtags = JsonParser(likely_subtags_file_or_error.value()->read_all()).parse();
|
||||||
|
VERIFY(likely_subtags.has_value());
|
||||||
|

||||||
|
auto const& supplemental_object = likely_subtags->as_object().get("supplemental"sv);
|
||||||
|
auto const& likely_subtags_object = supplemental_object.as_object().get("likelySubtags"sv);
|
||||||
|

||||||
|
likely_subtags_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
|
||||||
|
auto likely_subtag = value.as_string();
|
||||||
|

||||||
|
// Every value is required to have exactly three '-'-separated parts; regions[2] is the part
// compared against the territory list below.
auto regions = likely_subtag.split('-');
|
||||||
|
VERIFY(regions.size() == 3);
|
||||||
|

||||||
|
// Unicode TR35 has the following footnote in section 3.2.1 Canonical Unicode Locale Identifiers
|
||||||
|
//
|
||||||
|
// Formally, replacement of multiple territories uses Section 4.3 Likely Subtags. However, there are a small
|
||||||
|
// number of cases of multiple territories, so the mappings can be precomputed. This results in a faster
|
||||||
|
// lookup with a very small subset of the likely subtags data.
|
||||||
|
//
|
||||||
|
// Since the likely subtags data is quite large, and resolving likely territory subtags is our only use case for
|
||||||
|
// this data, we only generate likely subtags that contain one of the above multiple territories.
|
||||||
|
if (locale_data.likely_territory_subtags.contains_slow(regions[2]))
|
||||||
|
locale_data.likely_subtags.set(key, move(likely_subtag));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
static void parse_identity(String locale_path, UnicodeLocaleData& locale_data, Locale& locale)
|
static void parse_identity(String locale_path, UnicodeLocaleData& locale_data, Locale& locale)
|
||||||
{
|
{
|
||||||
LexicalPath languages_path(move(locale_path)); // Note: Every JSON file defines identity data, so we can use any of them.
|
LexicalPath languages_path(move(locale_path)); // Note: Every JSON file defines identity data, so we can use any of them.
|
||||||
|
@ -245,6 +284,7 @@ static void parse_all_locales(String core_path, String locale_names_path, String
|
||||||
VERIFY(Core::File::is_directory(core_supplemental_path.string()));
|
VERIFY(Core::File::is_directory(core_supplemental_path.string()));
|
||||||
|
|
||||||
parse_core_aliases(core_supplemental_path.string(), locale_data);
|
parse_core_aliases(core_supplemental_path.string(), locale_data);
|
||||||
|
parse_likely_subtags(core_supplemental_path.string(), locale_data);
|
||||||
|
|
||||||
while (locale_names_iterator.has_next()) {
|
while (locale_names_iterator.has_next()) {
|
||||||
auto locale_path = locale_names_iterator.next_full_path();
|
auto locale_path = locale_names_iterator.next_full_path();
|
||||||
|
@ -349,6 +389,7 @@ Optional<StringView> resolve_variant_alias(StringView const& variant);
|
||||||
Optional<StringView> resolve_subdivision_alias(StringView const& subdivision);
|
Optional<StringView> resolve_subdivision_alias(StringView const& subdivision);
|
||||||
|
|
||||||
void resolve_complex_language_aliases(Unicode::LanguageID& language_id);
|
void resolve_complex_language_aliases(Unicode::LanguageID& language_id);
|
||||||
|
Optional<String> resolve_most_likely_territory(Unicode::LanguageID const& language_id);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -503,8 +544,90 @@ static auto const& ensure_@name@_map()
|
||||||
};
|
};
|
||||||
|
|
||||||
append_complex_mapping("complex_alias"sv, locale_data.complex_mappings);
|
append_complex_mapping("complex_alias"sv, locale_data.complex_mappings);
|
||||||
|
append_complex_mapping("likely_subtags"sv, locale_data.likely_subtags);
|
||||||
|
|
||||||
generator.append(R"~~~(
|
generator.append(R"~~~(
|
||||||
|
static Unicode::LanguageID const* resolve_likely_subtag(Unicode::LanguageID const& language_id)
|
||||||
|
{
|
||||||
|
// https://unicode.org/reports/tr35/#Likely_Subtags
|
||||||
|
static auto const& likely_subtags_map = ensure_likely_subtags_map();
|
||||||
|
|
||||||
|
enum class State {
|
||||||
|
LanguageScriptRegion,
|
||||||
|
LanguageRegion,
|
||||||
|
LanguageScript,
|
||||||
|
Language,
|
||||||
|
UndScript,
|
||||||
|
Done,
|
||||||
|
};
|
||||||
|
|
||||||
|
auto state = State::LanguageScriptRegion;
|
||||||
|
|
||||||
|
while (state != State::Done) {
|
||||||
|
Unicode::LanguageID search_key;
|
||||||
|
|
||||||
|
switch (state) {
|
||||||
|
case State::LanguageScriptRegion:
|
||||||
|
state = State::LanguageRegion;
|
||||||
|
if (!language_id.script.has_value() || !language_id.region.has_value())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
search_key.language = language_id.language;
|
||||||
|
search_key.script = language_id.script;
|
||||||
|
search_key.region = language_id.region;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::LanguageRegion:
|
||||||
|
state = State::LanguageScript;
|
||||||
|
if (!language_id.region.has_value())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
search_key.language = language_id.language;
|
||||||
|
search_key.region = language_id.region;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::LanguageScript:
|
||||||
|
state = State::Language;
|
||||||
|
if (!language_id.script.has_value())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
search_key.language = language_id.language;
|
||||||
|
search_key.script = language_id.script;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::Language:
|
||||||
|
state = State::UndScript;
|
||||||
|
search_key.language = language_id.language;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::UndScript:
|
||||||
|
state = State::Done;
|
||||||
|
if (!language_id.script.has_value())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
search_key.language = "und"sv;
|
||||||
|
search_key.script = language_id.script;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
VERIFY_NOT_REACHED();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto const& map : likely_subtags_map) {
|
||||||
|
if (map.key.language != search_key.language)
|
||||||
|
continue;
|
||||||
|
if (map.key.script != search_key.script)
|
||||||
|
continue;
|
||||||
|
if (map.key.region != search_key.region)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
return &map.alias;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
namespace Detail {
|
namespace Detail {
|
||||||
)~~~");
|
)~~~");
|
||||||
|
|
||||||
|
@ -649,6 +772,13 @@ void resolve_complex_language_aliases(Unicode::LanguageID& language_id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Optional<String> resolve_most_likely_territory(Unicode::LanguageID const& language_id)
|
||||||
|
{
|
||||||
|
if (auto const* likely_subtag = resolve_likely_subtag(language_id); likely_subtag != nullptr)
|
||||||
|
return likely_subtag->region;
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -860,4 +860,19 @@ Optional<StringView> resolve_subdivision_alias(StringView subdivision)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Chooses a single territory from territory_alias, which is split on spaces into candidate territories.
//
// When ENABLE_UNICODE_DATA is set and there is more than one candidate, the territory returned by
// Detail::resolve_most_likely_territory(language_id) is used if it has a value and is one of the
// candidates. In every other case the first candidate is returned.
//
// language_id is marked [[maybe_unused]] because it is only read inside the ENABLE_UNICODE_DATA section.
//
// NOTE(review): aliases[0] is accessed without a size check, so this assumes territory_alias yields at
// least one token - confirm callers never pass an empty or all-space alias.
String resolve_most_likely_territory([[maybe_unused]] LanguageID const& language_id, StringView territory_alias)
|
||||||
|
{
|
||||||
|
auto aliases = territory_alias.split_view(' ');
|
||||||
|

||||||
|
#if ENABLE_UNICODE_DATA
|
||||||
|
// A single candidate needs no disambiguation, so the lookup only runs for multi-territory aliases.
if (aliases.size() > 1) {
|
||||||
|
auto territory = Detail::resolve_most_likely_territory(language_id);
|
||||||
|
if (territory.has_value() && aliases.contains_slow(*territory))
|
||||||
|
return territory.release_value();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|

||||||
|
// Fallback: the first listed territory.
return aliases[0].to_string();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -81,4 +81,6 @@ Optional<StringView> resolve_script_tag_alias(StringView script_tag);
|
||||||
Optional<StringView> resolve_variant_alias(StringView variant);
|
Optional<StringView> resolve_variant_alias(StringView variant);
|
||||||
Optional<StringView> resolve_subdivision_alias(StringView subdivision);
|
Optional<StringView> resolve_subdivision_alias(StringView subdivision);
|
||||||
|
|
||||||
|
String resolve_most_likely_territory(LanguageID const& language_id, StringView territory_alias);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue