LibJS+LibLocale: Replace locale maximization and minimization with ICU

This commit is contained in:
Timothy Flynn 2024-06-08 13:23:26 -04:00 committed by Andreas Kling
parent 9724a25daf
commit e487f91388
Notes: sideshowbarker 2024-07-18 04:46:35 +09:00
7 changed files with 81 additions and 308 deletions

View file

@ -123,11 +123,6 @@ struct LocaleData {
size_t text_layout { 0 };
};
// One likely-subtags entry: `key` is the parsed language ID that is looked up,
// and `alias` is the parsed language ID that the key maps to.
struct LanguageMapping {
CanonicalLanguageID key {};
CanonicalLanguageID alias {};
};
struct CLDR {
UniqueStringStorage unique_strings;
UniqueStorage<KeywordList> unique_keyword_lists;
@ -144,8 +139,6 @@ struct CLDR {
Vector<ByteString> list_pattern_types;
Vector<ByteString> character_orders;
Vector<LanguageMapping> likely_subtags;
size_t max_variant_size { 0 };
};
// Some parsing is expected to fail. For example, the CLDR contains language mappings
@ -174,32 +167,6 @@ ErrorOr<JsonValue const*> read_json_file_with_cache(ByteString const& path)
return &parsed_json_cache.get(path).value();
}
// Parses both halves of a likely-subtags entry into a LanguageMapping.
// Strings are interned into `cldr.unique_strings`; the key is parsed before the alias.
// Propagates the error if either side fails to parse.
static ErrorOr<LanguageMapping> parse_language_mapping(CLDR& cldr, StringView key, StringView alias)
{
    LanguageMapping mapping {};
    mapping.key = TRY(CanonicalLanguageID::parse(cldr.unique_strings, key));
    mapping.alias = TRY(CanonicalLanguageID::parse(cldr.unique_strings, alias));

    return mapping;
}
// Loads "likelySubtags.json" from the given supplemental directory and appends every
// entry that parses successfully to `cldr.likely_subtags`. Also tracks the largest
// variant count seen on either side of a mapping in `cldr.max_variant_size`.
static ErrorOr<void> parse_likely_subtags(ByteString core_supplemental_path, CLDR& cldr)
{
    LexicalPath supplemental_directory(move(core_supplemental_path));
    auto json_path = supplemental_directory.append("likelySubtags.json"sv);

    auto document = TRY(read_json_file(json_path.string()));
    auto const& supplemental = document.as_object().get_object("supplemental"sv).value();
    auto const& entries = supplemental.get_object("likelySubtags"sv).value();

    entries.for_each_member([&](auto const& tag, JsonValue const& replacement) {
        // Entries that fail to parse are dropped rather than treated as an error.
        auto parsed = TRY_OR_DISCARD(parse_language_mapping(cldr, tag, replacement.as_string()));

        auto widest = max(parsed.key.variants.size(), parsed.alias.variants.size());
        cldr.max_variant_size = max(widest, cldr.max_variant_size);

        cldr.likely_subtags.append(move(parsed));
    });

    return {};
}
static ErrorOr<void> parse_unicode_extension_keywords(ByteString bcp47_path, CLDR& cldr)
{
constexpr auto desired_keywords = Array { "ca"sv, "co"sv, "hc"sv, "kf"sv, "kn"sv, "nu"sv };
@ -531,8 +498,6 @@ static ErrorOr<void> parse_all_locales(ByteString bcp47_path, ByteString core_pa
core_supplemental_path = core_supplemental_path.append("supplemental"sv);
VERIFY(FileSystem::is_directory(core_supplemental_path.string()));
TRY(parse_likely_subtags(core_supplemental_path.string(), cldr));
auto remove_variants_from_path = [&](ByteString path) -> ErrorOr<ByteString> {
auto parsed_locale = TRY(CanonicalLanguageID::parse(cldr.unique_strings, LexicalPath::basename(path)));
@ -634,7 +599,6 @@ static ErrorOr<void> generate_unicode_locale_implementation(Core::InputBufferedF
SourceGenerator generator { builder };
generator.set("string_index_type"sv, string_index_type);
generator.set("locales_size"sv, ByteString::number(cldr.locales.size()));
generator.set("variants_size", ByteString::number(cldr.max_variant_size));
generator.append(R"~~~(
#include <AK/Array.h>
@ -715,26 +679,6 @@ ReadonlySpan<StringView> get_available_keyword_values(StringView key)
cldr.unique_list_pattern_lists.generate(generator, cldr.unique_list_patterns.type_that_fits(), "s_list_pattern_lists"sv);
cldr.unique_text_layouts.generate(generator, "TextLayout"sv, "s_text_layouts"sv, 30);
// Emits ", <index>" into the generated source, i.e. one more comma-separated field.
auto append_index = [&](auto index) {
generator.append(ByteString::formatted(", {}", index));
};
// Emits a brace-enclosed list of the items followed by the item count, each preceded
// by a comma. An empty list is emitted as ", {}, 0".
auto append_list_and_size = [&](auto const& list) {
    if (list.is_empty()) {
        generator.append(", {}, 0");
        return;
    }

    generator.append(", {");

    // The first item is preceded by a single space, every later item by ", ".
    auto separator = " "sv;
    for (auto const& item : list) {
        generator.append(separator);
        generator.append(ByteString::number(item));
        separator = ", "sv;
    }

    generator.append(ByteString::formatted(" }}, {}", list.size()));
};
auto append_mapping = [&](auto const& keys, auto const& map, auto type, auto name, auto mapping_getter) {
generator.set("type", type);
generator.set("name", name);
@ -766,156 +710,6 @@ static constexpr Array<@type@, @size@> @name@ { {)~~~");
append_mapping(locales, cldr.locales, cldr.unique_list_pattern_lists.type_that_fits(), "s_locale_list_patterns"sv, [&](auto const& locale) { return locale.list_patterns; });
append_mapping(locales, cldr.locales, cldr.unique_text_layouts.type_that_fits(), "s_locale_text_layouts"sv, [&](auto const& locale) { return locale.text_layout; });
generator.append(R"~~~(
struct CanonicalLanguageID
{
@string_index_type@ language { 0 };
@string_index_type@ script { 0 };
@string_index_type@ region { 0 };
Array<@string_index_type@, @variants_size@> variants {};
size_t variants_size { 0 };
};
struct LanguageMapping {
CanonicalLanguageID key;
CanonicalLanguageID alias;
};
)~~~");
// Emits a `static constexpr Array<LanguageMapping, N> s_<name>` table into the generated source.
// `mappings` is sorted in place before being written out. Each emitted entry consists of the
// key's language, script, region, variants and variant count, followed by the same fields
// for the alias.
auto append_complex_mapping = [&](StringView name, auto& mappings) {
generator.set("size", ByteString::number(mappings.size()));
generator.set("name"sv, name);
generator.append(R"~~~(
static constexpr Array<LanguageMapping, @size@> s_@name@ { {
)~~~");
// Order entries by key language only; script, region and variants do not take part in the comparison.
quick_sort(mappings, [&](auto const& lhs, auto const& rhs) {
auto const& lhs_language = cldr.unique_strings.get(lhs.key.language);
auto const& rhs_language = cldr.unique_strings.get(rhs.key.language);
// Sort the keys such that "und" language tags are at the end, as those are less specific.
if (lhs_language.starts_with("und"sv) && !rhs_language.starts_with("und"sv))
return false;
if (!lhs_language.starts_with("und"sv) && rhs_language.starts_with("und"sv))
return true;
return lhs_language < rhs_language;
});
for (auto const& mapping : mappings) {
// Key half of the entry. The "language" template variable is reused for the alias below.
generator.set("language"sv, ByteString::number(mapping.key.language));
generator.append(" { { @language@");
append_index(mapping.key.script);
append_index(mapping.key.region);
append_list_and_size(mapping.key.variants);
// Alias half of the entry.
generator.set("language"sv, ByteString::number(mapping.alias.language));
generator.append(" }, { @language@");
append_index(mapping.alias.script);
append_index(mapping.alias.region);
append_list_and_size(mapping.alias.variants);
generator.append(" } },\n");
}
generator.append("} };\n");
};
append_complex_mapping("likely_subtags"sv, cldr.likely_subtags);
generator.append(R"~~~(
static LanguageMapping const* resolve_likely_subtag(LanguageID const& language_id)
{
// https://unicode.org/reports/tr35/#Likely_Subtags
enum class State {
LanguageScriptRegion,
LanguageRegion,
LanguageScript,
Language,
UndScript,
Done,
};
auto state = State::LanguageScriptRegion;
while (state != State::Done) {
LanguageID search_key;
switch (state) {
case State::LanguageScriptRegion:
state = State::LanguageRegion;
if (!language_id.script.has_value() || !language_id.region.has_value())
continue;
search_key.language = *language_id.language;
search_key.script = *language_id.script;
search_key.region = *language_id.region;
break;
case State::LanguageRegion:
state = State::LanguageScript;
if (!language_id.region.has_value())
continue;
search_key.language = *language_id.language;
search_key.region = *language_id.region;
break;
case State::LanguageScript:
state = State::Language;
if (!language_id.script.has_value())
continue;
search_key.language = *language_id.language;
search_key.script = *language_id.script;
break;
case State::Language:
state = State::UndScript;
search_key.language = *language_id.language;
break;
case State::UndScript:
state = State::Done;
if (!language_id.script.has_value())
continue;
search_key.language = "und"_string;
search_key.script = *language_id.script;
break;
default:
VERIFY_NOT_REACHED();
}
for (auto const& map : s_likely_subtags) {
auto const& key_language = decode_string(map.key.language);
auto const& key_script = decode_string(map.key.script);
auto const& key_region = decode_string(map.key.region);
if (key_language != search_key.language)
continue;
if (!key_script.is_empty() || search_key.script.has_value()) {
if (key_script != search_key.script)
continue;
}
if (!key_region.is_empty() || search_key.region.has_value()) {
if (key_region != search_key.region)
continue;
}
return &map;
}
}
return nullptr;
}
)~~~");
auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values, Vector<Alias> const& aliases = {}) -> ErrorOr<void> {
HashValueMap<ByteString> hashes;
TRY(hashes.try_ensure_capacity(values.size()));
@ -1094,33 +888,6 @@ Optional<CharacterOrder> character_order_for_locale(StringView locale)
return {};
}
Optional<LanguageID> add_likely_subtags(LanguageID const& language_id)
{
// https://www.unicode.org/reports/tr35/#Likely_Subtags
auto const* likely_subtag = resolve_likely_subtag(language_id);
if (likely_subtag == nullptr)
return OptionalNone {};
auto maximized = language_id;
auto key_script = decode_string(likely_subtag->key.script);
auto key_region = decode_string(likely_subtag->key.region);
auto alias_language = decode_string(likely_subtag->alias.language);
auto alias_script = decode_string(likely_subtag->alias.script);
auto alias_region = decode_string(likely_subtag->alias.region);
if (maximized.language == "und"sv)
maximized.language = MUST(String::from_utf8(alias_language));
if (!maximized.script.has_value() || (!key_script.is_empty() && !alias_script.is_empty()))
maximized.script = MUST(String::from_utf8(alias_script));
if (!maximized.region.has_value() || (!key_region.is_empty() && !alias_region.is_empty()))
maximized.region = MUST(String::from_utf8(alias_region));
return maximized;
}
}
)~~~");

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -16,32 +16,17 @@ namespace JS::Intl {
JS_DEFINE_ALLOCATOR(Locale);
NonnullGCPtr<Locale> Locale::create(Realm& realm, ::Locale::LocaleID locale_id)
NonnullGCPtr<Locale> Locale::create(Realm& realm, NonnullGCPtr<Locale> source_locale, String locale_tag)
{
auto locale = realm.heap().allocate<Locale>(realm, realm.intrinsics().intl_locale_prototype());
locale->set_locale(locale_id.to_string());
for (auto& extension : locale_id.extensions) {
if (!extension.has<::Locale::LocaleExtension>())
continue;
for (auto& keyword : extension.get<::Locale::LocaleExtension>().keywords) {
if (keyword.key == "ca"sv)
locale->set_calendar(move(keyword.value));
else if (keyword.key == "co"sv)
locale->set_collation(move(keyword.value));
else if (keyword.key == "hc"sv)
locale->set_hour_cycle(move(keyword.value));
else if (keyword.key == "kf"sv)
locale->set_case_first(move(keyword.value));
else if (keyword.key == "kn"sv)
locale->set_numeric(keyword.value.is_empty());
else if (keyword.key == "nu"sv)
locale->set_numbering_system(move(keyword.value));
}
break;
}
locale->set_locale(move(locale_tag));
locale->m_calendar = source_locale->m_calendar;
locale->m_case_first = source_locale->m_case_first;
locale->m_collation = source_locale->m_collation;
locale->m_hour_cycle = source_locale->m_hour_cycle;
locale->m_numbering_system = source_locale->m_numbering_system;
locale->m_numeric = source_locale->m_numeric;
return locale;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -23,7 +23,7 @@ class Locale final : public Object {
JS_DECLARE_ALLOCATOR(Locale);
public:
static NonnullGCPtr<Locale> create(Realm&, ::Locale::LocaleID);
static NonnullGCPtr<Locale> create(Realm&, NonnullGCPtr<Locale> source_locale, String);
static constexpr auto relevant_extension_keys()
{

View file

@ -64,15 +64,11 @@ JS_DEFINE_NATIVE_FUNCTION(LocalePrototype::maximize)
// 2. Perform ? RequireInternalSlot(loc, [[InitializedLocale]]).
auto locale_object = TRY(typed_this_object(vm));
auto locale = ::Locale::parse_unicode_locale_id(locale_object->locale());
VERIFY(locale.has_value());
// 3. Let maximal be the result of the Add Likely Subtags algorithm applied to loc.[[Locale]]. If an error is signaled, set maximal to loc.[[Locale]].
if (auto maximal = ::Locale::add_likely_subtags(locale->language_id); maximal.has_value())
locale->language_id = maximal.release_value();
auto maximal = ::Locale::add_likely_subtags(locale_object->locale()).value_or(locale_object->locale());
// 4. Return ! Construct(%Locale%, maximal).
return Locale::create(realm, locale.release_value());
return Locale::create(realm, locale_object, move(maximal));
}
// 14.3.4 Intl.Locale.prototype.minimize ( ), https://tc39.es/ecma402/#sec-Intl.Locale.prototype.minimize
@ -84,15 +80,11 @@ JS_DEFINE_NATIVE_FUNCTION(LocalePrototype::minimize)
// 2. Perform ? RequireInternalSlot(loc, [[InitializedLocale]]).
auto locale_object = TRY(typed_this_object(vm));
auto locale = ::Locale::parse_unicode_locale_id(locale_object->locale());
VERIFY(locale.has_value());
// 3. Let minimal be the result of the Remove Likely Subtags algorithm applied to loc.[[Locale]]. If an error is signaled, set minimal to loc.[[Locale]].
if (auto minimal = ::Locale::remove_likely_subtags(locale->language_id); minimal.has_value())
locale->language_id = minimal.release_value();
auto minimal = ::Locale::remove_likely_subtags(locale_object->locale()).value_or(locale_object->locale());
// 4. Return ! Construct(%Locale%, minimal).
return Locale::create(realm, locale.release_value());
return Locale::create(realm, locale_object, move(minimal));
}
// 14.3.5 Intl.Locale.prototype.toString ( ), https://tc39.es/ecma402/#sec-Intl.Locale.prototype.toString

View file

@ -113,8 +113,11 @@ static T find_regional_values_for_locale(StringView locale, GetRegionalValues&&
if (!language.has_value())
return return_default_values();
if (!language->region.has_value())
language = add_likely_subtags(*language);
if (!language->region.has_value()) {
if (auto maximized = add_likely_subtags(language->to_string()); maximized.has_value())
language = parse_unicode_language_id(*maximized);
}
if (!language.has_value() || !language->region.has_value())
return return_default_values();

View file

@ -600,48 +600,74 @@ Optional<ListPatterns> __attribute__((weak)) get_locale_list_patterns(StringView
Optional<CharacterOrder> __attribute__((weak)) character_order_from_string(StringView) { return {}; }
StringView __attribute__((weak)) character_order_to_string(CharacterOrder) { return {}; }
Optional<CharacterOrder> __attribute__((weak)) character_order_for_locale(StringView) { return {}; }
Optional<LanguageID> __attribute__((weak)) add_likely_subtags(LanguageID const&) { return {}; }
Optional<LanguageID> remove_likely_subtags(LanguageID const& language_id)
static void apply_extensions_to_locale(icu::Locale& locale, icu::Locale const& locale_with_extensions)
{
// https://www.unicode.org/reports/tr35/#Likely_Subtags
auto return_language_and_variants = [](auto language, auto variants) {
language.variants = move(variants);
return language;
};
UErrorCode status = U_ZERO_ERROR;
// 1. First get max = AddLikelySubtags(inputLocale). If an error is signaled, return it.
auto maximized = add_likely_subtags(language_id);
if (!maximized.has_value())
icu::LocaleBuilder builder;
builder.setLocale(locale_with_extensions);
builder.setLanguage(locale.getLanguage());
builder.setRegion(locale.getCountry());
builder.setScript(locale.getScript());
builder.setVariant(locale.getVariant());
locale = builder.build(status);
VERIFY(icu_success(status));
}
Optional<String> add_likely_subtags(StringView locale)
{
UErrorCode status = U_ZERO_ERROR;
auto locale_data = LocaleData::for_locale(locale);
if (!locale_data.has_value())
return {};
// 2. Remove the variants from max.
auto variants = move(maximized->variants);
// ICU doesn't seem to handle maximizing locales that have keywords. For example, "und-x-private" should become
// "en-Latn-US-x-private" (in the same manner that "und" becomes "en-Latn-US"). So here, we maximize the locale
// without keywords, then add them back if needed.
auto maximized = icu::Locale::createFromName(locale_data->locale().getBaseName());
// 3. Get the components of the max (languagemax, scriptmax, regionmax).
auto language_max = maximized->language;
auto script_max = maximized->script;
auto region_max = maximized->region;
// 4. Then for trial in {languagemax, languagemax_regionmax, languagemax_scriptmax}:
// If AddLikelySubtags(trial) = max, then return trial + variants.
auto run_trial = [&](Optional<String> language, Optional<String> script, Optional<String> region) -> Optional<LanguageID> {
LanguageID trial { .language = move(language), .script = move(script), .region = move(region) };
if (add_likely_subtags(trial) == maximized)
return return_language_and_variants(move(trial), move(variants));
maximized.addLikelySubtags(status);
if (icu_failure(status))
return {};
};
if (auto trial = run_trial(language_max, {}, {}); trial.has_value())
return trial;
if (auto trial = run_trial(language_max, {}, region_max); trial.has_value())
return trial;
if (auto trial = run_trial(language_max, script_max, {}); trial.has_value())
return trial;
if (strlen(locale_data->locale().getName()) != strlen(locale_data->locale().getBaseName()))
apply_extensions_to_locale(maximized, locale_data->locale());
// 5. If you do not get a match, return max + variants.
return return_language_and_variants(maximized.release_value(), move(variants));
auto result = maximized.toLanguageTag<StringBuilder>(status);
if (icu_failure(status))
return {};
return MUST(result.to_string());
}
// Minimizes `locale` with ICU and returns the result as a language tag.
// Returns an empty Optional if no locale data is available for `locale`, or if ICU reports
// a failure while minimizing or while formatting the language tag.
Optional<String> remove_likely_subtags(StringView locale)
{
    auto locale_data = LocaleData::for_locale(locale);
    if (!locale_data.has_value())
        return {};

    auto const& original = locale_data->locale();
    UErrorCode status = U_ZERO_ERROR;

    // ICU doesn't seem to handle minimizing locales that have keywords. For example, "und-x-private" should become
    // "en-x-private" (in the same manner that "und" becomes "en"). So here, we minimize the locale without keywords,
    // then add them back if needed.
    auto minimized = icu::Locale::createFromName(original.getBaseName());

    minimized.minimizeSubtags(status);
    if (icu_failure(status))
        return {};

    // The full name and the base name have different lengths when the full name carries more than the base name.
    bool const has_extensions = strlen(original.getName()) != strlen(original.getBaseName());
    if (has_extensions)
        apply_extensions_to_locale(minimized, original);

    auto language_tag = minimized.toLanguageTag<StringBuilder>(status);
    if (icu_failure(status))
        return {};

    return MUST(language_tag.to_string());
}
String LanguageID::to_string() const

View file

@ -177,7 +177,7 @@ Optional<CharacterOrder> character_order_from_string(StringView character_order)
StringView character_order_to_string(CharacterOrder character_order);
Optional<CharacterOrder> character_order_for_locale(StringView locale);
Optional<LanguageID> add_likely_subtags(LanguageID const& language_id);
Optional<LanguageID> remove_likely_subtags(LanguageID const& language_id);
Optional<String> add_likely_subtags(StringView);
Optional<String> remove_likely_subtags(StringView);
}