diff --git a/Tests/LibUnicode/TestUnicodeLocale.cpp b/Tests/LibUnicode/TestUnicodeLocale.cpp index 53ffddbf2cc..a82b9eae2b8 100644 --- a/Tests/LibUnicode/TestUnicodeLocale.cpp +++ b/Tests/LibUnicode/TestUnicodeLocale.cpp @@ -138,7 +138,7 @@ TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension) auto const& expected_keyword = expected_extension.keywords[i]; EXPECT_EQ(actual_keyword.key, expected_keyword.key); - EXPECT_EQ(actual_keyword.types, expected_keyword.types); + EXPECT_EQ(actual_keyword.value, expected_keyword.value); } }; @@ -153,15 +153,15 @@ TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension) fail("en-u-xxxxx-"sv); fail("en-u-xxxxxxxxx"sv); - pass("en-u-xx"sv, { {}, { { "xx"sv, {} } } }); + pass("en-u-xx"sv, { {}, { { "xx"sv, ""sv } } }); pass("en-u-xx-yyyy"sv, { {}, { { "xx"sv, { "yyyy"sv } } } }); - pass("en-u-xx-yyyy-zzzz"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } } } }); - pass("en-u-xx-yyyy-zzzz-aa"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } }, { "aa"sv, {} } } }); + pass("en-u-xx-yyyy-zzzz"sv, { {}, { { "xx"sv, "yyyy-zzzz"sv } } }); + pass("en-u-xx-yyyy-zzzz-aa"sv, { {}, { { "xx"sv, "yyyy-zzzz"sv }, { "aa"sv, ""sv } } }); pass("en-u-xxx"sv, { { "xxx"sv }, {} }); pass("en-u-fff-gggg"sv, { { "fff"sv, "gggg"sv }, {} }); - pass("en-u-fff-xx"sv, { { "fff"sv }, { { "xx"sv, {} } } }); - pass("en-u-fff-xx-yyyy"sv, { { "fff"sv }, { { "xx"sv, { "yyyy"sv } } } }); - pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, { "yyyy"sv } } } }); + pass("en-u-fff-xx"sv, { { "fff"sv }, { { "xx"sv, ""sv } } }); + pass("en-u-fff-xx-yyyy"sv, { { "fff"sv }, { { "xx"sv, "yyyy"sv } } }); + pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, "yyyy"sv } } }); } TEST_CASE(parse_unicode_locale_id_with_transformed_extension) @@ -192,7 +192,7 @@ TEST_CASE(parse_unicode_locale_id_with_transformed_extension) auto const& expected_field = expected_extension.fields[i]; EXPECT_EQ(actual_field.key, expected_field.key); - EXPECT_EQ(actual_field.values, expected_field.values); + EXPECT_EQ(actual_field.value, expected_field.value); } }; @@ -225,9 +225,9 @@ TEST_CASE(parse_unicode_locale_id_with_transformed_extension) pass("en-t-en-us-posix"sv, { Unicode::LanguageID { false, "en"sv, {}, "us"sv, { "posix"sv } }, {} }); pass("en-t-en-latn-us-posix"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, "us"sv, { "posix"sv } }, {} }); pass("en-t-k0-aaa"sv, { {}, { { "k0"sv, { "aaa"sv } } } }); - pass("en-t-k0-aaa-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv, "bbbb" } } } }); - pass("en-t-k0-aaa-k1-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv } }, { "k1"sv, { "bbbb"sv } } } }); - pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, { "aaa"sv } } } }); + pass("en-t-k0-aaa-bbbb"sv, { {}, { { "k0"sv, "aaa-bbbb"sv } } }); + pass("en-t-k0-aaa-k1-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv } }, { "k1"sv, "bbbb"sv } } }); + pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, "aaa"sv } } }); } TEST_CASE(parse_unicode_locale_id_with_other_extension) @@ -243,7 +243,7 @@ TEST_CASE(parse_unicode_locale_id_with_other_extension) auto const& actual_extension = locale_id->extensions[0].get(); EXPECT_EQ(actual_extension.key, expected_extension.key); - EXPECT_EQ(actual_extension.values, expected_extension.values); + EXPECT_EQ(actual_extension.value, expected_extension.value); }; fail("en-z"sv); @@ -259,9 +259,9 @@ TEST_CASE(parse_unicode_locale_id_with_other_extension) fail("en-z-aaa-a"sv); fail("en-0-aaa-a"sv); - pass("en-z-aa", { 'z', { "aa"sv } }); - pass("en-z-aa-bbb", { 'z', { "aa"sv, "bbb"sv } }); - pass("en-z-aa-bbb-cccccccc", { 'z', { "aa"sv, "bbb"sv, "cccccccc"sv } }); + pass("en-z-aa", { 'z', "aa"sv }); + pass("en-z-aa-bbb", { 'z', "aa-bbb"sv }); + pass("en-z-aa-bbb-cccccccc", { 'z', "aa-bbb-cccccccc"sv }); } TEST_CASE(parse_unicode_locale_id_with_private_use_extension) @@ -320,8 +320,12 @@ TEST_CASE(canonicalize_unicode_locale_id) test("EN-U-CCC-BBB-2K-AAA-1K-BBB"sv, "en-u-bbb-ccc-1k-bbb-2k-aaa"sv); test("en-u-1k-true"sv, "en-u-1k"sv); test("EN-U-1K-TRUE"sv, "en-u-1k"sv); + test("en-u-1k-true-abcd"sv, "en-u-1k-true-abcd"sv); + test("EN-U-1K-TRUE-ABCD"sv, "en-u-1k-true-abcd"sv); test("en-u-kb-yes"sv, "en-u-kb"sv); test("EN-U-KB-YES"sv, "en-u-kb"sv); + test("en-u-kb-yes-abcd"sv, "en-u-kb-yes-abcd"sv); + test("EN-U-KB-YES-ABCD"sv, "en-u-kb-yes-abcd"sv); test("en-u-ka-yes"sv, "en-u-ka-yes"sv); test("EN-U-KA-YES"sv, "en-u-ka-yes"sv); test("en-u-1k-names"sv, "en-u-1k-names"sv); diff --git a/Userland/Libraries/LibJS/Runtime/Intl/Locale.cpp b/Userland/Libraries/LibJS/Runtime/Intl/Locale.cpp index 4136c8419ad..23c7be195d8 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/Locale.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/Locale.cpp @@ -4,7 +4,6 @@ * SPDX-License-Identifier: BSD-2-Clause */ -#include #include #include #include @@ -27,29 +26,23 @@ Locale::Locale(Unicode::LocaleID const& locale_id, Object& prototype) { set_locale(locale_id.to_string()); - auto join_keyword_types = [](auto const& types) { - StringBuilder builder; - builder.join('-', types); - return builder.build(); - }; - for (auto const& extension : locale_id.extensions) { if (!extension.has()) continue; for (auto const& keyword : extension.get().keywords) { if (keyword.key == "ca"sv) { - set_calendar(join_keyword_types(keyword.types)); + set_calendar(keyword.value); } else if (keyword.key == "co"sv) { - set_collation(join_keyword_types(keyword.types)); + set_collation(keyword.value); } else if (keyword.key == "hc"sv) { - set_hour_cycle(join_keyword_types(keyword.types)); + set_hour_cycle(keyword.value); } else if (keyword.key == "kf"sv) { - set_case_first(join_keyword_types(keyword.types)); + set_case_first(keyword.value); } else if (keyword.key == "kn"sv) { - set_numeric(keyword.types.is_empty()); + set_numeric(keyword.value.is_empty()); } else if (keyword.key == "nu"sv) { - set_numbering_system(join_keyword_types(keyword.types)); + set_numbering_system(keyword.value); } } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp b/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp index 3ee98bb1b64..51ddb8eed5f 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -193,9 +192,7 @@ static LocaleAndKeys apply_unicode_extension_to_tag(StringView tag, LocaleAndKey entry = &(*it); // ii. Let value be entry.[[Value]]. - StringBuilder builder; - builder.join('-', entry->types); - value = builder.build(); + value = entry->value; } // c. Else, // i. Let entry be empty. @@ -213,12 +210,12 @@ static LocaleAndKeys apply_unicode_extension_to_tag(StringView tag, LocaleAndKey // iii. If entry is not empty, then if (entry != nullptr) { // 1. Set entry.[[Value]] to value. - entry->types = value->split('-'); + entry->value = *value; } // iv. Else, else { // 1. Append the Record { [[Key]]: key, [[Value]]: value } to keywords. - keywords.append({ key, value->split('-') }); + keywords.append({ key, *value }); } } diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp index 11263eb2a72..55c89700b0f 100644 --- a/Userland/Libraries/LibUnicode/Locale.cpp +++ b/Userland/Libraries/LibUnicode/Locale.cpp @@ -211,6 +211,7 @@ static Optional parse_unicode_locale_extension(GenericLexer& le case ParseState::ParsingKeyword: { // keyword = key (sep type)? Keyword keyword { .key = *segment }; + Vector keyword_values; if (!is_key(*segment)) { lexer.retreat(segment->length() + 1); @@ -227,9 +228,13 @@ static Optional parse_unicode_locale_extension(GenericLexer& le break; } - keyword.types.append(*type); + keyword_values.append(*type); } + StringBuilder builder; + builder.join('-', keyword_values); + keyword.value = builder.build(); + locale_extension.keywords.append(move(keyword)); break; } @@ -283,6 +288,7 @@ static Optional parse_transformed_extension(GenericLexer& case ParseState::ParsingField: { // tfield = tkey tvalue; TransformedField field { .key = *segment }; + Vector field_values; if (!is_transformed_key(*segment)) { lexer.retreat(segment->length() + 1); @@ -299,12 +305,16 @@ static Optional parse_transformed_extension(GenericLexer& break; } - field.values.append(*value); + field_values.append(*value); } - if (field.values.is_empty()) + if (field_values.is_empty()) return {}; + StringBuilder builder; + builder.join('-', field_values); + field.value = builder.build(); + transformed_extension.fields.append(move(field)); break; } @@ -325,6 +335,7 @@ static Optional parse_other_extension(char key, GenericLexer& le // // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; OtherExtension other_extension { .key = key }; + Vector other_values; if (!is_ascii_alphanumeric(key) || (key == 'x') || (key == 'X')) return {}; @@ -339,11 +350,16 @@ static Optional parse_other_extension(char key, GenericLexer& le break; } - other_extension.values.append(*segment); + other_values.append(*segment); } - if (other_extension.values.is_empty()) + if (other_values.is_empty()) return {}; + + StringBuilder builder; + builder.join('-', other_values); + other_extension.value = builder.build(); + return other_extension; } @@ -469,8 +485,11 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/transform.xml // // There isn't yet a counterpart in the JSON export. See: https://unicode-org.atlassian.net/browse/CLDR-14571 - if ((key == "ca"sv) && (value == "islamicc"sv)) { - value = "islamic-civil"sv; + if (key == "ca"sv) { + if (value == "islamicc"sv) + value = "islamic-civil"sv; + else if (value == "ethiopic-amete-alem"sv) + value = "ethioaa"sv; } else if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) { value = "true"sv; } else if (key == "ks"sv) { @@ -521,20 +540,6 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu } } -static void perform_hard_coded_key_multi_value_substitutions(String const& key, Vector& values) -{ - // Similar to perform_hard_coded_key_value_substitutions, some aliases depend on multiple - // variants being present in the original locale. Those are canonicalized separately here. - // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml - if ((key != "ca"sv) || (values.size() != 3)) - return; - - static Vector ethiopic_amete_alem { "ethiopic"sv, "amete"sv, "alem"sv }; - - if (values == ethiopic_amete_alem) - values = { "ethioaa"sv }; -} - static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id) { auto canonicalize_language = [](LanguageID& language_id, bool force_lowercase) { @@ -589,39 +594,32 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id) } }; - auto canonicalize_key_value_list = [&](auto& key, auto& values, bool remove_true_values) { + auto canonicalize_key_value_list = [&](auto& key, auto& value, bool remove_true_values) { key = key.to_lowercase(); + value = value.to_lowercase(); - auto raw_values = move(values); + perform_hard_coded_key_value_substitutions(key, value); - for (auto& value : raw_values) { - value = value.to_lowercase(); - perform_hard_coded_key_value_substitutions(key, value); - - // Note: The spec says to remove "true" type and tfield values but that is believed to be a bug in the spec - // because, for tvalues, that would result in invalid syntax: - // https://unicode-org.atlassian.net/browse/CLDR-14318 - // This has also been noted by test262: - // https://github.com/tc39/test262/blob/18bb955771669541c56c28748603f6afdb2e25ff/test/intl402/Intl/getCanonicalLocales/transformed-ext-canonical.js - if (remove_true_values && (value == "true"sv)) - continue; - - if (key.is_one_of("sd"sv, "rg"sv)) { - if (auto alias = resolve_subdivision_alias(value); alias.has_value()) { - auto aliases = alias->split_view(' '); - - // FIXME: Subdivision subtags do not appear in the CLDR likelySubtags.json file. - // Implement the spec's recommendation of using just the first alias for now, - // but we should determine if there's anything else needed here. - values.append(aliases[0].to_string()); - continue; - } - } - - values.append(move(value)); + // Note: The spec says to remove "true" type and tfield values but that is believed to be a bug in the spec + // because, for tvalues, that would result in invalid syntax: + // https://unicode-org.atlassian.net/browse/CLDR-14318 + // This has also been noted by test262: + // https://github.com/tc39/test262/blob/18bb955771669541c56c28748603f6afdb2e25ff/test/intl402/Intl/getCanonicalLocales/transformed-ext-canonical.js + if (remove_true_values && (value == "true"sv)) { + value = {}; + return; } - perform_hard_coded_key_multi_value_substitutions(key, values); + if (key.is_one_of("sd"sv, "rg"sv)) { + if (auto alias = resolve_subdivision_alias(value); alias.has_value()) { + auto aliases = alias->split_view(' '); + + // FIXME: Subdivision subtags do not appear in the CLDR likelySubtags.json file. + // Implement the spec's recommendation of using just the first alias for now, + // but we should determine if there's anything else needed here. + value = aliases[0].to_string(); + } + } }; canonicalize_language(locale_id.language_id, false); @@ -643,7 +641,7 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id) for (auto& attribute : ext.attributes) attribute = attribute.to_lowercase(); for (auto& keyword : ext.keywords) - canonicalize_key_value_list(keyword.key, keyword.types, true); + canonicalize_key_value_list(keyword.key, keyword.value, true); quick_sort(ext.attributes); quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; }); @@ -653,14 +651,13 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id) canonicalize_language(*ext.language, true); for (auto& field : ext.fields) - canonicalize_key_value_list(field.key, field.values, false); + canonicalize_key_value_list(field.key, field.value, false); quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; }); }, [&](OtherExtension& ext) { ext.key = static_cast(to_ascii_lowercase(ext.key)); - for (auto& value : ext.values) - value = value.to_lowercase(); + ext.value = ext.value.to_lowercase(); }); } @@ -674,7 +671,7 @@ Optional canonicalize_unicode_locale_id(LocaleID& locale_id) StringBuilder builder; auto append_sep_and_string = [&](Optional const& string) { - if (!string.has_value()) + if (!string.has_value() || string->is_empty()) return; builder.appendff("-{}", *string); }; @@ -690,13 +687,6 @@ Optional canonicalize_unicode_locale_id(LocaleID& locale_id) for (auto const& variant : locale_id.language_id.variants) append_sep_and_string(variant); - auto append_key_value_list = [&](auto const& key, auto& values) { - append_sep_and_string(key); - - for (auto& value : values) - append_sep_and_string(value); - }; - for (auto const& extension : locale_id.extensions) { extension.visit( [&](LocaleExtension const& ext) { @@ -704,8 +694,10 @@ Optional canonicalize_unicode_locale_id(LocaleID& locale_id) for (auto const& attribute : ext.attributes) append_sep_and_string(attribute); - for (auto const& keyword : ext.keywords) - append_key_value_list(keyword.key, keyword.types); + for (auto const& keyword : ext.keywords) { + append_sep_and_string(keyword.key); + append_sep_and_string(keyword.value); + } }, [&](TransformedExtension const& ext) { builder.append("-t"sv); @@ -718,13 +710,14 @@ Optional canonicalize_unicode_locale_id(LocaleID& locale_id) append_sep_and_string(variant); } - for (auto const& field : ext.fields) - append_key_value_list(field.key, field.values); + for (auto const& field : ext.fields) { + append_sep_and_string(field.key); + append_sep_and_string(field.value); + } }, [&](OtherExtension const& ext) { builder.appendff("-{:c}", to_ascii_lowercase(ext.key)); - for (auto const& value : ext.values) - append_sep_and_string(value); + append_sep_and_string(ext.value); }); } @@ -947,19 +940,13 @@ String LocaleID::to_string() const StringBuilder builder; auto append_segment = [&](Optional const& segment) { - if (!segment.has_value()) + if (!segment.has_value() || segment->is_empty()) return; if (!builder.is_empty()) builder.append('-'); builder.append(*segment); }; - auto append_key_value_list = [&](auto const& key, auto const& values) { - append_segment(key); - for (auto const& value : values) - append_segment(value); - }; - append_segment(language_id.to_string()); for (auto const& extension : extensions) { @@ -968,20 +955,23 @@ String LocaleID::to_string() const builder.append("-u"sv); for (auto const& attribute : ext.attributes) append_segment(attribute); - for (auto const& keyword : ext.keywords) - append_key_value_list(keyword.key, keyword.types); + for (auto const& keyword : ext.keywords) { + append_segment(keyword.key); + append_segment(keyword.value); + } }, [&](TransformedExtension const& ext) { builder.append("-t"sv); if (ext.language.has_value()) append_segment(ext.language->to_string()); - for (auto const& field : ext.fields) - append_key_value_list(field.key, field.values); + for (auto const& field : ext.fields) { + append_segment(field.key); + append_segment(field.value); + } }, [&](OtherExtension const& ext) { builder.appendff("-{}", ext.key); - for (auto const& value : ext.values) - append_segment(value); + append_segment(ext.value); }); } diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h index 428d84714b3..3f28d88227d 100644 --- a/Userland/Libraries/LibUnicode/Locale.h +++ b/Userland/Libraries/LibUnicode/Locale.h @@ -29,7 +29,7 @@ struct LanguageID { struct Keyword { String key {}; - Vector types {}; + String value {}; }; struct LocaleExtension { @@ -38,8 +38,8 @@ struct LocaleExtension { }; struct TransformedField { - String key; - Vector values {}; + String key {}; + String value {}; }; struct TransformedExtension { @@ -49,7 +49,7 @@ struct TransformedExtension { struct OtherExtension { char key {}; - Vector values {}; + String value {}; }; using Extension = Variant;