From 4f2bcebe749a57f1367a9bd7d58b249a48981e23 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 8 Sep 2021 15:25:35 -0400 Subject: [PATCH] LibUnicode+LibJS: Store locale keyword values as a single string Previously, LibUnicode would store the values of a keyword as a Vector. For example, the locale "en-u-ca-abc-def" would have its keyword "ca" stored as {"abc, "def"}. Then, canonicalization would occur on each of the elements in that Vector. This is incorrect because, for example, the keyword value "true" should only be dropped if that is the entire value. That is, the canonical form of "en-u-kb-true" is "en-u-kb", but "en-u-kb-abc-true" does not change for canonicalization. However, we would canonicalize that locale as "en-u-kb-abc". --- Tests/LibUnicode/TestUnicodeLocale.cpp | 34 ++-- .../Libraries/LibJS/Runtime/Intl/Locale.cpp | 19 +-- .../LibJS/Runtime/Intl/LocaleConstructor.cpp | 9 +- Userland/Libraries/LibUnicode/Locale.cpp | 150 ++++++++---------- Userland/Libraries/LibUnicode/Locale.h | 8 +- 5 files changed, 102 insertions(+), 118 deletions(-) diff --git a/Tests/LibUnicode/TestUnicodeLocale.cpp b/Tests/LibUnicode/TestUnicodeLocale.cpp index 53ffddbf2cc..a82b9eae2b8 100644 --- a/Tests/LibUnicode/TestUnicodeLocale.cpp +++ b/Tests/LibUnicode/TestUnicodeLocale.cpp @@ -138,7 +138,7 @@ TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension) auto const& expected_keyword = expected_extension.keywords[i]; EXPECT_EQ(actual_keyword.key, expected_keyword.key); - EXPECT_EQ(actual_keyword.types, expected_keyword.types); + EXPECT_EQ(actual_keyword.value, expected_keyword.value); } }; @@ -153,15 +153,15 @@ TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension) fail("en-u-xxxxx-"sv); fail("en-u-xxxxxxxxx"sv); - pass("en-u-xx"sv, { {}, { { "xx"sv, {} } } }); + pass("en-u-xx"sv, { {}, { { "xx"sv, ""sv } } }); pass("en-u-xx-yyyy"sv, { {}, { { "xx"sv, { "yyyy"sv } } } }); - pass("en-u-xx-yyyy-zzzz"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } } } }); - pass("en-u-xx-yyyy-zzzz-aa"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } }, { "aa"sv, {} } } }); + pass("en-u-xx-yyyy-zzzz"sv, { {}, { { "xx"sv, "yyyy-zzzz"sv } } }); + pass("en-u-xx-yyyy-zzzz-aa"sv, { {}, { { "xx"sv, "yyyy-zzzz"sv }, { "aa"sv, ""sv } } }); pass("en-u-xxx"sv, { { "xxx"sv }, {} }); pass("en-u-fff-gggg"sv, { { "fff"sv, "gggg"sv }, {} }); - pass("en-u-fff-xx"sv, { { "fff"sv }, { { "xx"sv, {} } } }); - pass("en-u-fff-xx-yyyy"sv, { { "fff"sv }, { { "xx"sv, { "yyyy"sv } } } }); - pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, { "yyyy"sv } } } }); + pass("en-u-fff-xx"sv, { { "fff"sv }, { { "xx"sv, ""sv } } }); + pass("en-u-fff-xx-yyyy"sv, { { "fff"sv }, { { "xx"sv, "yyyy"sv } } }); + pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, "yyyy"sv } } }); } TEST_CASE(parse_unicode_locale_id_with_transformed_extension) @@ -192,7 +192,7 @@ TEST_CASE(parse_unicode_locale_id_with_transformed_extension) auto const& expected_field = expected_extension.fields[i]; EXPECT_EQ(actual_field.key, expected_field.key); - EXPECT_EQ(actual_field.values, expected_field.values); + EXPECT_EQ(actual_field.value, expected_field.value); } }; @@ -225,9 +225,9 @@ TEST_CASE(parse_unicode_locale_id_with_transformed_extension) pass("en-t-en-us-posix"sv, { Unicode::LanguageID { false, "en"sv, {}, "us"sv, { "posix"sv } }, {} }); pass("en-t-en-latn-us-posix"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, "us"sv, { "posix"sv } }, {} }); pass("en-t-k0-aaa"sv, { {}, { { "k0"sv, { "aaa"sv } } } }); - pass("en-t-k0-aaa-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv, "bbbb" } } } }); - pass("en-t-k0-aaa-k1-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv } }, { "k1"sv, { "bbbb"sv } } } }); - pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, { "aaa"sv } } } }); + pass("en-t-k0-aaa-bbbb"sv, { {}, { { "k0"sv, "aaa-bbbb"sv } } }); + pass("en-t-k0-aaa-k1-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv } }, { "k1"sv, "bbbb"sv } } }); + pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, "aaa"sv } } }); } TEST_CASE(parse_unicode_locale_id_with_other_extension) @@ -243,7 +243,7 @@ TEST_CASE(parse_unicode_locale_id_with_other_extension) auto const& actual_extension = locale_id->extensions[0].get(); EXPECT_EQ(actual_extension.key, expected_extension.key); - EXPECT_EQ(actual_extension.values, expected_extension.values); + EXPECT_EQ(actual_extension.value, expected_extension.value); }; fail("en-z"sv); @@ -259,9 +259,9 @@ TEST_CASE(parse_unicode_locale_id_with_other_extension) fail("en-z-aaa-a"sv); fail("en-0-aaa-a"sv); - pass("en-z-aa", { 'z', { "aa"sv } }); - pass("en-z-aa-bbb", { 'z', { "aa"sv, "bbb"sv } }); - pass("en-z-aa-bbb-cccccccc", { 'z', { "aa"sv, "bbb"sv, "cccccccc"sv } }); + pass("en-z-aa", { 'z', "aa"sv }); + pass("en-z-aa-bbb", { 'z', "aa-bbb"sv }); + pass("en-z-aa-bbb-cccccccc", { 'z', "aa-bbb-cccccccc"sv }); } TEST_CASE(parse_unicode_locale_id_with_private_use_extension) @@ -320,8 +320,12 @@ TEST_CASE(canonicalize_unicode_locale_id) test("EN-U-CCC-BBB-2K-AAA-1K-BBB"sv, "en-u-bbb-ccc-1k-bbb-2k-aaa"sv); test("en-u-1k-true"sv, "en-u-1k"sv); test("EN-U-1K-TRUE"sv, "en-u-1k"sv); + test("en-u-1k-true-abcd"sv, "en-u-1k-true-abcd"sv); + test("EN-U-1K-TRUE-ABCD"sv, "en-u-1k-true-abcd"sv); test("en-u-kb-yes"sv, "en-u-kb"sv); test("EN-U-KB-YES"sv, "en-u-kb"sv); + test("en-u-kb-yes-abcd"sv, "en-u-kb-yes-abcd"sv); + test("EN-U-KB-YES-ABCD"sv, "en-u-kb-yes-abcd"sv); test("en-u-ka-yes"sv, "en-u-ka-yes"sv); test("EN-U-KA-YES"sv, "en-u-ka-yes"sv); test("en-u-1k-names"sv, "en-u-1k-names"sv); diff --git a/Userland/Libraries/LibJS/Runtime/Intl/Locale.cpp b/Userland/Libraries/LibJS/Runtime/Intl/Locale.cpp index 4136c8419ad..23c7be195d8 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/Locale.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/Locale.cpp @@ -4,7 +4,6 @@ * SPDX-License-Identifier: BSD-2-Clause */ -#include #include #include #include @@ -27,29 +26,23 @@ Locale::Locale(Unicode::LocaleID const& locale_id, Object& prototype) { set_locale(locale_id.to_string()); - auto join_keyword_types = [](auto const& types) { - StringBuilder builder; - builder.join('-', types); - return builder.build(); - }; - for (auto const& extension : locale_id.extensions) { if (!extension.has()) continue; for (auto const& keyword : extension.get().keywords) { if (keyword.key == "ca"sv) { - set_calendar(join_keyword_types(keyword.types)); + set_calendar(keyword.value); } else if (keyword.key == "co"sv) { - set_collation(join_keyword_types(keyword.types)); + set_collation(keyword.value); } else if (keyword.key == "hc"sv) { - set_hour_cycle(join_keyword_types(keyword.types)); + set_hour_cycle(keyword.value); } else if (keyword.key == "kf"sv) { - set_case_first(join_keyword_types(keyword.types)); + set_case_first(keyword.value); } else if (keyword.key == "kn"sv) { - set_numeric(keyword.types.is_empty()); + set_numeric(keyword.value.is_empty()); } else if (keyword.key == "nu"sv) { - set_numbering_system(join_keyword_types(keyword.types)); + set_numbering_system(keyword.value); } } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp b/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp index 3ee98bb1b64..51ddb8eed5f 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -193,9 +192,7 @@ static LocaleAndKeys apply_unicode_extension_to_tag(StringView tag, LocaleAndKey entry = &(*it); // ii. Let value be entry.[[Value]]. - StringBuilder builder; - builder.join('-', entry->types); - value = builder.build(); + value = entry->value; } // c. Else, // i. Let entry be empty. @@ -213,12 +210,12 @@ static LocaleAndKeys apply_unicode_extension_to_tag(StringView tag, LocaleAndKey // iii. If entry is not empty, then if (entry != nullptr) { // 1. Set entry.[[Value]] to value. - entry->types = value->split('-'); + entry->value = *value; } // iv. Else, else { // 1. Append the Record { [[Key]]: key, [[Value]]: value } to keywords. - keywords.append({ key, value->split('-') }); + keywords.append({ key, *value }); } } diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp index 11263eb2a72..55c89700b0f 100644 --- a/Userland/Libraries/LibUnicode/Locale.cpp +++ b/Userland/Libraries/LibUnicode/Locale.cpp @@ -211,6 +211,7 @@ static Optional parse_unicode_locale_extension(GenericLexer& le case ParseState::ParsingKeyword: { // keyword = key (sep type)? Keyword keyword { .key = *segment }; + Vector keyword_values; if (!is_key(*segment)) { lexer.retreat(segment->length() + 1); @@ -227,9 +228,13 @@ static Optional parse_unicode_locale_extension(GenericLexer& le break; } - keyword.types.append(*type); + keyword_values.append(*type); } + StringBuilder builder; + builder.join('-', keyword_values); + keyword.value = builder.build(); + locale_extension.keywords.append(move(keyword)); break; } @@ -283,6 +288,7 @@ static Optional parse_transformed_extension(GenericLexer& case ParseState::ParsingField: { // tfield = tkey tvalue; TransformedField field { .key = *segment }; + Vector field_values; if (!is_transformed_key(*segment)) { lexer.retreat(segment->length() + 1); @@ -299,12 +305,16 @@ static Optional parse_transformed_extension(GenericLexer& break; } - field.values.append(*value); + field_values.append(*value); } - if (field.values.is_empty()) + if (field_values.is_empty()) return {}; + StringBuilder builder; + builder.join('-', field_values); + field.value = builder.build(); + transformed_extension.fields.append(move(field)); break; } @@ -325,6 +335,7 @@ static Optional parse_other_extension(char key, GenericLexer& le // // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; OtherExtension other_extension { .key = key }; + Vector other_values; if (!is_ascii_alphanumeric(key) || (key == 'x') || (key == 'X')) return {}; @@ -339,11 +350,16 @@ static Optional parse_other_extension(char key, GenericLexer& le break; } - other_extension.values.append(*segment); + other_values.append(*segment); } - if (other_extension.values.is_empty()) + if (other_values.is_empty()) return {}; + + StringBuilder builder; + builder.join('-', other_values); + other_extension.value = builder.build(); + return other_extension; } @@ -469,8 +485,11 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/transform.xml // // There isn't yet a counterpart in the JSON export. See: https://unicode-org.atlassian.net/browse/CLDR-14571 - if ((key == "ca"sv) && (value == "islamicc"sv)) { - value = "islamic-civil"sv; + if (key == "ca"sv) { + if (value == "islamicc"sv) + value = "islamic-civil"sv; + else if (value == "ethiopic-amete-alem"sv) + value = "ethioaa"sv; } else if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) { value = "true"sv; } else if (key == "ks"sv) { @@ -521,20 +540,6 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu } } -static void perform_hard_coded_key_multi_value_substitutions(String const& key, Vector& values) -{ - // Similar to perform_hard_coded_key_value_substitutions, some aliases depend on multiple - // variants being present in the original locale. Those are canonicalized separately here. - // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml - if ((key != "ca"sv) || (values.size() != 3)) - return; - - static Vector ethiopic_amete_alem { "ethiopic"sv, "amete"sv, "alem"sv }; - - if (values == ethiopic_amete_alem) - values = { "ethioaa"sv }; -} - static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id) { auto canonicalize_language = [](LanguageID& language_id, bool force_lowercase) { @@ -589,39 +594,32 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id) } }; - auto canonicalize_key_value_list = [&](auto& key, auto& values, bool remove_true_values) { + auto canonicalize_key_value_list = [&](auto& key, auto& value, bool remove_true_values) { key = key.to_lowercase(); + value = value.to_lowercase(); - auto raw_values = move(values); + perform_hard_coded_key_value_substitutions(key, value); - for (auto& value : raw_values) { - value = value.to_lowercase(); - perform_hard_coded_key_value_substitutions(key, value); - - // Note: The spec says to remove "true" type and tfield values but that is believed to be a bug in the spec - // because, for tvalues, that would result in invalid syntax: - // https://unicode-org.atlassian.net/browse/CLDR-14318 - // This has also been noted by test262: - // https://github.com/tc39/test262/blob/18bb955771669541c56c28748603f6afdb2e25ff/test/intl402/Intl/getCanonicalLocales/transformed-ext-canonical.js - if (remove_true_values && (value == "true"sv)) - continue; - - if (key.is_one_of("sd"sv, "rg"sv)) { - if (auto alias = resolve_subdivision_alias(value); alias.has_value()) { - auto aliases = alias->split_view(' '); - - // FIXME: Subdivision subtags do not appear in the CLDR likelySubtags.json file. - // Implement the spec's recommendation of using just the first alias for now, - // but we should determine if there's anything else needed here. - values.append(aliases[0].to_string()); - continue; - } - } - - values.append(move(value)); + // Note: The spec says to remove "true" type and tfield values but that is believed to be a bug in the spec + // because, for tvalues, that would result in invalid syntax: + // https://unicode-org.atlassian.net/browse/CLDR-14318 + // This has also been noted by test262: + // https://github.com/tc39/test262/blob/18bb955771669541c56c28748603f6afdb2e25ff/test/intl402/Intl/getCanonicalLocales/transformed-ext-canonical.js + if (remove_true_values && (value == "true"sv)) { + value = {}; + return; } - perform_hard_coded_key_multi_value_substitutions(key, values); + if (key.is_one_of("sd"sv, "rg"sv)) { + if (auto alias = resolve_subdivision_alias(value); alias.has_value()) { + auto aliases = alias->split_view(' '); + + // FIXME: Subdivision subtags do not appear in the CLDR likelySubtags.json file. + // Implement the spec's recommendation of using just the first alias for now, + // but we should determine if there's anything else needed here. + value = aliases[0].to_string(); + } + } }; canonicalize_language(locale_id.language_id, false); @@ -643,7 +641,7 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id) for (auto& attribute : ext.attributes) attribute = attribute.to_lowercase(); for (auto& keyword : ext.keywords) - canonicalize_key_value_list(keyword.key, keyword.types, true); + canonicalize_key_value_list(keyword.key, keyword.value, true); quick_sort(ext.attributes); quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; }); @@ -653,14 +651,13 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id) canonicalize_language(*ext.language, true); for (auto& field : ext.fields) - canonicalize_key_value_list(field.key, field.values, false); + canonicalize_key_value_list(field.key, field.value, false); quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; }); }, [&](OtherExtension& ext) { ext.key = static_cast(to_ascii_lowercase(ext.key)); - for (auto& value : ext.values) - value = value.to_lowercase(); + ext.value = ext.value.to_lowercase(); }); } @@ -674,7 +671,7 @@ Optional canonicalize_unicode_locale_id(LocaleID& locale_id) StringBuilder builder; auto append_sep_and_string = [&](Optional const& string) { - if (!string.has_value()) + if (!string.has_value() || string->is_empty()) return; builder.appendff("-{}", *string); }; @@ -690,13 +687,6 @@ Optional canonicalize_unicode_locale_id(LocaleID& locale_id) for (auto const& variant : locale_id.language_id.variants) append_sep_and_string(variant); - auto append_key_value_list = [&](auto const& key, auto& values) { - append_sep_and_string(key); - - for (auto& value : values) - append_sep_and_string(value); - }; - for (auto const& extension : locale_id.extensions) { extension.visit( [&](LocaleExtension const& ext) { @@ -704,8 +694,10 @@ Optional canonicalize_unicode_locale_id(LocaleID& locale_id) for (auto const& attribute : ext.attributes) append_sep_and_string(attribute); - for (auto const& keyword : ext.keywords) - append_key_value_list(keyword.key, keyword.types); + for (auto const& keyword : ext.keywords) { + append_sep_and_string(keyword.key); + append_sep_and_string(keyword.value); + } }, [&](TransformedExtension const& ext) { builder.append("-t"sv); @@ -718,13 +710,14 @@ Optional canonicalize_unicode_locale_id(LocaleID& locale_id) append_sep_and_string(variant); } - for (auto const& field : ext.fields) - append_key_value_list(field.key, field.values); + for (auto const& field : ext.fields) { + append_sep_and_string(field.key); + append_sep_and_string(field.value); + } }, [&](OtherExtension const& ext) { builder.appendff("-{:c}", to_ascii_lowercase(ext.key)); - for (auto const& value : ext.values) - append_sep_and_string(value); + append_sep_and_string(ext.value); }); } @@ -947,19 +940,13 @@ String LocaleID::to_string() const StringBuilder builder; auto append_segment = [&](Optional const& segment) { - if (!segment.has_value()) + if (!segment.has_value() || segment->is_empty()) return; if (!builder.is_empty()) builder.append('-'); builder.append(*segment); }; - auto append_key_value_list = [&](auto const& key, auto const& values) { - append_segment(key); - for (auto const& value : values) - append_segment(value); - }; - append_segment(language_id.to_string()); for (auto const& extension : extensions) { @@ -968,20 +955,23 @@ String LocaleID::to_string() const builder.append("-u"sv); for (auto const& attribute : ext.attributes) append_segment(attribute); - for (auto const& keyword : ext.keywords) - append_key_value_list(keyword.key, keyword.types); + for (auto const& keyword : ext.keywords) { + append_segment(keyword.key); + append_segment(keyword.value); + } }, [&](TransformedExtension const& ext) { builder.append("-t"sv); if (ext.language.has_value()) append_segment(ext.language->to_string()); - for (auto const& field : ext.fields) - append_key_value_list(field.key, field.values); + for (auto const& field : ext.fields) { + append_segment(field.key); + append_segment(field.value); + } }, [&](OtherExtension const& ext) { builder.appendff("-{}", ext.key); - for (auto const& value : ext.values) - append_segment(value); + append_segment(ext.value); }); } diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h index 428d84714b3..3f28d88227d 100644 --- a/Userland/Libraries/LibUnicode/Locale.h +++ b/Userland/Libraries/LibUnicode/Locale.h @@ -29,7 +29,7 @@ struct LanguageID { struct Keyword { String key {}; - Vector types {}; + String value {}; }; struct LocaleExtension { @@ -38,8 +38,8 @@ struct LocaleExtension { }; struct TransformedField { - String key; - Vector values {}; + String key {}; + String value {}; }; struct TransformedExtension { @@ -49,7 +49,7 @@ struct TransformedExtension { struct OtherExtension { char key {}; - Vector values {}; + String value {}; }; using Extension = Variant;