LibUnicode: Canonicalize locale extensions

This commit is contained in:
Timothy Flynn 2021-08-28 13:19:26 -04:00 committed by Linus Groh
parent 671eaa0c59
commit 6f0cb52dc4
Notes: sideshowbarker 2024-07-18 05:04:25 +09:00
2 changed files with 104 additions and 2 deletions

View file

@ -277,7 +277,7 @@ TEST_CASE(canonicalize_unicode_locale_id)
VERIFY(locale_id.has_value());
auto canonical_locale = Unicode::canonicalize_unicode_locale_id(*locale_id);
EXPECT_EQ(canonical_locale, expected_canonical_locale);
EXPECT_EQ(*canonical_locale, expected_canonical_locale);
};
test("aaa"sv, "aaa"sv);
@ -287,4 +287,44 @@ TEST_CASE(canonicalize_unicode_locale_id)
test("aaa-bBBB-cC"sv, "aaa-Bbbb-CC"sv);
test("aaa-bbbb-cc-1234"sv, "aaa-Bbbb-CC-1234"sv);
test("aaa-bbbb-cc-ABCDE"sv, "aaa-Bbbb-CC-abcde"sv);
test("en-u-aa"sv, "en-u-aa"sv);
test("EN-U-AA"sv, "en-u-aa"sv);
test("en-u-aa-bbb"sv, "en-u-aa-bbb"sv);
test("EN-U-AA-BBB"sv, "en-u-aa-bbb"sv);
test("en-u-aa-ccc-bbb"sv, "en-u-aa-ccc-bbb"sv);
test("EN-U-AA-CCC-BBB"sv, "en-u-aa-ccc-bbb"sv);
test("en-u-ddd-bbb-ccc"sv, "en-u-bbb-ccc-ddd"sv);
test("EN-U-DDD-BBB-CCC"sv, "en-u-bbb-ccc-ddd"sv);
test("en-u-2k-aaa-1k-bbb"sv, "en-u-1k-bbb-2k-aaa"sv);
test("EN-U-2K-AAA-1K-BBB"sv, "en-u-1k-bbb-2k-aaa"sv);
test("en-u-ccc-bbb-2k-aaa-1k-bbb"sv, "en-u-bbb-ccc-1k-bbb-2k-aaa"sv);
test("EN-U-CCC-BBB-2K-AAA-1K-BBB"sv, "en-u-bbb-ccc-1k-bbb-2k-aaa"sv);
test("en-u-1k-true"sv, "en-u-1k"sv);
test("EN-U-1K-TRUE"sv, "en-u-1k"sv);
test("en-t-en"sv, "en-t-en"sv);
test("EN-T-EN"sv, "en-t-en"sv);
test("en-latn-t-en-latn"sv, "en-Latn-t-en-latn"sv);
test("EN-LATN-T-EN-LATN"sv, "en-Latn-t-en-latn"sv);
test("en-us-t-en-us"sv, "en-US-t-en-us"sv);
test("EN-US-T-EN-US"sv, "en-US-t-en-us"sv);
test("en-latn-us-t-en-latn-us"sv, "en-Latn-US-t-en-latn-us"sv);
test("EN-LATN-US-T-EN-LATN-US"sv, "en-Latn-US-t-en-latn-us"sv);
test("en-t-en-k2-bbb-k1-aaa"sv, "en-t-en-k1-aaa-k2-bbb"sv);
test("EN-T-EN-K2-BBB-K1-AAA"sv, "en-t-en-k1-aaa-k2-bbb"sv);
test("en-t-k1-true"sv, "en-t-k1-true"sv);
test("EN-T-K1-TRUE"sv, "en-t-k1-true"sv);
test("en-0-aaa"sv, "en-0-aaa"sv);
test("EN-0-AAA"sv, "en-0-aaa"sv);
test("en-0-bbb-aaa"sv, "en-0-bbb-aaa"sv);
test("EN-0-BBB-AAA"sv, "en-0-bbb-aaa"sv);
test("en-z-bbb-0-aaa"sv, "en-0-aaa-z-bbb"sv);
test("EN-Z-BBB-0-AAA"sv, "en-0-aaa-z-bbb"sv);
test("en-u-aa-t-en"sv, "en-t-en-u-aa"sv);
test("EN-U-AA-T-EN"sv, "en-t-en-u-aa"sv);
test("en-z-bbb-u-aa-t-en-0-aaa"sv, "en-0-aaa-t-en-u-aa-z-bbb"sv);
test("EN-Z-BBB-U-AA-T-EN-0-AAA"sv, "en-0-aaa-t-en-u-aa-z-bbb"sv);
}

View file

@ -518,7 +518,69 @@ Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)
for (auto const& variant : locale_id.language_id.variants)
append_sep_and_string(variant);
// FIXME: Handle extensions and pu_extensions.
// Order the extensions by their singleton character before serializing them.
// Unicode locale extensions map to 'u', transformed extensions map to 't', and
// any other extension maps to its own key converted to ASCII lowercase, so the
// comparison does not depend on the case of an OtherExtension's key.
quick_sort(locale_id.extensions, [](auto const& left, auto const& right) {
// Maps whichever alternative the extension currently holds to its singleton.
auto key = [](auto const& extension) {
return extension.visit(
[](LocaleExtension const&) { return 'u'; },
[](TransformedExtension const&) { return 't'; },
[](OtherExtension const& ext) { return static_cast<char>(to_ascii_lowercase(ext.key)); });
};
return key(left) < key(right);
});
// Appends `key` followed by each entry of `values` via append_sep_and_string.
// When `remove_true_values` is set, any value equal to "true" (compared
// case-insensitively) is skipped, so only the key is emitted for it.
auto append_key_value_list = [&](auto const& key, auto const& values, bool remove_true_values) {
append_sep_and_string(key);
for (auto const& type : values) {
// Note: The spec says to remove "true" type and tfield values but that is believed to be a bug in the spec
// because, for tvalues, that would result in invalid syntax:
// https://unicode-org.atlassian.net/browse/CLDR-14318
// This has also been noted by test262:
// https://github.com/tc39/test262/blob/18bb955771669541c56c28748603f6afdb2e25ff/test/intl402/Intl/getCanonicalLocales/transformed-ext-canonical.js
if (remove_true_values && type.equals_ignoring_case("true"sv))
continue;
append_sep_and_string(type);
}
};
// Serialize each extension in turn. The visitors take mutable references
// because they sort the extension's members in place before appending them.
for (auto& extension : locale_id.extensions) {
extension.visit(
// Unicode locale extension: "-u", then sorted attributes, then keywords sorted by key.
// NOTE(review): these sorts compare the stored strings as-is. If the parser does not
// normalize case, mixed-case input could be ordered differently from its lowercased
// form - confirm against the parser.
[&](LocaleExtension& ext) {
quick_sort(ext.attributes);
quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; });
builder.append("-u"sv);
for (auto const& attribute : ext.attributes)
append_sep_and_string(attribute);
// Keyword types equal to "true" are dropped (remove_true_values = true).
for (auto const& keyword : ext.keywords)
append_key_value_list(keyword.key, keyword.types, true);
},
// Transformed extension: "-t", then the optional language ID (language, script,
// region, sorted variants), then fields sorted by key.
[&](TransformedExtension& ext) {
quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; });
builder.append("-t"sv);
if (ext.language.has_value()) {
append_sep_and_string(ext.language->language);
append_sep_and_string(ext.language->script);
append_sep_and_string(ext.language->region);
quick_sort(ext.language->variants);
for (auto const& variant : ext.language->variants)
append_sep_and_string(variant);
}
// Field values equal to "true" are kept (remove_true_values = false).
for (auto const& field : ext.fields)
append_key_value_list(field.key, field.values, false);
},
// Any other extension: "-" plus the ASCII-lowercased key character, then the
// values in their existing order (they are not sorted here).
[&](OtherExtension& ext) {
builder.appendff("-{:c}", to_ascii_lowercase(ext.key));
for (auto const& value : ext.values)
append_sep_and_string(value);
});
}
// FIXME: Handle pu_extensions.
return builder.build();
}