Sfoglia il codice sorgente

LibUnicode: Canonicalize calendar subtags

Calendar subtags are a bit of an odd-man-out in that we must match the
variants "ethiopic-amete-alem" in that order, without any other variant
in the locale. So a separate method is needed for this, and we now defer
sorting the variant list until after other canonicalization is done.
Timothy Flynn 3 anni fa
parent
commit
da89cf9afb

+ 10 - 0
Tests/LibUnicode/TestUnicodeLocale.cpp

@@ -320,6 +320,14 @@ TEST_CASE(canonicalize_unicode_locale_id)
     test("EN-U-TZ-HONGKONG"sv, "en-u-tz-hkhkg"sv);
     test("en-u-ta-hongkong"sv, "en-u-ta-hongkong"sv);
     test("EN-U-TA-HONGKONG"sv, "en-u-ta-hongkong"sv);
+    test("en-u-ca-ethiopic-amete-alem"sv, "en-u-ca-ethioaa"sv);
+    test("EN-U-CA-ETHIOPIC-AMETE-ALEM"sv, "en-u-ca-ethioaa"sv);
+    test("en-u-ca-alem-ethiopic-amete"sv, "en-u-ca-alem-ethiopic-amete"sv);
+    test("EN-U-CA-ALEM-ETHIOPIC-AMETE"sv, "en-u-ca-alem-ethiopic-amete"sv);
+    test("en-u-ca-ethiopic-amete-xxx-alem"sv, "en-u-ca-ethiopic-amete-xxx-alem"sv);
+    test("EN-U-CA-ETHIOPIC-AMETE-XXX-ALEM"sv, "en-u-ca-ethiopic-amete-xxx-alem"sv);
+    test("en-u-cb-ethiopic-amete-alem"sv, "en-u-cb-ethiopic-amete-alem"sv);
+    test("EN-U-CB-ETHIOPIC-AMETE-ALEM"sv, "en-u-cb-ethiopic-amete-alem"sv);
 
     test("en-t-en"sv, "en-t-en"sv);
     test("EN-T-EN"sv, "en-t-en"sv);
@@ -345,6 +353,8 @@ TEST_CASE(canonicalize_unicode_locale_id)
     test("EN-T-K1-IMPERIAL"sv, "en-t-k1-imperial"sv);
     test("en-t-k1-hongkong"sv, "en-t-k1-hongkong"sv);
     test("EN-T-K1-HONGKONG"sv, "en-t-k1-hongkong"sv);
+    test("en-t-k1-ethiopic-amete-alem"sv, "en-t-k1-ethiopic-amete-alem"sv);
+    test("EN-T-K1-ETHIOPIC-AMETE-ALEM"sv, "en-t-k1-ethiopic-amete-alem"sv);
 
     test("en-0-aaa"sv, "en-0-aaa"sv);
     test("EN-0-AAA"sv, "en-0-aaa"sv);

+ 25 - 6
Userland/Libraries/LibUnicode/Locale.cpp

@@ -483,6 +483,7 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
 static void perform_hard_coded_key_value_substitutions(String& key, String& value)
 {
     // FIXME: In the XML export of CLDR, there are some aliases defined in the following files:
+    // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml
     // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/collation.xml
     // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/measure.xml
     // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/timezone.xml
@@ -490,7 +491,9 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu
     //
     // There doesn't seem to be a counterpart in the JSON export. Since there aren't many such
     // aliases, until an XML parser is implemented, those aliases are implemented here.
-    if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) {
+    if ((key == "ca"sv) && (value == "islamicc"sv)) {
+        value = "islamic-civil"sv;
+    } else if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) {
         value = "true"sv;
     } else if (key == "ks"sv) {
         if (value == "primary"sv)
@@ -540,6 +543,20 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu
     }
 }
 
+static void perform_hard_coded_key_multi_value_substitutions(String const& key, Vector<String>& values)
+{
+    // Similar to perform_hard_coded_key_value_substitutions, some aliases depend on multiple
+    // variants being present in the original locale. Those are canonicalized separately here, by rewriting `values` in place.
+    // Alias source: https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml
+    if ((key != "ca"sv) || (values.size() != 3)) // The only alias handled here is for the "ca" key and has exactly three values.
+        return;
+
+    static Vector<String> ethiopic_amete_alem { "ethiopic"sv, "amete"sv, "alem"sv }; // Reference list, built once and reused across calls.
+
+    if (values == ethiopic_amete_alem) // Whole-list comparison; presumably element-wise and order-sensitive (confirm against Vector::operator==).
+        values = { "ethioaa"sv }; // Replace the three values with the single alias value.
+}
+
 static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
 {
     auto canonicalize_language = [](LanguageID& language_id, bool force_lowercase) {
@@ -626,6 +643,8 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
 
             values.append(move(value));
         }
+
+        perform_hard_coded_key_multi_value_substitutions(key, values);
     };
 
     canonicalize_language(locale_id.language_id, false);
@@ -644,22 +663,22 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
     for (auto& extension : locale_id.extensions) {
         extension.visit(
             [&](LocaleExtension& ext) {
-                quick_sort(ext.attributes);
-                quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; });
-
                 for (auto& attribute : ext.attributes)
                     attribute = attribute.to_lowercase();
                 for (auto& keyword : ext.keywords)
                     canonicalize_key_value_list(keyword.key, keyword.types, true);
+
+                quick_sort(ext.attributes);
+                quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; });
             },
             [&](TransformedExtension& ext) {
                 if (ext.language.has_value())
                     canonicalize_language(*ext.language, true);
 
-                quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; });
-
                 for (auto& field : ext.fields)
                     canonicalize_key_value_list(field.key, field.values, false);
+
+                quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; });
             },
             [&](OtherExtension& ext) {
                 ext.key = static_cast<char>(to_ascii_lowercase(ext.key));