Selaa lähdekoodia

LibUnicode: Parse locale extensions of the other extension form

Timothy Flynn 3 vuotta sitten
vanhempi
commit
29f76ef7c8

+ 34 - 0
Tests/LibUnicode/TestUnicodeLocale.cpp

@@ -212,6 +212,40 @@ TEST_CASE(parse_unicode_locale_id_with_transformed_extension)
     pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, { "aaa"sv } } } });
 }
 
+TEST_CASE(parse_unicode_locale_id_with_other_extension)
+{
+    auto fail = [](StringView locale) { // Expects the parser to reject `locale`.
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        EXPECT(!locale_id.has_value());
+    };
+    auto pass = [](StringView locale, Unicode::OtherExtension const& expected_extension) { // Expects exactly one OtherExtension equal to `expected_extension`.
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        VERIFY(locale_id.has_value());
+        EXPECT_EQ(locale_id->extensions.size(), 1u);
+
+        auto const& actual_extension = locale_id->extensions[0].get<Unicode::OtherExtension>();
+        EXPECT_EQ(actual_extension.key, expected_extension.key);
+        EXPECT_EQ(actual_extension.values, expected_extension.values);
+    };
+
+    fail("en-z"sv); // Rejected inputs: no value, trailing separator, 1-character value, 9-character value.
+    fail("en-0"sv);
+    fail("en-z-"sv);
+    fail("en-0-"sv);
+    fail("en-z-a"sv);
+    fail("en-0-a"sv);
+    fail("en-z-aaaaaaaaa"sv);
+    fail("en-0-aaaaaaaaa"sv);
+    fail("en-z-aaa-"sv);
+    fail("en-0-aaa-"sv);
+    fail("en-z-aaa-a"sv);
+    fail("en-0-aaa-a"sv);
+
+    pass("en-z-aa"sv, { 'z', { "aa"sv } }); // Accepted inputs: one or more values of 2 to 8 characters.
+    pass("en-z-aa-bbb"sv, { 'z', { "aa"sv, "bbb"sv } });
+    pass("en-z-aa-bbb-cccccccc"sv, { 'z', { "aa"sv, "bbb"sv, "cccccccc"sv } });
+}
+
 TEST_CASE(canonicalize_unicode_locale_id)
 {
     auto test = [](StringView locale, StringView expected_canonical_locale) {

+ 31 - 2
Userland/Libraries/LibUnicode/Locale.cpp

@@ -340,6 +340,34 @@ static Optional<TransformedExtension> parse_transformed_extension(GenericLexer&
     return transformed_extension;
 }
 
+static Optional<OtherExtension> parse_other_extension(char key, GenericLexer& lexer)
+{
+    // https://unicode.org/reports/tr35/#other_extensions
+    // Collects the value segments that follow `key` from `lexer`; returns an empty Optional if the key is not allowed or no value was collected.
+    // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
+    OtherExtension other_extension { .key = key };
+
+    if (!is_ascii_alphanumeric(key) || (key == 'x') || (key == 'X')) // Non-alphanumeric keys and 'x'/'X' are rejected here.
+        return {};
+
+    while (true) {
+        auto segment = consume_next_segment(lexer);
+        if (!segment.has_value()) // No further segment could be consumed.
+            break;
+
+        if ((segment->length() < 2) || (segment->length() > 8) || !all_of(*segment, is_ascii_alphanumeric)) {
+            lexer.retreat(segment->length() + 1); // Un-consume the non-matching segment; the extra 1 presumably covers its separator (TODO confirm against consume_next_segment).
+            break;
+        }
+
+        other_extension.values.append(*segment);
+    }
+
+    if (other_extension.values.is_empty()) // The grammar's `+` requires at least one value.
+        return {};
+    return other_extension;
+}
+
 static Optional<Extension> parse_extension(GenericLexer& lexer)
 {
     // https://unicode.org/reports/tr35/#extensions
@@ -348,7 +376,7 @@ static Optional<Extension> parse_extension(GenericLexer& lexer)
     size_t starting_position = lexer.tell();
 
     if (auto header = consume_next_segment(lexer); header.has_value() && (header->length() == 1)) {
-        switch ((*header)[0]) {
+        switch (char key = (*header)[0]) {
         case 'u':
         case 'U':
             if (auto extension = parse_unicode_locale_extension(lexer); extension.has_value())
@@ -362,7 +390,8 @@ static Optional<Extension> parse_extension(GenericLexer& lexer)
             break;
 
         default:
-            // FIXME: Handle other_extensions
+            if (auto extension = parse_other_extension(key, lexer); extension.has_value())
+                return Extension { extension.release_value() };
             break;
         }
     }

+ 6 - 1
Userland/Libraries/LibUnicode/Locale.h

@@ -43,7 +43,12 @@ struct TransformedExtension {
     Vector<TransformedField> fields {};
 };
 
-using Extension = Variant<LocaleExtension, TransformedExtension>;
+struct OtherExtension {
+    char key {}; // Single-character extension key; the parser takes it from a one-character header segment.
+    Vector<StringView> values {}; // Value segments that followed the key, in the order they were parsed.
+};
+
+using Extension = Variant<LocaleExtension, TransformedExtension, OtherExtension>;
 
 struct LocaleID {
     LanguageID language_id {};