Browse Source

LibUnicode: Parse locale private use extensions

Timothy Flynn 3 years ago
parent
commit
30855e6663

+ 24 - 0
Tests/LibUnicode/TestUnicodeLocale.cpp

@@ -246,6 +246,30 @@ TEST_CASE(parse_unicode_locale_id_with_other_extension)
     pass("en-z-aa-bbb-cccccccc", { 'z', { "aa"sv, "bbb"sv, "cccccccc"sv } });
 }
 
+// Exercises parsing of the private use extension (the values after the `x` singleton) of a locale ID.
+TEST_CASE(parse_unicode_locale_id_with_private_use_extension)
+{
+    // Expects the parser to reject the given locale string.
+    auto fail = [](StringView locale) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        EXPECT(!locale_id.has_value());
+    };
+    // Expects the parser to accept the locale string and to report exactly the given
+    // private use values, in order.
+    auto pass = [](StringView locale, Vector<StringView> const& expected_extension) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        VERIFY(locale_id.has_value());
+        EXPECT_EQ(locale_id->private_use_extensions, expected_extension);
+    };
+
+    // The singleton must be followed by at least one value, a trailing separator is not
+    // allowed, and a value longer than 8 characters is invalid.
+    fail("en-x"sv);
+    fail("en-x-"sv);
+    fail("en-x-aaaaaaaaa"sv);
+    fail("en-x-aaa-"sv);
+    fail("en-x-aaa-aaaaaaaaa"sv);
+
+    // Values of 1 to 8 characters are accepted. Once the private use extension has started,
+    // a later "x" is reported as just another value (last case).
+    pass("en-x-a", { "a"sv });
+    pass("en-x-aaaaaaaa", { "aaaaaaaa"sv });
+    pass("en-x-aaa-bbb", { "aaa"sv, "bbb"sv });
+    pass("en-x-aaa-x-bbb", { "aaa"sv, "x"sv, "bbb"sv });
+}
+
 TEST_CASE(canonicalize_unicode_locale_id)
 {
     auto test = [](StringView locale, StringView expected_canonical_locale) {

+ 40 - 1
Userland/Libraries/LibUnicode/Locale.cpp

@@ -400,6 +400,45 @@ static Optional<Extension> parse_extension(GenericLexer& lexer)
     return {};
 }
 
+// Tries to parse a private use extension at the lexer's current position.
+//
+// Returns the values that follow the `x`/`X` singleton, in order. If the upcoming input is not a
+// private use extension, returns an empty vector; when a header segment was consumed, the lexer
+// is rewound to the position it had on entry.
+static Vector<StringView> parse_private_use_extensions(GenericLexer& lexer)
+{
+    // https://unicode.org/reports/tr35/#pu_extensions
+    //
+    // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
+    size_t starting_position = lexer.tell();
+
+    // The first segment is the candidate singleton; without one there is nothing to parse.
+    auto header = consume_next_segment(lexer);
+    if (!header.has_value())
+        return {};
+
+    // Collects consecutive valid values, stopping at the first missing or invalid segment.
+    auto parse_values = [&]() -> Vector<StringView> {
+        Vector<StringView> extensions;
+
+        while (true) {
+            auto segment = consume_next_segment(lexer);
+            if (!segment.has_value())
+                break;
+
+            // Each value must be 1 to 8 ASCII alphanumeric characters. Otherwise un-read it and stop.
+            // NOTE(review): the "+ 1" presumably covers the separator consumed together with the
+            // segment - confirm against consume_next_segment().
+            if ((segment->length() < 1) || (segment->length() > 8) || !all_of(*segment, is_ascii_alphanumeric)) {
+                lexer.retreat(segment->length() + 1);
+                break;
+            }
+
+            extensions.append(*segment);
+        }
+
+        return extensions;
+    };
+
+    // The header must be exactly `x` or `X`, and the grammar requires at least one value after it.
+    if ((header->length() == 1) && (((*header)[0] == 'x') || ((*header)[0] == 'X'))) {
+        if (auto extensions = parse_values(); !extensions.is_empty())
+            return extensions;
+    }
+
+    // Not a private use extension: undo everything consumed since entry.
+    lexer.retreat(lexer.tell() - starting_position);
+    return {};
+}
+
 Optional<LanguageID> parse_unicode_language_id(StringView language)
 {
     GenericLexer lexer { language };
@@ -433,7 +472,7 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
         locale_id.extensions.append(extension.release_value());
     }
 
-    // FIXME: Handle pu_extensions.
+    locale_id.private_use_extensions = parse_private_use_extensions(lexer);
 
     if (!lexer.is_eof())
         return {};

+ 1 - 0
Userland/Libraries/LibUnicode/Locale.h

@@ -53,6 +53,7 @@ using Extension = Variant<LocaleExtension, TransformedExtension, OtherExtension>
 struct LocaleID {
     LanguageID language_id {};
     Vector<Extension> extensions {};
+    // Values following the private use singleton, in the order they appear (e.g. "en-x-aaa-bbb"
+    // yields "aaa" and "bbb"). Left empty when the locale has no private use extension.
+    Vector<StringView> private_use_extensions {};
 };
 
 // Note: These methods only verify that the provided strings match the EBNF grammar of the