Browse Source

LibUnicode: Parse locale extensions of the Unicode locale extension form

Timothy Flynn 3 years ago
parent
commit
eda92d15e4

+ 46 - 0
Tests/LibUnicode/TestUnicodeLocale.cpp

@@ -100,6 +100,52 @@ TEST_CASE(parse_unicode_locale_id)
     pass("aaa-bbbb-cc-1234-5678"sv, "aaa"sv, "bbbb"sv, "cc"sv, { "1234"sv, "5678"sv });
 }
 
+TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension)
+{
+    // Asserts that the parser rejects `locale`.
+    auto fail = [](StringView locale) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        EXPECT(!locale_id.has_value());
+    };
+
+    // Asserts that `locale` parses into exactly one extension of the Unicode locale extension form,
+    // and that its attributes and keywords match `expected_extension`.
+    auto pass = [](StringView locale, Unicode::LocaleExtension const& expected_extension) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        VERIFY(locale_id.has_value());
+
+        // EXPECT_EQ only records a failure and carries on, so stop here explicitly instead of
+        // indexing an extension that may not exist.
+        EXPECT_EQ(locale_id->extensions.size(), 1u);
+        if (locale_id->extensions.size() != 1u)
+            return;
+
+        auto const& actual_extension = locale_id->extensions[0].get<Unicode::LocaleExtension>();
+        EXPECT_EQ(actual_extension.attributes, expected_extension.attributes);
+
+        // Same reasoning as above: the loop below indexes both keyword lists with one index,
+        // which is only safe when their sizes agree.
+        EXPECT_EQ(actual_extension.keywords.size(), expected_extension.keywords.size());
+        if (actual_extension.keywords.size() != expected_extension.keywords.size())
+            return;
+
+        for (size_t i = 0; i < actual_extension.keywords.size(); ++i) {
+            auto const& actual_keyword = actual_extension.keywords[i];
+            auto const& expected_keyword = expected_extension.keywords[i];
+
+            EXPECT_EQ(actual_keyword.key, expected_keyword.key);
+            EXPECT_EQ(actual_keyword.types, expected_keyword.types);
+        }
+    };
+
+    // Missing, empty, too short or too long subtags after the "u" singleton.
+    fail("en-u"sv);
+    fail("en-u-"sv);
+    fail("en-u-x"sv);
+    fail("en-u-xx-"sv);
+    fail("en-u--xx"sv);
+    fail("en-u-xx-xxxxx-"sv);
+    fail("en-u-xx--xxxxx"sv);
+    fail("en-u-xx-xxxxxxxxx"sv);
+    fail("en-u-xxxxx-"sv);
+    fail("en-u-xxxxxxxxx"sv);
+
+    // Keywords only, attributes only, and attributes followed by keywords.
+    pass("en-u-xx"sv, { {}, { { "xx"sv, {} } } });
+    pass("en-u-xx-yyyy"sv, { {}, { { "xx"sv, { "yyyy"sv } } } });
+    pass("en-u-xx-yyyy-zzzz"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } } } });
+    pass("en-u-xx-yyyy-zzzz-aa"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } }, { "aa"sv, {} } } });
+    pass("en-u-xxx"sv, { { "xxx"sv }, {} });
+    pass("en-u-fff-gggg"sv, { { "fff"sv, "gggg"sv }, {} });
+    pass("en-u-fff-xx"sv, { { "fff"sv }, { { "xx"sv, {} } } });
+    pass("en-u-fff-xx-yyyy"sv, { { "fff"sv }, { { "xx"sv, { "yyyy"sv } } } });
+    pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, { "yyyy"sv } } } });
+}
+
 TEST_CASE(canonicalize_unicode_locale_id)
 {
     auto test = [](StringView locale, StringView expected_canonical_locale) {

+ 132 - 4
Userland/Libraries/LibUnicode/Locale.cpp

@@ -53,7 +53,32 @@ bool is_unicode_variant_subtag(StringView subtag)
     return false;
 }
 
-static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator)
+static bool is_key(StringView key)
+{
+    // key = alphanum alpha
+    //
+    // A key is exactly two characters: an ASCII alphanumeric followed by an ASCII letter.
+    constexpr size_t key_length = 2;
+
+    if (key.length() == key_length)
+        return is_ascii_alphanumeric(key[0]) && is_ascii_alpha(key[1]);
+    return false;
+}
+
+static bool is_single_type(StringView type)
+{
+    // type = alphanum{3,8} (sep alphanum{3,8})*
+    //
+    // Only one alphanum{3,8} run is validated here. Callers are responsible for consuming
+    // several consecutive runs when a keyword carries more than one.
+    auto const length = type.length();
+    bool const has_valid_length = (length >= 3) && (length <= 8);
+
+    return has_valid_length && all_of(type, is_ascii_alphanumeric);
+}
+
+static bool is_attribute(StringView type)
+{
+    // attribute = alphanum{3,8}
+    //
+    // Reject anything outside the 3..8 length window before looking at the characters.
+    if (type.length() < 3)
+        return false;
+    if (type.length() > 8)
+        return false;
+
+    return all_of(type, is_ascii_alphanumeric);
+}
+
+static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator = true)
 {
     constexpr auto is_separator = is_any_of("-_"sv);
 
@@ -153,6 +178,101 @@ static Optional<LanguageID> parse_unicode_language_id(GenericLexer& lexer)
     return language_id;
 }
 
+static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& lexer)
+{
+    // https://unicode.org/reports/tr35/#unicode_locale_extensions
+    //
+    // unicode_locale_extensions = sep [uU] ((sep keyword)+ | (sep attribute)+ (sep keyword)*)
+    //
+    // The "u" singleton is consumed by the caller; this function consumes the attributes and
+    // keywords that follow it. An empty Optional is returned when a segment cannot be read or
+    // when neither an attribute nor a keyword was found.
+    enum class ParseState {
+        ParsingAttributeOrKeyword,
+        ParsingAttribute,
+        ParsingKeyword,
+        Done,
+    };
+
+    LocaleExtension result {};
+    auto state = ParseState::ParsingAttributeOrKeyword;
+
+    while (!lexer.is_eof() && (state != ParseState::Done)) {
+        auto segment = consume_next_segment(lexer);
+        if (!segment.has_value())
+            return {};
+
+        // The first segment decides which alternative of the grammar is being parsed.
+        if (state == ParseState::ParsingAttributeOrKeyword) {
+            if (is_key(*segment))
+                state = ParseState::ParsingKeyword;
+            else
+                state = ParseState::ParsingAttribute;
+        }
+
+        if (state == ParseState::ParsingAttribute) {
+            if (is_attribute(*segment)) {
+                result.attributes.append(*segment);
+                continue;
+            }
+
+            // The first non-attribute segment ends the attribute list; keywords may follow, and
+            // this same segment is examined as a potential key below.
+            state = ParseState::ParsingKeyword;
+        }
+
+        // The state is ParsingKeyword whenever this point is reached.
+        if (!is_key(*segment)) {
+            // The segment does not belong to this extension. Step back over it (the extra 1 is
+            // presumably the separator consumed with it) and stop parsing.
+            lexer.retreat(segment->length() + 1);
+            state = ParseState::Done;
+            continue;
+        }
+
+        // keyword = key (sep type)?
+        Keyword keyword { .key = *segment };
+
+        // Collect every consecutive type segment; the first segment that is not a type is handed
+        // back to the lexer so that the outer loop can examine it.
+        for (auto type = consume_next_segment(lexer); type.has_value(); type = consume_next_segment(lexer)) {
+            if (!is_single_type(*type)) {
+                lexer.retreat(type->length() + 1);
+                break;
+            }
+
+            keyword.types.append(*type);
+        }
+
+        result.keywords.append(move(keyword));
+    }
+
+    bool const parsed_anything = !result.attributes.is_empty() || !result.keywords.is_empty();
+    if (!parsed_anything)
+        return {};
+    return result;
+}
+
+static Optional<Extension> parse_extension(GenericLexer& lexer)
+{
+    // https://unicode.org/reports/tr35/#extensions
+    //
+    // extensions = unicode_locale_extensions | transformed_extensions | other_extensions
+    //
+    // On failure the lexer is rewound to where it was on entry, so the caller can try to
+    // interpret the remaining input in some other way.
+    auto const start = lexer.tell();
+
+    auto header = consume_next_segment(lexer);
+    bool const is_singleton = header.has_value() && (header->length() == 1);
+
+    if (is_singleton) {
+        char const singleton = (*header)[0];
+
+        if ((singleton == 'u') || (singleton == 'U')) {
+            auto extension = parse_unicode_locale_extension(lexer);
+            if (extension.has_value())
+                return Extension { extension.release_value() };
+        }
+
+        // FIXME: Handle transformed_extensions / other_extensions
+    }
+
+    lexer.retreat(lexer.tell() - start);
+    return {};
+}
+
 Optional<LanguageID> parse_unicode_language_id(StringView language)
 {
     GenericLexer lexer { language };
@@ -167,7 +287,6 @@ Optional<LanguageID> parse_unicode_language_id(StringView language)
 Optional<LocaleID> parse_unicode_locale_id(StringView locale)
 {
     GenericLexer lexer { locale };
-    LocaleID locale_id {};
 
     // https://unicode.org/reports/tr35/#Unicode_locale_identifier
     //
@@ -178,12 +297,21 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
     if (!language_id.has_value())
         return {};
 
-    // FIXME: Handle extensions and pu_extensions.
+    LocaleID locale_id { language_id.release_value() };
+
+    while (true) {
+        auto extension = parse_extension(lexer);
+        if (!extension.has_value())
+            break;
+        locale_id.extensions.append(extension.release_value());
+    }
+
+    // FIXME: Handle pu_extensions.
 
     if (!lexer.is_eof())
         return {};
 
-    return LocaleID { language_id.release_value() };
+    return locale_id;
 }
 
 Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)

+ 14 - 0
Userland/Libraries/LibUnicode/Locale.h

@@ -9,6 +9,7 @@
 #include <AK/Optional.h>
 #include <AK/String.h>
 #include <AK/StringView.h>
+#include <AK/Variant.h>
 #include <AK/Vector.h>
 #include <LibUnicode/Forward.h>
 
@@ -22,8 +23,21 @@ struct LanguageID {
     Vector<StringView> variants {};
 };
 
+// One keyword of a Unicode locale extension: a key and the type subtags that follow it.
+// The list of types may be empty.
+struct Keyword {
+    StringView key {};
+    Vector<StringView> types {};
+};
+
+// The parsed contents of a Unicode locale extension (the part after the "u" singleton):
+// the attributes, followed by the keywords.
+struct LocaleExtension {
+    Vector<StringView> attributes {};
+    Vector<Keyword> keywords {};
+};
+
+// Any extension that may appear in a locale ID. Only the Unicode locale extension form is represented so far.
+using Extension = Variant<LocaleExtension>;
+
+// A parsed Unicode locale identifier: the language ID plus any extensions that follow it.
 struct LocaleID {
     LanguageID language_id {};
+    // Extensions in the order in which they were parsed.
+    Vector<Extension> extensions {};
 };
 
 // Note: These methods only verify that the provided strings match the EBNF grammar of the