LibUnicode: Parse locale extensions of the transformed extension form
This commit is contained in:
parent
eda92d15e4
commit
d2d304fcf8
Notes:
sideshowbarker
2024-07-18 05:04:40 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/d2d304fcf88 Pull-request: https://github.com/SerenityOS/serenity/pull/9668 Reviewed-by: https://github.com/linusg
3 changed files with 176 additions and 2 deletions
|
@ -146,6 +146,72 @@ TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension)
|
|||
pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, { "yyyy"sv } } } });
|
||||
}
|
||||
|
||||
TEST_CASE(parse_unicode_locale_id_with_transformed_extension)
{
    // Asserts that parsing the given locale string yields no locale ID at all.
    auto fail = [](StringView locale) {
        auto locale_id = Unicode::parse_unicode_locale_id(locale);
        EXPECT(!locale_id.has_value());
    };

    // Asserts that the given locale string parses to a locale ID carrying exactly one extension,
    // and that this extension is a TransformedExtension equal (member by member) to expected_extension.
    auto pass = [](StringView locale, Unicode::TransformedExtension const& expected_extension) {
        auto locale_id = Unicode::parse_unicode_locale_id(locale);
        VERIFY(locale_id.has_value());

        // NOTE(review): EXPECT_EQ is presumably non-fatal (this test uses VERIFY where it must stop),
        // so bail out explicitly rather than index an extension that may not exist.
        EXPECT_EQ(locale_id->extensions.size(), 1u);
        if (locale_id->extensions.size() != 1u)
            return;

        auto const& actual_extension = locale_id->extensions[0].get<Unicode::TransformedExtension>();

        VERIFY(actual_extension.language.has_value() == expected_extension.language.has_value());
        if (actual_extension.language.has_value()) {
            EXPECT_EQ(actual_extension.language->language, expected_extension.language->language);
            EXPECT_EQ(actual_extension.language->script, expected_extension.language->script);
            EXPECT_EQ(actual_extension.language->region, expected_extension.language->region);
            EXPECT_EQ(actual_extension.language->variants, expected_extension.language->variants);
        }

        // Same reasoning as above: do not walk both field lists in lockstep unless they have the same length.
        EXPECT_EQ(actual_extension.fields.size(), expected_extension.fields.size());
        if (actual_extension.fields.size() != expected_extension.fields.size())
            return;

        for (size_t i = 0; i < actual_extension.fields.size(); ++i) {
            auto const& actual_field = actual_extension.fields[i];
            auto const& expected_field = expected_extension.fields[i];

            EXPECT_EQ(actual_field.key, expected_field.key);
            EXPECT_EQ(actual_field.values, expected_field.values);
        }
    };

    fail("en-t"sv);
    fail("en-t-"sv);
    fail("en-t-a"sv);
    fail("en-t-en-"sv);
    fail("en-t-root"sv);
    fail("en-t-aaaaaaaaa"sv);
    fail("en-t-en-aaa"sv);
    fail("en-t-en-latn-latn"sv);
    fail("en-t-en-a"sv);
    fail("en-t-en-00"sv);
    fail("en-t-en-latn-0"sv);
    fail("en-t-en-latn-00"sv);
    fail("en-t-en-latn-xyz"sv);
    fail("en-t-en-aaaaaaaaa"sv);
    fail("en-t-en-latn-gb-aaaa"sv);
    fail("en-t-en-latn-gb-aaaaaaaaa"sv);
    fail("en-t-k0"sv);
    fail("en-t-k0-aa"sv);
    fail("en-t-k0-aaaaaaaaa"sv);

    pass("en-t-en"sv, { Unicode::LanguageID { false, "en"sv }, {} });
    pass("en-t-en-latn"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv }, {} });
    pass("en-t-en-us"sv, { Unicode::LanguageID { false, "en"sv, {}, "us"sv }, {} });
    pass("en-t-en-latn-us"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, "us"sv }, {} });
    pass("en-t-en-posix"sv, { Unicode::LanguageID { false, "en"sv, {}, {}, { "posix"sv } }, {} });
    pass("en-t-en-latn-posix"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, {}, { "posix"sv } }, {} });
    pass("en-t-en-us-posix"sv, { Unicode::LanguageID { false, "en"sv, {}, "us"sv, { "posix"sv } }, {} });
    pass("en-t-en-latn-us-posix"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, "us"sv, { "posix"sv } }, {} });
    pass("en-t-k0-aaa"sv, { {}, { { "k0"sv, { "aaa"sv } } } });
    pass("en-t-k0-aaa-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv, "bbbb"sv } } } });
    pass("en-t-k0-aaa-k1-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv } }, { "k1"sv, { "bbbb"sv } } } });
    pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, { "aaa"sv } } } });
}
|
||||
|
||||
TEST_CASE(canonicalize_unicode_locale_id)
|
||||
{
|
||||
auto test = [](StringView locale, StringView expected_canonical_locale) {
|
||||
|
|
|
@ -78,6 +78,23 @@ static bool is_attribute(StringView type)
|
|||
return all_of(type, is_ascii_alphanumeric);
|
||||
}
|
||||
|
||||
static bool is_transformed_key(StringView key)
{
    // tkey = alpha digit
    //
    // A key is exactly two characters: one ASCII letter followed by one ASCII digit.
    // The length test comes first so the indexing below never runs on a shorter view.
    constexpr size_t tkey_length = 2;
    return (key.length() == tkey_length) && is_ascii_alpha(key[0]) && is_ascii_digit(key[1]);
}
|
||||
|
||||
static bool is_single_transformed_value(StringView value)
{
    // tvalue = (sep alphanum{3,8})+
    //
    // Only a single "alphanum{3,8}" chunk is validated here; walking over a run of
    // consecutive chunks is the caller's responsibility.
    auto const length = value.length();
    bool const has_valid_length = (length >= 3) && (length <= 8);

    return has_valid_length && all_of(value, is_ascii_alphanumeric);
}
|
||||
|
||||
static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator = true)
|
||||
{
|
||||
constexpr auto is_separator = is_any_of("-_"sv);
|
||||
|
@ -248,6 +265,81 @@ static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& le
|
|||
return locale_extension;
|
||||
}
|
||||
|
||||
static Optional<TransformedExtension> parse_transformed_extension(GenericLexer& lexer)
{
    // https://unicode.org/reports/tr35/#transformed_extensions
    //
    // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) | (sep tfield)+)
    //
    // Returns an empty Optional when neither a tlang nor any tfield could be parsed, or when a
    // subtag that had to be present is missing or malformed.
    enum class ParseState {
        ParsingLanguageOrField,
        ParsingLanguage,
        ParsingField,
        Done,
    };

    TransformedExtension result {};
    auto parse_state = ParseState::ParsingLanguageOrField;

    while ((parse_state != ParseState::Done) && !lexer.is_eof()) {
        auto subtag = consume_next_segment(lexer);
        if (!subtag.has_value())
            return {};

        // The very first subtag decides whether the extension opens with a tlang or goes straight to tfields.
        if (parse_state == ParseState::ParsingLanguageOrField) {
            if (is_unicode_language_subtag(*subtag))
                parse_state = ParseState::ParsingLanguage;
            else
                parse_state = ParseState::ParsingField;
        }

        if (parse_state == ParseState::ParsingLanguage) {
            // Give the subtag's characters back to the lexer so the language ID parser can read it itself.
            lexer.retreat(subtag->length());

            auto language_id = parse_unicode_language_id(lexer);
            if (!language_id.has_value())
                return {};

            result.language = language_id.release_value();
            parse_state = ParseState::ParsingField;
            continue;
        }

        if (parse_state != ParseState::ParsingField)
            VERIFY_NOT_REACHED();

        // tfield = tkey tvalue;
        if (!is_transformed_key(*subtag)) {
            // Not a tkey, so this extension ends here. Rewind past the subtag plus one more
            // character (presumably the separator in front of it) and stop.
            lexer.retreat(subtag->length() + 1);
            parse_state = ParseState::Done;
            continue;
        }

        TransformedField field { .key = *subtag };

        // Collect every consecutive subtag that qualifies as a single tvalue chunk.
        for (;;) {
            auto value = consume_next_segment(lexer);
            if (!value.has_value())
                break;

            if (!is_single_transformed_value(*value)) {
                // Un-consume the non-value subtag so the outer loop can inspect it as a potential tkey.
                lexer.retreat(value->length() + 1);
                break;
            }

            field.values.append(*value);
        }

        // A tkey without at least one tvalue is invalid.
        if (field.values.is_empty())
            return {};

        result.fields.append(move(field));
    }

    if (result.language.has_value() || !result.fields.is_empty())
        return result;

    return {};
}
|
||||
|
||||
static Optional<Extension> parse_extension(GenericLexer& lexer)
|
||||
{
|
||||
// https://unicode.org/reports/tr35/#extensions
|
||||
|
@ -263,8 +355,14 @@ static Optional<Extension> parse_extension(GenericLexer& lexer)
|
|||
return Extension { extension.release_value() };
|
||||
break;
|
||||
|
||||
case 't':
|
||||
case 'T':
|
||||
if (auto extension = parse_transformed_extension(lexer); extension.has_value())
|
||||
return Extension { extension.release_value() };
|
||||
break;
|
||||
|
||||
default:
|
||||
// FIXME: Handle transformed_extensions / other_extensions
|
||||
// FIXME: Handle other_extensions
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,7 +33,17 @@ struct LocaleExtension {
|
|||
Vector<Keyword> keywords {};
|
||||
};
|
||||
|
||||
using Extension = Variant<LocaleExtension>;
|
||||
// One "tfield" of a transformed extension: a key subtag together with the value subtags that follow it.
struct TransformedField {
|
||||
// The tkey subtag (e.g. "k0"); a view into the parsed input, not an owned string.
StringView key;
|
||||
// The value subtags listed after the key, in the order they appear (e.g. "aaa", "bbbb").
Vector<StringView> values {};
|
||||
};
|
||||
|
||||
// Parsed form of a transformed ("-t-") locale extension: an optional language ID followed by zero or more fields.
struct TransformedExtension {
|
||||
// The "tlang" part of the extension; empty when the extension consists of fields only.
Optional<LanguageID> language {};
|
||||
// The "tfield" entries, in the order they appear in the extension.
Vector<TransformedField> fields {};
|
||||
};
|
||||
|
||||
using Extension = Variant<LocaleExtension, TransformedExtension>;
|
||||
|
||||
struct LocaleID {
|
||||
LanguageID language_id {};
|
||||
|
|
Loading…
Add table
Reference in a new issue