LibJS+LibUnicode: Support multiple identifiers within format pattern

Compact patterns only ever contain a single identifier, but unit
patterns can contain multiple (up to 2, really) identifiers that must
each be recognized by LibJS.

Each generated NumberFormat object now stores an array of the
identifiers parsed from its patterns. Within the format pattern, each
identifier is replaced by a placeholder carrying that identifier's index
into this array, e.g. the compact format string "0K" will become
"{number}{compactIdentifier:0}".
This commit is contained in:
Timothy Flynn 2021-11-16 13:53:45 -05:00 committed by Linus Groh
parent 3b68370212
commit 04b8b87c17
Notes: sideshowbarker 2024-07-18 01:04:14 +09:00
3 changed files with 72 additions and 40 deletions

View file

@ -57,7 +57,7 @@ struct NumberFormat : public Unicode::NumberFormat {
StringIndexType zero_format_index { 0 };
StringIndexType positive_format_index { 0 };
StringIndexType negative_format_index { 0 };
StringIndexType identifier_index { 0 };
Vector<StringIndexType> identifier_indices {};
};
struct NumberSystem {
@ -88,46 +88,61 @@ struct UnicodeLocaleData {
UniqueStringStorage<StringIndexType> unique_strings;
HashMap<String, Locale> locales;
Vector<String> numeric_symbols;
size_t max_identifier_count { 0 };
};
static String parse_identifier(String pattern, StringView replacement, UnicodeLocaleData& locale_data, NumberFormat& format)
static String parse_identifiers(String pattern, StringView replacement, UnicodeLocaleData& locale_data, NumberFormat& format)
{
static Utf8View whitespace { "\u0020\u00a0"sv };
Utf8View utf8_pattern { pattern };
Optional<size_t> start_index;
Optional<size_t> end_index;
bool inside_replacement = false;
while (true) {
Utf8View utf8_pattern { pattern };
Optional<size_t> start_index;
Optional<size_t> end_index;
bool inside_replacement = false;
for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) {
if (*it == '{') {
if (start_index.has_value()) {
end_index = utf8_pattern.byte_offset_of(it);
break;
for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) {
if (*it == '{') {
if (start_index.has_value()) {
end_index = utf8_pattern.byte_offset_of(it);
break;
}
inside_replacement = true;
} else if (*it == '}') {
inside_replacement = false;
} else if (!inside_replacement && !start_index.has_value() && !whitespace.contains(*it)) {
start_index = utf8_pattern.byte_offset_of(it);
}
inside_replacement = true;
} else if (*it == '}') {
inside_replacement = false;
} else if (!inside_replacement && !start_index.has_value() && !whitespace.contains(*it)) {
start_index = utf8_pattern.byte_offset_of(it);
}
if (!start_index.has_value())
return pattern;
end_index = end_index.value_or(pattern.length());
utf8_pattern = utf8_pattern.substring_view(*start_index, *end_index - *start_index);
utf8_pattern = utf8_pattern.trim(whitespace);
auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv);
auto identifier_index = locale_data.unique_strings.ensure(move(identifier));
size_t replacement_index = 0;
if (auto index = format.identifier_indices.find_first_index(identifier_index); index.has_value()) {
replacement_index = *index;
} else {
replacement_index = format.identifier_indices.size();
format.identifier_indices.append(identifier_index);
locale_data.max_identifier_count = max(locale_data.max_identifier_count, format.identifier_indices.size());
}
pattern = String::formatted("{}{{{}:{}}}{}",
*start_index > 0 ? pattern.substring_view(0, *start_index) : ""sv,
replacement,
replacement_index,
pattern.substring_view(*start_index + utf8_pattern.byte_length()));
}
if (!start_index.has_value())
return pattern;
end_index = end_index.value_or(pattern.length());
utf8_pattern = utf8_pattern.substring_view(*start_index, *end_index - *start_index);
utf8_pattern = utf8_pattern.trim(whitespace);
auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv);
format.identifier_index = locale_data.unique_strings.ensure(move(identifier));
return String::formatted("{}{}{}",
*start_index > 0 ? pattern.substring_view(0, *start_index) : ""sv,
replacement,
pattern.substring_view(*start_index + utf8_pattern.byte_length()));
}
static void parse_number_pattern(Vector<String> patterns, UnicodeLocaleData& locale_data, NumberFormatType type, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr)
@ -188,7 +203,7 @@ static void parse_number_pattern(Vector<String> patterns, UnicodeLocaleData& loc
}
if (type == NumberFormatType::Compact)
return parse_identifier(move(pattern), "{compactIdentifier}"sv, locale_data, format);
return parse_identifiers(move(pattern), "compactIdentifier"sv, locale_data, format);
return pattern;
};
@ -413,6 +428,7 @@ static void generate_unicode_locale_implementation(Core::File& file, UnicodeLoca
SourceGenerator generator { builder };
generator.set("string_index_type"sv, s_string_index_type);
generator.set("numeric_symbols_size", String::number(locale_data.numeric_symbols.size()));
generator.set("identifier_count", String::number(locale_data.max_identifier_count));
generator.append(R"~~~(
#include <AK/Array.h>
@ -437,7 +453,10 @@ struct NumberFormat {
number_format.zero_format = s_string_list[zero_format];
number_format.positive_format = s_string_list[positive_format];
number_format.negative_format = s_string_list[negative_format];
number_format.identifier = s_string_list[identifier];
number_format.identifiers.ensure_capacity(identifiers.size());
for (@string_index_type@ identifier : identifiers)
number_format.identifiers.append(s_string_list[identifier]);
return number_format;
}
@ -448,7 +467,7 @@ struct NumberFormat {
@string_index_type@ zero_format { 0 };
@string_index_type@ positive_format { 0 };
@string_index_type@ negative_format { 0 };
@string_index_type@ identifier { 0 };
Array<@string_index_type@, @identifier_count@> identifiers {};
};
struct NumberSystem {
@ -479,8 +498,18 @@ struct NumberSystem {
generator.set("zero_format"sv, String::number(number_format.zero_format_index));
generator.set("positive_format"sv, String::number(number_format.positive_format_index));
generator.set("negative_format"sv, String::number(number_format.negative_format_index));
generator.set("identifier"sv, String::number(number_format.identifier_index));
generator.append("{ @magnitude@, @exponent@, @plurality@, @zero_format@, @positive_format@, @negative_format@, @identifier@ },");
generator.append("{ @magnitude@, @exponent@, @plurality@, @zero_format@, @positive_format@, @negative_format@, { ");
bool first = true;
for (auto identifier_index : number_format.identifier_indices) {
if (!first)
generator.append(", ");
generator.append(String::number(identifier_index));
first = false;
}
generator.append(" } },");
};
auto append_number_formats = [&](String name, auto const& number_formats) {

View file

@ -962,11 +962,14 @@ Vector<PatternPartition> partition_notation_sub_pattern(NumberFormat& number_for
}
// iv. Else if p is equal to "compactSymbol", then
// v. Else if p is equal to "compactName", then
else if (part == "compactIdentifier"sv) {
else if (part.starts_with("compactIdentifier:"sv)) {
// Note: Our implementation combines "compactSymbol" and "compactName" into one field, "compactIdentifier".
auto identifier_index = part.substring_view("compactIdentifier:"sv.length()).to_uint();
VERIFY(identifier_index.has_value());
// 1. Let compactSymbol be an ILD string representing exponent in short form, which may depend on x in languages having different plural forms. The implementation must be able to provide this string, or else the pattern would not have a "{compactSymbol}" placeholder.
auto compact_identifier = number_format.compact_format().identifier;
auto compact_identifier = number_format.compact_format().identifiers[*identifier_index];
// 2. Append a new Record { [[Type]]: "compact", [[Value]]: compactSymbol } as the last element of result.
result.append({ "compact"sv, compact_identifier });

View file

@ -122,7 +122,7 @@ struct NumberFormat {
StringView zero_format {};
StringView positive_format {};
StringView negative_format {};
StringView identifier {};
Vector<StringView> identifiers {};
};
struct ListPatterns {