LibTimeZone+LibUnicode: Generate string data with run-length encoding

Currently, the unique string lists are stored in the initialized data
sections of their shared libraries. In order to move the data to the
read-only section, generate the strings using RLE arrays.

We generate two arrays: the first is the RLE data itself, the second is
a list of indices into the RLE array for each string. We then generate a
decoding method to convert an RLE string to a StringView.
This commit is contained in:
Timothy Flynn 2022-08-15 13:01:42 -04:00 committed by Andreas Kling
parent de980de0e4
commit becec3578f
Notes: sideshowbarker 2024-07-17 08:11:40 +09:00
6 changed files with 138 additions and 59 deletions

View file

@ -724,8 +724,8 @@ Optional<Array<NamedOffset, 2>> get_named_time_zone_offsets(TimeZone time_zone,
auto format_name = [](auto format, auto offset) -> String {
if (offset == 0)
return s_string_list[format].replace("{}"sv, ""sv, ReplaceMode::FirstOnly);
return String::formatted(s_string_list[format], s_string_list[offset]);
return decode_string(format).replace("{}"sv, ""sv, ReplaceMode::FirstOnly);
return String::formatted(decode_string(format), decode_string(offset));
};
auto set_named_offset = [&](auto& named_offset, auto dst_offset, auto in_dst, auto format, auto offset) {
@ -776,7 +776,7 @@ Vector<StringView> time_zones_in_region(StringView region)
time_zones.ensure_capacity(regional_time_zones.size());
for (auto time_zone : regional_time_zones)
time_zones.unchecked_append(s_string_list[time_zone]);
time_zones.unchecked_append(decode_string(time_zone));
return time_zones;
}

View file

@ -1811,10 +1811,10 @@ struct CalendarPatternImpl {
CalendarPattern to_unicode_calendar_pattern() const {
CalendarPattern calendar_pattern {};
calendar_pattern.skeleton = s_string_list[skeleton];
calendar_pattern.pattern = s_string_list[pattern];
calendar_pattern.skeleton = decode_string(skeleton);
calendar_pattern.pattern = decode_string(pattern);
if (pattern12 != 0)
calendar_pattern.pattern12 = s_string_list[pattern12];
calendar_pattern.pattern12 = decode_string(pattern12);
convert_calendar_fields(*this, calendar_pattern);
return calendar_pattern;
@ -1843,9 +1843,9 @@ struct CalendarRangePatternImpl {
if (field != -1)
calendar_range_pattern.field = static_cast<CalendarRangePattern::Field>(field);
calendar_range_pattern.start_range = s_string_list[start_range];
calendar_range_pattern.separator = s_string_list[separator];
calendar_range_pattern.end_range = s_string_list[end_range];
calendar_range_pattern.start_range = decode_string(start_range);
calendar_range_pattern.separator = decode_string(separator);
calendar_range_pattern.end_range = decode_string(end_range);
convert_calendar_fields(*this, calendar_range_pattern);
return calendar_range_pattern;
@ -1929,12 +1929,12 @@ struct TimeZoneFormatImpl {
TimeZoneFormat to_time_zone_format() const {
TimeZoneFormat time_zone_format {};
time_zone_format.symbol_ahead_sign = s_string_list[symbol_ahead_sign];
time_zone_format.symbol_ahead_separator = s_string_list[symbol_ahead_separator];
time_zone_format.symbol_behind_sign = s_string_list[symbol_behind_sign];
time_zone_format.symbol_behind_separator = s_string_list[symbol_behind_separator];
time_zone_format.gmt_format = s_string_list[gmt_format];
time_zone_format.gmt_zero_format = s_string_list[gmt_zero_format];
time_zone_format.symbol_ahead_sign = decode_string(symbol_ahead_sign);
time_zone_format.symbol_ahead_separator = decode_string(symbol_ahead_separator);
time_zone_format.symbol_behind_sign = decode_string(symbol_behind_sign);
time_zone_format.symbol_behind_separator = decode_string(symbol_behind_separator);
time_zone_format.gmt_format = decode_string(gmt_format);
time_zone_format.gmt_zero_format = decode_string(gmt_zero_format);
return time_zone_format;
}
@ -2200,7 +2200,7 @@ Vector<CalendarRangePattern> get_calendar_range_formats(StringView locale, Strin
for (auto format : range_formats) {
auto const& pattern = s_calendar_range_patterns[format];
if (skeleton == s_string_list[pattern.skeleton])
if (skeleton == decode_string(pattern.skeleton))
result.append(pattern.to_unicode_calendar_range_pattern());
}
}
@ -2218,7 +2218,7 @@ Vector<CalendarRangePattern> get_calendar_range12_formats(StringView locale, Str
for (auto format : range12_formats) {
auto const& pattern = s_calendar_range_patterns[format];
if (skeleton == s_string_list[pattern.skeleton])
if (skeleton == decode_string(pattern.skeleton))
result.append(pattern.to_unicode_calendar_range_pattern());
}
}
@ -2263,7 +2263,7 @@ Optional<StringView> get_calendar_era_symbol(StringView locale, StringView calen
if (auto value_index = to_underlying(value); value_index < symbols.size()) {
if (auto symbol_index = symbols.at(value_index); symbol_index != 0)
return s_string_list[symbol_index];
return decode_string(symbol_index);
}
return {};
@ -2275,7 +2275,7 @@ Optional<StringView> get_calendar_month_symbol(StringView locale, StringView cal
if (auto value_index = to_underlying(value); value_index < symbols.size()) {
if (auto symbol_index = symbols.at(value_index); symbol_index != 0)
return s_string_list[symbol_index];
return decode_string(symbol_index);
}
return {};
@ -2287,7 +2287,7 @@ Optional<StringView> get_calendar_weekday_symbol(StringView locale, StringView c
if (auto value_index = to_underlying(value); value_index < symbols.size()) {
if (auto symbol_index = symbols.at(value_index); symbol_index != 0)
return s_string_list[symbol_index];
return decode_string(symbol_index);
}
return {};
@ -2299,7 +2299,7 @@ Optional<StringView> get_calendar_day_period_symbol(StringView locale, StringVie
if (auto value_index = to_underlying(value); value_index < symbols.size()) {
if (auto symbol_index = symbols.at(value_index); symbol_index != 0)
return s_string_list[symbol_index];
return decode_string(symbol_index);
}
return {};
@ -2400,7 +2400,7 @@ Optional<StringView> get_time_zone_name(StringView locale, StringView time_zone,
}
if (name_index != 0)
return s_string_list[name_index];
return decode_string(name_index);
}
return {};

View file

@ -1117,8 +1117,8 @@ struct DisplayPatternImpl {
DisplayPattern to_display_pattern() const
{
DisplayPattern display_patterns {};
display_patterns.locale_pattern = s_string_list[locale_pattern];
display_patterns.locale_separator = s_string_list[locale_separator];
display_patterns.locale_pattern = decode_string(locale_pattern);
display_patterns.locale_separator = decode_string(locale_separator);
return display_patterns;
}
@ -1266,13 +1266,13 @@ struct CanonicalLanguageID {
LanguageID language_id {};
language_id.variants.ensure_capacity(variants_size);
language_id.language = s_string_list[language];
language_id.language = decode_string(language);
if (script != 0)
language_id.script = s_string_list[script];
language_id.script = decode_string(script);
if (region != 0)
language_id.region = s_string_list[region];
language_id.region = decode_string(region);
for (size_t i = 0; i < variants_size; ++i)
language_id.variants.append(s_string_list[variants[i]]);
language_id.variants.append(decode_string(variants[i]));
return language_id;
}
@ -1284,7 +1284,7 @@ struct CanonicalLanguageID {
return false;
for (size_t i = 0; i < variants_size; ++i) {
if (s_string_list[variants[i]] != other_variants[i])
if (decode_string(variants[i]) != other_variants[i])
return false;
}
@ -1415,9 +1415,9 @@ static LanguageMapping const* resolve_likely_subtag(LanguageID const& language_i
}
for (auto const& map : s_likely_subtags) {
auto const& key_language = s_string_list[map.key.language];
auto const& key_script = s_string_list[map.key.script];
auto const& key_region = s_string_list[map.key.region];
auto const& key_language = decode_string(map.key.language);
auto const& key_script = decode_string(map.key.script);
auto const& key_region = decode_string(map.key.region);
if (key_language != search_key.language)
continue;
@ -1463,7 +1463,7 @@ Optional<StringView> get_locale_@enum_snake@_mapping(StringView locale, StringVi
auto const& mappings = @unique_list@.at(mapping_index);
auto @enum_snake@_string_index = mappings.at(@enum_snake@_index);
auto @enum_snake@_mapping = s_string_list.at(@enum_snake@_string_index);
auto @enum_snake@_mapping = decode_string(@enum_snake@_string_index);
if (@enum_snake@_mapping.is_empty())
return {};
@ -1493,7 +1493,7 @@ Optional<StringView> get_locale_@enum_snake@_mapping(StringView locale, StringVi
ValueFromStringOptions options {};
options.return_type = "StringView"sv;
options.return_format = "s_string_list[{}]"sv;
options.return_format = "decode_string({})"sv;
generate_value_from_string(generator, "resolve_{}_alias"sv, s_string_index_type, enum_snake, move(hashes), options);
};
@ -1606,7 +1606,7 @@ Optional<StringView> get_preferred_keyword_value_for_locale(StringView locale, S
if (keyword_indices.is_empty())
return {};
return s_string_list[keyword_indices[0]];
return decode_string(keyword_indices[0]);
}
Vector<StringView> get_keywords_for_locale(StringView locale, StringView key)
@ -1636,7 +1636,7 @@ Vector<StringView> get_keywords_for_locale(StringView locale, StringView key)
keywords.ensure_capacity(keyword_indices.size());
for (auto keyword : keyword_indices)
keywords.unchecked_append(s_string_list[keyword]);
keywords.unchecked_append(decode_string(keyword));
return keywords;
}
@ -1673,10 +1673,10 @@ Optional<ListPatterns> get_locale_list_patterns(StringView locale, StringView li
auto const& list_patterns = s_list_patterns.at(list_patterns_index);
if ((list_patterns.type == type_value) && (list_patterns.style == list_pattern_style)) {
auto const& start = s_string_list[list_patterns.start];
auto const& middle = s_string_list[list_patterns.middle];
auto const& end = s_string_list[list_patterns.end];
auto const& pair = s_string_list[list_patterns.pair];
auto const& start = decode_string(list_patterns.start);
auto const& middle = decode_string(list_patterns.middle);
auto const& end = decode_string(list_patterns.end);
auto const& pair = decode_string(list_patterns.pair);
return ListPatterns { start, middle, end, pair };
}
@ -1707,9 +1707,9 @@ Optional<CharacterOrder> character_order_for_locale(StringView locale)
void resolve_complex_language_aliases(LanguageID& language_id)
{
for (auto const& map : s_complex_alias) {
auto const& key_language = s_string_list[map.key.language];
auto const& key_script = s_string_list[map.key.script];
auto const& key_region = s_string_list[map.key.region];
auto const& key_language = decode_string(map.key.language);
auto const& key_script = decode_string(map.key.script);
auto const& key_region = decode_string(map.key.region);
if ((key_language != language_id.language) && (key_language != "und"sv))
continue;
@ -1745,12 +1745,12 @@ Optional<LanguageID> add_likely_subtags(LanguageID const& language_id)
auto maximized = language_id;
auto const& key_script = s_string_list[likely_subtag->key.script];
auto const& key_region = s_string_list[likely_subtag->key.region];
auto const& key_script = decode_string(likely_subtag->key.script);
auto const& key_region = decode_string(likely_subtag->key.region);
auto const& alias_language = s_string_list[likely_subtag->alias.language];
auto const& alias_script = s_string_list[likely_subtag->alias.script];
auto const& alias_region = s_string_list[likely_subtag->alias.region];
auto const& alias_language = decode_string(likely_subtag->alias.language);
auto const& alias_script = decode_string(likely_subtag->alias.script);
auto const& alias_region = decode_string(likely_subtag->alias.region);
if (maximized.language == "und"sv)
maximized.language = alias_language;
@ -1765,7 +1765,7 @@ Optional<LanguageID> add_likely_subtags(LanguageID const& language_id)
Optional<String> resolve_most_likely_territory(LanguageID const& language_id)
{
if (auto const* likely_subtag = resolve_likely_subtag(language_id); likely_subtag != nullptr)
return s_string_list[likely_subtag->alias.region];
return decode_string(likely_subtag->alias.region);
return {};
}

View file

@ -825,13 +825,13 @@ struct NumberFormatImpl {
number_format.magnitude = magnitude;
number_format.exponent = exponent;
number_format.plurality = static_cast<PluralCategory>(plurality);
number_format.zero_format = s_string_list[zero_format];
number_format.positive_format = s_string_list[positive_format];
number_format.negative_format = s_string_list[negative_format];
number_format.zero_format = decode_string(zero_format);
number_format.positive_format = decode_string(positive_format);
number_format.negative_format = decode_string(negative_format);
number_format.identifiers.ensure_capacity(identifiers.size());
for (@string_index_type@ identifier : identifiers)
number_format.identifiers.append(s_string_list[identifier]);
number_format.identifiers.append(decode_string(identifier));
return number_format;
}
@ -996,7 +996,7 @@ Optional<StringView> get_number_system_symbol(StringView locale, StringView syst
if (symbol_index >= symbols.size())
return {};
return s_string_list[symbols[symbol_index]];
return decode_string(symbols[symbol_index]);
}
return {};
@ -1088,7 +1088,7 @@ static Unit const* find_units(StringView locale, StringView unit)
for (auto unit_index : locale_units) {
auto const& units = s_units.at(unit_index);
if (unit == s_string_list[units.unit])
if (unit == decode_string(units.unit))
return &units;
};

View file

@ -218,7 +218,7 @@ struct RelativeTimeFormatImpl {
{
RelativeTimeFormat relative_time_format {};
relative_time_format.plurality = plurality;
relative_time_format.pattern = s_string_list[pattern];
relative_time_format.pattern = decode_string(pattern);
return relative_time_format;
}
@ -271,7 +271,7 @@ Vector<RelativeTimeFormat> get_relative_time_format_patterns(StringView locale,
continue;
if (locale_format.style != style)
continue;
if (s_string_list[locale_format.tense_or_number] != tense_or_number)
if (decode_string(locale_format.tense_or_number) != tense_or_number)
continue;
formats.append(locale_format.to_relative_time_format());

View file

@ -11,6 +11,7 @@
#include <AK/HashMap.h>
#include <AK/JsonValue.h>
#include <AK/LexicalPath.h>
#include <AK/NumericLimits.h>
#include <AK/Optional.h>
#include <AK/QuickSort.h>
#include <AK/SourceGenerator.h>
@ -174,7 +175,7 @@ static constexpr Array<Span<@type@ const>, @size@ + 1> @name@ { {
// clang-format off
// clang-format gets confused by the requires() clauses above, and formats this section very weirdly.
private:
protected:
Vector<StorageType> m_storage;
HashMap<StorageType, IndexType> m_storage_indices;
// clang-format on
@ -185,9 +186,87 @@ class UniqueStringStorage : public UniqueStorage<String, StringIndexType> {
using Base = UniqueStorage<String, StringIndexType>;
public:
// The goal of the string table generator is to ensure the table is located within the read-only
// section of the shared library. If StringViews are generated directly, the table will be located
// in the initialized data section. So instead, we generate run-length encoded (RLE) arrays to
// represent the strings.
// Emits the string table into the generated source as two constexpr arrays plus a decoder:
//   - s_encoded_strings: for every stored string, two length bytes (high byte first) followed
//     by the string's bytes.
//   - s_encoded_string_indices: for every stored string, the offset of its entry within
//     s_encoded_strings.
//   - decode_string(index): generated function turning a string index back into a StringView.
void generate(SourceGenerator& generator)
{
// NOTE(review): this call is given the "StringView" type and the "s_string_list" name, i.e. it
// presumably still emits the StringView table that the encoded arrays below are meant to
// replace. It looks like the pre-change line of this diff - confirm whether it should remain.
Base::generate(generator, "StringView"sv, "s_string_list"sv, 40);
constexpr size_t max_values_per_row = 300;
size_t values_in_current_row = 0;
// Appends one value formatted with "{:#x}". Values within a row are separated by ", "; once
// max_values_per_row values have been written, the row is closed with ",\n " and the counter
// restarts so the next value begins a new row without a leading separator.
auto append_hex_value = [&](auto value) {
if (values_in_current_row++ > 0)
generator.append(", ");
generator.append(String::formatted("{:#x}", value));
if (values_in_current_row == max_values_per_row) {
values_in_current_row = 0;
generator.append(",\n ");
}
};
// First pass: compute where each string's entry starts inside the encoded byte array.
Vector<u32> string_indices;
string_indices.ensure_capacity(Base::m_storage.size());
u32 next_index { 0 };
for (auto const& string : Base::m_storage) {
// Ensure the string length may be encoded as two u8s.
VERIFY(string.length() <= NumericLimits<u16>::max());
string_indices.unchecked_append(next_index);
// Each entry occupies the string's bytes plus the two length bytes that precede them.
next_index += string.length() + 2;
}
// After the loop, next_index is the total number of bytes in the encoded array.
generator.set("size", String::number(next_index));
generator.append(R"~~~(
static constexpr Array<u8, @size@> s_encoded_strings { {
)~~~");
// Second pass: write each entry as <length high byte>, <length low byte>, <string bytes...>.
for (auto const& string : Base::m_storage) {
auto length = string.length();
append_hex_value((length & 0xff00) >> 8);
append_hex_value(length & 0x00ff);
for (auto ch : string)
append_hex_value(static_cast<u8>(ch));
}
generator.append(R"~~~(
} };
)~~~");
// The "size" key is reused for the index table: one u32 offset per stored string.
generator.set("size", String::number(string_indices.size()));
generator.append(R"~~~(
static constexpr Array<u32, @size@> s_encoded_string_indices { {
)~~~");
// Restart row tracking so the first index is not preceded by a separator.
values_in_current_row = 0;
for (auto index : string_indices)
append_hex_value(index);
// The generated decode_string() treats index 0 as "no string" and returns an empty StringView;
// any other index N reads entry N - 1 of the offset table, rebuilds the 16-bit length from the
// two length bytes, and returns a view over the bytes that follow them.
generator.append(R"~~~(
} };
static constexpr StringView decode_string(size_t index)
{
if (index == 0)
return {};
index = s_encoded_string_indices[index - 1];
auto length_high = s_encoded_strings[index];
auto length_low = s_encoded_strings[index + 1];
size_t length = (length_high << 8) | length_low;
if (length == 0)
return {};
auto const* start = &s_encoded_strings[index + 2];
return { reinterpret_cast<char const*>(start), length };
}
)~~~");
}
};