LibTimeZone+LibUnicode: Generate string data with run-length encoding

Currently, the unique string lists are stored in the initialized data
sections of their shared libraries. In order to move the data to the
read-only section, generate the strings using RLE arrays.

We generate two arrays: the first is the RLE data itself, the second is
a list of indices into the RLE array for each string. We then generate a
decoding method to convert an RLE string to a StringView.
This commit is contained in:
Timothy Flynn 2022-08-15 13:01:42 -04:00 committed by Andreas Kling
parent de980de0e4
commit becec3578f
Notes: sideshowbarker 2024-07-17 08:11:40 +09:00
6 changed files with 138 additions and 59 deletions

View file

@ -724,8 +724,8 @@ Optional<Array<NamedOffset, 2>> get_named_time_zone_offsets(TimeZone time_zone,
auto format_name = [](auto format, auto offset) -> String { auto format_name = [](auto format, auto offset) -> String {
if (offset == 0) if (offset == 0)
return s_string_list[format].replace("{}"sv, ""sv, ReplaceMode::FirstOnly); return decode_string(format).replace("{}"sv, ""sv, ReplaceMode::FirstOnly);
return String::formatted(s_string_list[format], s_string_list[offset]); return String::formatted(decode_string(format), decode_string(offset));
}; };
auto set_named_offset = [&](auto& named_offset, auto dst_offset, auto in_dst, auto format, auto offset) { auto set_named_offset = [&](auto& named_offset, auto dst_offset, auto in_dst, auto format, auto offset) {
@ -776,7 +776,7 @@ Vector<StringView> time_zones_in_region(StringView region)
time_zones.ensure_capacity(regional_time_zones.size()); time_zones.ensure_capacity(regional_time_zones.size());
for (auto time_zone : regional_time_zones) for (auto time_zone : regional_time_zones)
time_zones.unchecked_append(s_string_list[time_zone]); time_zones.unchecked_append(decode_string(time_zone));
return time_zones; return time_zones;
} }

View file

@ -1811,10 +1811,10 @@ struct CalendarPatternImpl {
CalendarPattern to_unicode_calendar_pattern() const { CalendarPattern to_unicode_calendar_pattern() const {
CalendarPattern calendar_pattern {}; CalendarPattern calendar_pattern {};
calendar_pattern.skeleton = s_string_list[skeleton]; calendar_pattern.skeleton = decode_string(skeleton);
calendar_pattern.pattern = s_string_list[pattern]; calendar_pattern.pattern = decode_string(pattern);
if (pattern12 != 0) if (pattern12 != 0)
calendar_pattern.pattern12 = s_string_list[pattern12]; calendar_pattern.pattern12 = decode_string(pattern12);
convert_calendar_fields(*this, calendar_pattern); convert_calendar_fields(*this, calendar_pattern);
return calendar_pattern; return calendar_pattern;
@ -1843,9 +1843,9 @@ struct CalendarRangePatternImpl {
if (field != -1) if (field != -1)
calendar_range_pattern.field = static_cast<CalendarRangePattern::Field>(field); calendar_range_pattern.field = static_cast<CalendarRangePattern::Field>(field);
calendar_range_pattern.start_range = s_string_list[start_range]; calendar_range_pattern.start_range = decode_string(start_range);
calendar_range_pattern.separator = s_string_list[separator]; calendar_range_pattern.separator = decode_string(separator);
calendar_range_pattern.end_range = s_string_list[end_range]; calendar_range_pattern.end_range = decode_string(end_range);
convert_calendar_fields(*this, calendar_range_pattern); convert_calendar_fields(*this, calendar_range_pattern);
return calendar_range_pattern; return calendar_range_pattern;
@ -1929,12 +1929,12 @@ struct TimeZoneFormatImpl {
TimeZoneFormat to_time_zone_format() const { TimeZoneFormat to_time_zone_format() const {
TimeZoneFormat time_zone_format {}; TimeZoneFormat time_zone_format {};
time_zone_format.symbol_ahead_sign = s_string_list[symbol_ahead_sign]; time_zone_format.symbol_ahead_sign = decode_string(symbol_ahead_sign);
time_zone_format.symbol_ahead_separator = s_string_list[symbol_ahead_separator]; time_zone_format.symbol_ahead_separator = decode_string(symbol_ahead_separator);
time_zone_format.symbol_behind_sign = s_string_list[symbol_behind_sign]; time_zone_format.symbol_behind_sign = decode_string(symbol_behind_sign);
time_zone_format.symbol_behind_separator = s_string_list[symbol_behind_separator]; time_zone_format.symbol_behind_separator = decode_string(symbol_behind_separator);
time_zone_format.gmt_format = s_string_list[gmt_format]; time_zone_format.gmt_format = decode_string(gmt_format);
time_zone_format.gmt_zero_format = s_string_list[gmt_zero_format]; time_zone_format.gmt_zero_format = decode_string(gmt_zero_format);
return time_zone_format; return time_zone_format;
} }
@ -2200,7 +2200,7 @@ Vector<CalendarRangePattern> get_calendar_range_formats(StringView locale, Strin
for (auto format : range_formats) { for (auto format : range_formats) {
auto const& pattern = s_calendar_range_patterns[format]; auto const& pattern = s_calendar_range_patterns[format];
if (skeleton == s_string_list[pattern.skeleton]) if (skeleton == decode_string(pattern.skeleton))
result.append(pattern.to_unicode_calendar_range_pattern()); result.append(pattern.to_unicode_calendar_range_pattern());
} }
} }
@ -2218,7 +2218,7 @@ Vector<CalendarRangePattern> get_calendar_range12_formats(StringView locale, Str
for (auto format : range12_formats) { for (auto format : range12_formats) {
auto const& pattern = s_calendar_range_patterns[format]; auto const& pattern = s_calendar_range_patterns[format];
if (skeleton == s_string_list[pattern.skeleton]) if (skeleton == decode_string(pattern.skeleton))
result.append(pattern.to_unicode_calendar_range_pattern()); result.append(pattern.to_unicode_calendar_range_pattern());
} }
} }
@ -2263,7 +2263,7 @@ Optional<StringView> get_calendar_era_symbol(StringView locale, StringView calen
if (auto value_index = to_underlying(value); value_index < symbols.size()) { if (auto value_index = to_underlying(value); value_index < symbols.size()) {
if (auto symbol_index = symbols.at(value_index); symbol_index != 0) if (auto symbol_index = symbols.at(value_index); symbol_index != 0)
return s_string_list[symbol_index]; return decode_string(symbol_index);
} }
return {}; return {};
@ -2275,7 +2275,7 @@ Optional<StringView> get_calendar_month_symbol(StringView locale, StringView cal
if (auto value_index = to_underlying(value); value_index < symbols.size()) { if (auto value_index = to_underlying(value); value_index < symbols.size()) {
if (auto symbol_index = symbols.at(value_index); symbol_index != 0) if (auto symbol_index = symbols.at(value_index); symbol_index != 0)
return s_string_list[symbol_index]; return decode_string(symbol_index);
} }
return {}; return {};
@ -2287,7 +2287,7 @@ Optional<StringView> get_calendar_weekday_symbol(StringView locale, StringView c
if (auto value_index = to_underlying(value); value_index < symbols.size()) { if (auto value_index = to_underlying(value); value_index < symbols.size()) {
if (auto symbol_index = symbols.at(value_index); symbol_index != 0) if (auto symbol_index = symbols.at(value_index); symbol_index != 0)
return s_string_list[symbol_index]; return decode_string(symbol_index);
} }
return {}; return {};
@ -2299,7 +2299,7 @@ Optional<StringView> get_calendar_day_period_symbol(StringView locale, StringVie
if (auto value_index = to_underlying(value); value_index < symbols.size()) { if (auto value_index = to_underlying(value); value_index < symbols.size()) {
if (auto symbol_index = symbols.at(value_index); symbol_index != 0) if (auto symbol_index = symbols.at(value_index); symbol_index != 0)
return s_string_list[symbol_index]; return decode_string(symbol_index);
} }
return {}; return {};
@ -2400,7 +2400,7 @@ Optional<StringView> get_time_zone_name(StringView locale, StringView time_zone,
} }
if (name_index != 0) if (name_index != 0)
return s_string_list[name_index]; return decode_string(name_index);
} }
return {}; return {};

View file

@ -1117,8 +1117,8 @@ struct DisplayPatternImpl {
DisplayPattern to_display_pattern() const DisplayPattern to_display_pattern() const
{ {
DisplayPattern display_patterns {}; DisplayPattern display_patterns {};
display_patterns.locale_pattern = s_string_list[locale_pattern]; display_patterns.locale_pattern = decode_string(locale_pattern);
display_patterns.locale_separator = s_string_list[locale_separator]; display_patterns.locale_separator = decode_string(locale_separator);
return display_patterns; return display_patterns;
} }
@ -1266,13 +1266,13 @@ struct CanonicalLanguageID {
LanguageID language_id {}; LanguageID language_id {};
language_id.variants.ensure_capacity(variants_size); language_id.variants.ensure_capacity(variants_size);
language_id.language = s_string_list[language]; language_id.language = decode_string(language);
if (script != 0) if (script != 0)
language_id.script = s_string_list[script]; language_id.script = decode_string(script);
if (region != 0) if (region != 0)
language_id.region = s_string_list[region]; language_id.region = decode_string(region);
for (size_t i = 0; i < variants_size; ++i) for (size_t i = 0; i < variants_size; ++i)
language_id.variants.append(s_string_list[variants[i]]); language_id.variants.append(decode_string(variants[i]));
return language_id; return language_id;
} }
@ -1284,7 +1284,7 @@ struct CanonicalLanguageID {
return false; return false;
for (size_t i = 0; i < variants_size; ++i) { for (size_t i = 0; i < variants_size; ++i) {
if (s_string_list[variants[i]] != other_variants[i]) if (decode_string(variants[i]) != other_variants[i])
return false; return false;
} }
@ -1415,9 +1415,9 @@ static LanguageMapping const* resolve_likely_subtag(LanguageID const& language_i
} }
for (auto const& map : s_likely_subtags) { for (auto const& map : s_likely_subtags) {
auto const& key_language = s_string_list[map.key.language]; auto const& key_language = decode_string(map.key.language);
auto const& key_script = s_string_list[map.key.script]; auto const& key_script = decode_string(map.key.script);
auto const& key_region = s_string_list[map.key.region]; auto const& key_region = decode_string(map.key.region);
if (key_language != search_key.language) if (key_language != search_key.language)
continue; continue;
@ -1463,7 +1463,7 @@ Optional<StringView> get_locale_@enum_snake@_mapping(StringView locale, StringVi
auto const& mappings = @unique_list@.at(mapping_index); auto const& mappings = @unique_list@.at(mapping_index);
auto @enum_snake@_string_index = mappings.at(@enum_snake@_index); auto @enum_snake@_string_index = mappings.at(@enum_snake@_index);
auto @enum_snake@_mapping = s_string_list.at(@enum_snake@_string_index); auto @enum_snake@_mapping = decode_string(@enum_snake@_string_index);
if (@enum_snake@_mapping.is_empty()) if (@enum_snake@_mapping.is_empty())
return {}; return {};
@ -1493,7 +1493,7 @@ Optional<StringView> get_locale_@enum_snake@_mapping(StringView locale, StringVi
ValueFromStringOptions options {}; ValueFromStringOptions options {};
options.return_type = "StringView"sv; options.return_type = "StringView"sv;
options.return_format = "s_string_list[{}]"sv; options.return_format = "decode_string({})"sv;
generate_value_from_string(generator, "resolve_{}_alias"sv, s_string_index_type, enum_snake, move(hashes), options); generate_value_from_string(generator, "resolve_{}_alias"sv, s_string_index_type, enum_snake, move(hashes), options);
}; };
@ -1606,7 +1606,7 @@ Optional<StringView> get_preferred_keyword_value_for_locale(StringView locale, S
if (keyword_indices.is_empty()) if (keyword_indices.is_empty())
return {}; return {};
return s_string_list[keyword_indices[0]]; return decode_string(keyword_indices[0]);
} }
Vector<StringView> get_keywords_for_locale(StringView locale, StringView key) Vector<StringView> get_keywords_for_locale(StringView locale, StringView key)
@ -1636,7 +1636,7 @@ Vector<StringView> get_keywords_for_locale(StringView locale, StringView key)
keywords.ensure_capacity(keyword_indices.size()); keywords.ensure_capacity(keyword_indices.size());
for (auto keyword : keyword_indices) for (auto keyword : keyword_indices)
keywords.unchecked_append(s_string_list[keyword]); keywords.unchecked_append(decode_string(keyword));
return keywords; return keywords;
} }
@ -1673,10 +1673,10 @@ Optional<ListPatterns> get_locale_list_patterns(StringView locale, StringView li
auto const& list_patterns = s_list_patterns.at(list_patterns_index); auto const& list_patterns = s_list_patterns.at(list_patterns_index);
if ((list_patterns.type == type_value) && (list_patterns.style == list_pattern_style)) { if ((list_patterns.type == type_value) && (list_patterns.style == list_pattern_style)) {
auto const& start = s_string_list[list_patterns.start]; auto const& start = decode_string(list_patterns.start);
auto const& middle = s_string_list[list_patterns.middle]; auto const& middle = decode_string(list_patterns.middle);
auto const& end = s_string_list[list_patterns.end]; auto const& end = decode_string(list_patterns.end);
auto const& pair = s_string_list[list_patterns.pair]; auto const& pair = decode_string(list_patterns.pair);
return ListPatterns { start, middle, end, pair }; return ListPatterns { start, middle, end, pair };
} }
@ -1707,9 +1707,9 @@ Optional<CharacterOrder> character_order_for_locale(StringView locale)
void resolve_complex_language_aliases(LanguageID& language_id) void resolve_complex_language_aliases(LanguageID& language_id)
{ {
for (auto const& map : s_complex_alias) { for (auto const& map : s_complex_alias) {
auto const& key_language = s_string_list[map.key.language]; auto const& key_language = decode_string(map.key.language);
auto const& key_script = s_string_list[map.key.script]; auto const& key_script = decode_string(map.key.script);
auto const& key_region = s_string_list[map.key.region]; auto const& key_region = decode_string(map.key.region);
if ((key_language != language_id.language) && (key_language != "und"sv)) if ((key_language != language_id.language) && (key_language != "und"sv))
continue; continue;
@ -1745,12 +1745,12 @@ Optional<LanguageID> add_likely_subtags(LanguageID const& language_id)
auto maximized = language_id; auto maximized = language_id;
auto const& key_script = s_string_list[likely_subtag->key.script]; auto const& key_script = decode_string(likely_subtag->key.script);
auto const& key_region = s_string_list[likely_subtag->key.region]; auto const& key_region = decode_string(likely_subtag->key.region);
auto const& alias_language = s_string_list[likely_subtag->alias.language]; auto const& alias_language = decode_string(likely_subtag->alias.language);
auto const& alias_script = s_string_list[likely_subtag->alias.script]; auto const& alias_script = decode_string(likely_subtag->alias.script);
auto const& alias_region = s_string_list[likely_subtag->alias.region]; auto const& alias_region = decode_string(likely_subtag->alias.region);
if (maximized.language == "und"sv) if (maximized.language == "und"sv)
maximized.language = alias_language; maximized.language = alias_language;
@ -1765,7 +1765,7 @@ Optional<LanguageID> add_likely_subtags(LanguageID const& language_id)
Optional<String> resolve_most_likely_territory(LanguageID const& language_id) Optional<String> resolve_most_likely_territory(LanguageID const& language_id)
{ {
if (auto const* likely_subtag = resolve_likely_subtag(language_id); likely_subtag != nullptr) if (auto const* likely_subtag = resolve_likely_subtag(language_id); likely_subtag != nullptr)
return s_string_list[likely_subtag->alias.region]; return decode_string(likely_subtag->alias.region);
return {}; return {};
} }

View file

@ -825,13 +825,13 @@ struct NumberFormatImpl {
number_format.magnitude = magnitude; number_format.magnitude = magnitude;
number_format.exponent = exponent; number_format.exponent = exponent;
number_format.plurality = static_cast<PluralCategory>(plurality); number_format.plurality = static_cast<PluralCategory>(plurality);
number_format.zero_format = s_string_list[zero_format]; number_format.zero_format = decode_string(zero_format);
number_format.positive_format = s_string_list[positive_format]; number_format.positive_format = decode_string(positive_format);
number_format.negative_format = s_string_list[negative_format]; number_format.negative_format = decode_string(negative_format);
number_format.identifiers.ensure_capacity(identifiers.size()); number_format.identifiers.ensure_capacity(identifiers.size());
for (@string_index_type@ identifier : identifiers) for (@string_index_type@ identifier : identifiers)
number_format.identifiers.append(s_string_list[identifier]); number_format.identifiers.append(decode_string(identifier));
return number_format; return number_format;
} }
@ -996,7 +996,7 @@ Optional<StringView> get_number_system_symbol(StringView locale, StringView syst
if (symbol_index >= symbols.size()) if (symbol_index >= symbols.size())
return {}; return {};
return s_string_list[symbols[symbol_index]]; return decode_string(symbols[symbol_index]);
} }
return {}; return {};
@ -1088,7 +1088,7 @@ static Unit const* find_units(StringView locale, StringView unit)
for (auto unit_index : locale_units) { for (auto unit_index : locale_units) {
auto const& units = s_units.at(unit_index); auto const& units = s_units.at(unit_index);
if (unit == s_string_list[units.unit]) if (unit == decode_string(units.unit))
return &units; return &units;
}; };

View file

@ -218,7 +218,7 @@ struct RelativeTimeFormatImpl {
{ {
RelativeTimeFormat relative_time_format {}; RelativeTimeFormat relative_time_format {};
relative_time_format.plurality = plurality; relative_time_format.plurality = plurality;
relative_time_format.pattern = s_string_list[pattern]; relative_time_format.pattern = decode_string(pattern);
return relative_time_format; return relative_time_format;
} }
@ -271,7 +271,7 @@ Vector<RelativeTimeFormat> get_relative_time_format_patterns(StringView locale,
continue; continue;
if (locale_format.style != style) if (locale_format.style != style)
continue; continue;
if (s_string_list[locale_format.tense_or_number] != tense_or_number) if (decode_string(locale_format.tense_or_number) != tense_or_number)
continue; continue;
formats.append(locale_format.to_relative_time_format()); formats.append(locale_format.to_relative_time_format());

View file

@ -11,6 +11,7 @@
#include <AK/HashMap.h> #include <AK/HashMap.h>
#include <AK/JsonValue.h> #include <AK/JsonValue.h>
#include <AK/LexicalPath.h> #include <AK/LexicalPath.h>
#include <AK/NumericLimits.h>
#include <AK/Optional.h> #include <AK/Optional.h>
#include <AK/QuickSort.h> #include <AK/QuickSort.h>
#include <AK/SourceGenerator.h> #include <AK/SourceGenerator.h>
@ -174,7 +175,7 @@ static constexpr Array<Span<@type@ const>, @size@ + 1> @name@ { {
// clang-format off // clang-format off
// clang-format gets confused by the requires() clauses above, and formats this section very weirdly. // clang-format gets confused by the requires() clauses above, and formats this section very weirdly.
private: protected:
Vector<StorageType> m_storage; Vector<StorageType> m_storage;
HashMap<StorageType, IndexType> m_storage_indices; HashMap<StorageType, IndexType> m_storage_indices;
// clang-format on // clang-format on
@ -185,9 +186,87 @@ class UniqueStringStorage : public UniqueStorage<String, StringIndexType> {
using Base = UniqueStorage<String, StringIndexType>; using Base = UniqueStorage<String, StringIndexType>;
public: public:
// The goal of the string table generator is to ensure the table is located within the read-only
// section of the shared library. If StringViews are generated directly, the table will be located
// in the initialized data section. So instead, we generate run-length encoded (RLE) arrays to
// represent the strings.
//
// Three things are emitted into the generated source:
//   1. s_encoded_strings: a u8 array holding, for every stored string, a two-byte length (high
//      byte first) followed by the string's bytes.
//   2. s_encoded_string_indices: a u32 array holding, for every stored string, the offset of its
//      length bytes within s_encoded_strings.
//   3. decode_string(): a lookup that converts a string index back into a StringView over
//      s_encoded_strings.
//
// NOTE(review): despite the "RLE" name, each entry is a length prefix followed by the raw bytes;
// no runs of repeated bytes are collapsed anywhere in this method.
void generate(SourceGenerator& generator) void generate(SourceGenerator& generator)
{ {
Base::generate(generator, "StringView"sv, "s_string_list"sv, 40); constexpr size_t max_values_per_row = 300;
// Number of values written on the current output row; shared by both arrays emitted below.
size_t values_in_current_row = 0;
// Writes one array element in "{:#x}" hex form. Elements on the same row are separated by ", ";
// once max_values_per_row elements have been written, the row is ended with ",\n " and the
// counter restarts, so the first element of the next row gets no leading separator.
auto append_hex_value = [&](auto value) {
if (values_in_current_row++ > 0)
generator.append(", ");
generator.append(String::formatted("{:#x}", value));
if (values_in_current_row == max_values_per_row) {
values_in_current_row = 0;
generator.append(",\n ");
}
};
// First pass: compute the offset of every string within the encoded array, which also yields
// the total size of that array.
Vector<u32> string_indices;
string_indices.ensure_capacity(Base::m_storage.size());
u32 next_index { 0 };
for (auto const& string : Base::m_storage) {
// Ensure the string length may be encoded as two u8s.
VERIFY(string.length() <= NumericLimits<u16>::max());
string_indices.unchecked_append(next_index);
// The extra 2 accounts for the two length bytes written ahead of the string's bytes.
// NOTE(review): next_index is a u32 and the running total is not checked for overflow.
next_index += string.length() + 2;
}
// "size" is first the total number of encoded bytes; it is set again below for the index array.
generator.set("size", String::number(next_index));
generator.append(R"~~~(
static constexpr Array<u8, @size@> s_encoded_strings { {
)~~~");
// Second pass: write each string as <length high byte> <length low byte> <bytes...>.
for (auto const& string : Base::m_storage) {
auto length = string.length();
append_hex_value((length & 0xff00) >> 8);
append_hex_value(length & 0x00ff);
for (auto ch : string)
append_hex_value(static_cast<u8>(ch));
}
generator.append(R"~~~(
} };
)~~~");
// "size" now becomes the number of strings, i.e. the element count of the index array.
generator.set("size", String::number(string_indices.size()));
generator.append(R"~~~(
static constexpr Array<u32, @size@> s_encoded_string_indices { {
)~~~");
// Restart row tracking so the first index is not preceded by a separator.
values_in_current_row = 0;
for (auto index : string_indices)
append_hex_value(index);
// Close the index array and emit decode_string(). As written below, the emitted function
// returns an empty StringView for index 0 and otherwise reads s_encoded_string_indices[index - 1],
// then rebuilds the length from the two prefix bytes and returns a view of the bytes after them.
generator.append(R"~~~(
} };
static constexpr StringView decode_string(size_t index)
{
if (index == 0)
return {};
index = s_encoded_string_indices[index - 1];
auto length_high = s_encoded_strings[index];
auto length_low = s_encoded_strings[index + 1];
size_t length = (length_high << 8) | length_low;
if (length == 0)
return {};
auto const* start = &s_encoded_strings[index + 2];
return { reinterpret_cast<char const*>(start), length };
}
)~~~");
} }
}; };