Meta+LibUnicode: Avoid relocations for static unicode data

Previously the s_decomposition_mappings variable would refer to other
data in s_decomposition_mappings_data. This would cause thousands of
avoidable relocations at load time.

This saves about 128kB RAM for each process which uses LibUnicode.
This commit is contained in:
Gunnar Beutner 2022-11-06 07:08:56 +01:00 committed by Gunnar Beutner
parent fb71df5cb1
commit 2d3567ee92
Notes: sideshowbarker 2024-07-17 04:44:10 +09:00
3 changed files with 26 additions and 12 deletions

View file

@ -768,6 +768,13 @@ struct SpecialCasing {
Condition condition { Condition::None };
};
// Table entry describing the decomposition of one code point. The decomposed code points are
// referenced by an index and a count into s_decomposition_mappings_data instead of by a Span,
// so the entry itself holds no pointer into that array.
struct CodePointDecompositionRaw {
u32 code_point { 0 };
CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
// Offset of this entry's first decomposed code point within s_decomposition_mappings_data.
size_t decomposition_index { 0 };
// Number of consecutive code points, starting at decomposition_index, that belong to this entry.
size_t decomposition_count { 0 };
};
struct CodePointDecomposition {
u32 code_point { 0 };
CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
@ -947,7 +954,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
generator.set("tag", mapping->tag);
generator.set("start", String::number(mapping->decomposition_index));
generator.set("size", String::number(mapping->decomposition_size));
generator.append(", CompatibilityFormattingTag::@tag@, Span<u32 const> { s_decomposition_mappings_data.data() + @start@, @size@ } },");
generator.append(", CompatibilityFormattingTag::@tag@, @start@, @size@ },");
} else {
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
generator.append(" },");
@ -974,7 +981,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
append_code_point_mappings("decomposition"sv, "CodePointDecomposition"sv, unicode_data.code_points_with_decomposition_mapping,
append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping,
[](auto const& data) {
return data.decomposition_mapping;
});
@ -1153,17 +1160,20 @@ Optional<StringView> code_point_abbreviation(u32 code_point)
return decode_string(mapping->abbreviation);
}
// Looks up the decomposition entry for code_point by binary-searching s_decomposition_mappings.
// Yields an empty Optional when no entry matches.
Optional<CodePointDecomposition const&> code_point_decomposition(u32 code_point)
Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point)
{
auto const* mapping = binary_search(s_decomposition_mappings, code_point, nullptr, CodePointComparator<CodePointDecomposition> {});
auto const* mapping = binary_search(s_decomposition_mappings, code_point, nullptr, CodePointComparator<CodePointDecompositionRaw> {});
if (mapping == nullptr)
return {};
return *mapping;
// The table stores only an index and a count, so the Span over s_decomposition_mappings_data
// is assembled here, at call time, and returned by value.
return CodePointDecomposition { mapping->code_point, mapping->tag, Span<u32 const> { s_decomposition_mappings_data.data() + mapping->decomposition_index, mapping->decomposition_count } };
}
Span<CodePointDecomposition const> code_point_decompositions()
// Returns the decomposition entry stored at position index of s_decomposition_mappings, or an
// empty Optional once index reaches the table size.
Optional<CodePointDecomposition const> code_point_decomposition_by_index(size_t index)
{
return s_decomposition_mappings;
if (index >= s_decomposition_mappings.size())
return {};
auto const& mapping = s_decomposition_mappings[index];
// Build the Span over s_decomposition_mappings_data from the entry's stored index and count.
return CodePointDecomposition { mapping.code_point, mapping.tag, Span<u32 const> { s_decomposition_mappings_data.data() + mapping.decomposition_index, mapping.decomposition_count } };
}
)~~~");

View file

@ -14,8 +14,8 @@
namespace Unicode {
// Weak definitions that report "no data" by returning an empty value.
// NOTE(review): presumably these are overridden by the generated definitions when the Unicode
// data is built in; confirm against the build configuration.
Optional<CodePointDecomposition const&> __attribute__((weak)) code_point_decomposition(u32) { return {}; }
Span<CodePointDecomposition const> __attribute__((weak)) code_point_decompositions() { return {}; }
Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32) { return {}; }
Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition_by_index(size_t) { return {}; }
NormalizationForm normalization_form_from_string(StringView form)
{
@ -122,7 +122,11 @@ static u32 combine_code_points(u32 a, u32 b)
{
Array<u32, 2> const points { a, b };
// FIXME: Do something better than linear search to find reverse mappings.
for (auto const& mapping : Unicode::code_point_decompositions()) {
for (size_t index = 0;; ++index) {
auto mapping_maybe = Unicode::code_point_decomposition_by_index(index);
if (!mapping_maybe.has_value())
break;
auto& mapping = mapping_maybe.value();
if (mapping.tag == CompatibilityFormattingTag::Canonical && mapping.decomposition == points) {
if (code_point_has_property(mapping.code_point, Property::Full_Composition_Exclusion))
continue;

View file

@ -15,8 +15,8 @@
namespace Unicode {
Optional<CodePointDecomposition const&> code_point_decomposition(u32 code_point);
Span<CodePointDecomposition const> code_point_decompositions();
Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point);
Optional<CodePointDecomposition const> code_point_decomposition_by_index(size_t index);
enum class NormalizationForm {
NFD,