LibUnicode: Generate standalone compile-time arrays for simple casing

Currently, all casing information (simple and special) is stored in a
compile-time array of size 34,626, then statically copied to a hash map
at runtime. In an effort to reduce the resulting memory usage, store the
simple casing rules in standalone compile-time arrays. The uppercase map
has 1,450 entries and the lowercase map has 1,433 entries. Any code point
not in a map implicitly has an identity mapping.
This commit is contained in:
Timothy Flynn 2021-10-09 18:12:57 -04:00 committed by Andreas Kling
parent 7defb893a9
commit da4b8897a7
Notes: sideshowbarker 2024-07-18 02:51:26 +09:00
2 changed files with 78 additions and 10 deletions
Meta/Lagom/Tools/CodeGenerators/LibUnicode
Userland/Libraries/LibUnicode

View file

@ -88,6 +88,9 @@ struct CodePointData {
};
struct UnicodeData {
u32 simple_uppercase_mapping_size { 0 };
u32 simple_lowercase_mapping_size { 0 };
Vector<SpecialCasing> special_casing;
u32 largest_casing_transform_size { 0 };
u32 largest_special_casing_size { 0 };
@ -430,6 +433,9 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
data.special_casing_indices.append(casing.index);
}
unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value();
unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value();
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
previous_code_point = data.code_point;
@ -551,6 +557,9 @@ namespace Detail {
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
u32 simple_uppercase_mapping(u32 code_point);
u32 simple_lowercase_mapping(u32 code_point);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
Optional<GeneralCategory> general_category_from_string(StringView const& general_category);
@ -665,6 +674,56 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
generator.append(R"~~~(
} };
// One entry of a generated simple-casing table: a code point paired with the
// code point it maps to.
struct CodePointMapping {
    u32 code_point { 0 };
    u32 mapping { 0 };
};

// Three-way comparator used to look up a code point in an array of
// CodePointMapping entries.
struct CodePointComparator {
    // Returns a negative value, zero, or a positive value when code_point is
    // respectively less than, equal to, or greater than the entry's code point.
    constexpr int operator()(u32 code_point, CodePointMapping const& mapping) const
    {
        // Compare explicitly instead of subtracting: both operands are unsigned,
        // so the difference wraps around and, once narrowed to int, carries the
        // wrong sign whenever the two values are more than INT_MAX apart.
        if (code_point < mapping.code_point)
            return -1;
        return code_point > mapping.code_point ? 1 : 0;
    }
};
)~~~");
// Emits one generated table named s_<name>_mappings: a static constexpr Array of
// CodePointMapping with <size> elements, holding a `{ code_point, mapping },` entry
// for every element of unicode_data.code_point_data for which mapping_getter returns
// a value. Elements without a value are not emitted.
//   name           - inserted into the generated array's identifier.
//   size           - emitted as the array length; the callers below pass the counts
//                    accumulated while parsing, which must match the number of
//                    entries written here.
//   mapping_getter - callable taking a code point data element and returning an
//                    optional-like value (queried with has_value() and operator*).
auto append_code_point_mappings = [&](StringView name, u32 size, auto mapping_getter) {
generator.set("name", name);
generator.set("size", String::number(size));
generator.append(R"~~~(
static constexpr Array<CodePointMapping, @size@> s_@name@_mappings { {
)~~~");
// Cosmetic layout of the generated file only: start a new line after this many entries.
constexpr size_t max_mappings_per_row = 20;
size_t mappings_in_current_row = 0;
for (auto const& data : unicode_data.code_point_data) {
auto mapping = mapping_getter(data);
// No value means no entry is emitted for this code point.
if (!mapping.has_value())
continue;
// Separate entries that share a row; the first entry of a row gets no separator.
if (mappings_in_current_row++ > 0)
generator.append(" ");
// Both values are written using the "{:#x}" format specifier.
generator.set("code_point", String::formatted("{:#x}", data.code_point));
generator.set("mapping", String::formatted("{:#x}", *mapping));
generator.append("{ @code_point@, @mapping@ },");
// Row is full: reset the counter and break the line.
if (mappings_in_current_row == max_mappings_per_row) {
mappings_in_current_row = 0;
generator.append("\n ");
}
}
// Close the array initializer opened above.
generator.append(R"~~~(
} };
)~~~");
};
// Generate the uppercase and lowercase tables from the simple casing fields.
append_code_point_mappings("uppercase"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; });
append_code_point_mappings("lowercase"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
generator.append(R"~~~(
struct CodePointRange {
u32 first { 0 };
u32 last { 0 };
@ -787,6 +846,21 @@ Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
}
)~~~");
// Emits the definition of `u32 <method>(u32 code_point)` into the generated source.
// The emitted function runs binary_search over the array named by <mappings> using
// CodePointComparator, and returns the found entry's mapping, or code_point itself
// when the search yields no entry.
//   method   - name of the generated lookup function.
//   mappings - identifier of the generated CodePointMapping array to search.
// NOTE(review): binary_search needs the array ordered by code point; presumably
// unicode_data.code_point_data is already in ascending order - confirm.
auto append_code_point_mapping_search = [&](StringView method, StringView mappings) {
// Values for the placeholders used in the template below.
generator.set("method", method);
generator.set("mappings", mappings);
generator.append(R"~~~(
u32 @method@(u32 code_point)
{
auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {});
return mapping ? mapping->mapping : code_point;
}
)~~~");
};
// Generate one lookup function per casing table.
append_code_point_mapping_search("simple_uppercase_mapping"sv, "s_uppercase_mappings"sv);
append_code_point_mapping_search("simple_lowercase_mapping"sv, "s_lowercase_mappings"sv);
auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {
generator.set("enum_title", enum_title);
generator.set("enum_snake", enum_snake);

View file

@ -213,10 +213,7 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, O
u32 to_unicode_lowercase(u32 code_point)
{
#if ENABLE_UNICODE_DATA
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (unicode_data.has_value())
return unicode_data->simple_lowercase_mapping;
return code_point;
return Detail::simple_lowercase_mapping(code_point);
#else
return AK::to_ascii_lowercase(code_point);
#endif
@ -225,10 +222,7 @@ u32 to_unicode_lowercase(u32 code_point)
u32 to_unicode_uppercase(u32 code_point)
{
#if ENABLE_UNICODE_DATA
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (unicode_data.has_value())
return unicode_data->simple_uppercase_mapping;
return code_point;
return Detail::simple_uppercase_mapping(code_point);
#else
return AK::to_ascii_uppercase(code_point);
#endif
@ -255,7 +249,7 @@ String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Opti
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
if (!special_casing) {
builder.append_code_point(unicode_data->simple_lowercase_mapping);
builder.append_code_point(to_unicode_lowercase(code_point));
continue;
}
@ -290,7 +284,7 @@ String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Opti
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
if (!special_casing) {
builder.append_code_point(unicode_data->simple_uppercase_mapping);
builder.append_code_point(to_unicode_uppercase(code_point));
continue;
}