LibUnicode: Generate standalone compile-time array for special casing

There are only 112 code points with special casing rules, so this array
is quite small (compared to the 34,626-entry UnicodeData hash map that
also stores this data). Removing all casing rules from UnicodeData will
happen in a subsequent commit.
This commit is contained in:
Timothy Flynn 2021-10-09 18:52:37 -04:00 committed by Andreas Kling
parent da4b8897a7
commit 9f83774913
Notes: sideshowbarker 2024-07-18 02:51:23 +09:00
2 changed files with 58 additions and 28 deletions

View file

@ -92,6 +92,7 @@ struct UnicodeData {
u32 simple_lowercase_mapping_size { 0 }; u32 simple_lowercase_mapping_size { 0 };
Vector<SpecialCasing> special_casing; Vector<SpecialCasing> special_casing;
u32 code_points_with_special_casing { 0 };
u32 largest_casing_transform_size { 0 }; u32 largest_casing_transform_size { 0 };
u32 largest_special_casing_size { 0 }; u32 largest_special_casing_size { 0 };
Vector<String> conditions; Vector<String> conditions;
@ -428,14 +429,19 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
assigned_code_point_range_start = data.code_point; assigned_code_point_range_start = data.code_point;
} }
bool has_special_casing { false };
for (auto const& casing : unicode_data.special_casing) { for (auto const& casing : unicode_data.special_casing) {
if (casing.code_point == data.code_point) if (casing.code_point == data.code_point) {
data.special_casing_indices.append(casing.index); data.special_casing_indices.append(casing.index);
has_special_casing = true;
}
} }
unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value(); unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value();
unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value(); unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value();
unicode_data.code_points_with_special_casing += has_special_casing;
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
previous_code_point = data.code_point; previous_code_point = data.code_point;
@ -490,6 +496,7 @@ enum class @name@ : @underlying@ {)~~~");
#pragma once #pragma once
#include <AK/Optional.h> #include <AK/Optional.h>
#include <AK/Span.h>
#include <AK/Types.h> #include <AK/Types.h>
#include <LibUnicode/Forward.h> #include <LibUnicode/Forward.h>
#include <LibUnicode/UnicodeLocale.h> #include <LibUnicode/UnicodeLocale.h>
@ -559,6 +566,7 @@ Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
u32 simple_uppercase_mapping(u32 code_point); u32 simple_uppercase_mapping(u32 code_point);
u32 simple_lowercase_mapping(u32 code_point); u32 simple_lowercase_mapping(u32 code_point);
Span<SpecialCasing const* const> special_case_mapping(u32 code_point);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
Optional<GeneralCategory> general_category_from_string(StringView const& general_category); Optional<GeneralCategory> general_category_from_string(StringView const& general_category);
@ -583,6 +591,7 @@ static void generate_unicode_data_implementation(Core::File& file, UnicodeData c
StringBuilder builder; StringBuilder builder;
SourceGenerator generator { builder }; SourceGenerator generator { builder };
generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size));
generator.set("special_casing_size", String::number(unicode_data.special_casing.size())); generator.set("special_casing_size", String::number(unicode_data.special_casing.size()));
generator.set("code_point_data_size", String::number(unicode_data.code_point_data.size())); generator.set("code_point_data_size", String::number(unicode_data.code_point_data.size()));
@ -679,20 +688,28 @@ struct CodePointMapping {
u32 mapping { 0 }; u32 mapping { 0 };
}; };
struct SpecialCaseMapping {
u32 code_point { 0 };
Array<SpecialCasing const*, @largest_special_casing_size@> special_casing {};
u32 special_casing_size { 0 };
};
template<typename MappingType>
struct CodePointComparator { struct CodePointComparator {
constexpr int operator()(u32 code_point, CodePointMapping const& mapping) constexpr int operator()(u32 code_point, MappingType const& mapping)
{ {
return code_point - mapping.code_point; return code_point - mapping.code_point;
} }
}; };
)~~~"); )~~~");
auto append_code_point_mappings = [&](StringView name, u32 size, auto mapping_getter) { auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) {
generator.set("name", name); generator.set("name", name);
generator.set("mapping_type", mapping_type);
generator.set("size", String::number(size)); generator.set("size", String::number(size));
generator.append(R"~~~( generator.append(R"~~~(
static constexpr Array<CodePointMapping, @size@> s_@name@_mappings { { static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
)~~~"); )~~~");
constexpr size_t max_mappings_per_row = 20; constexpr size_t max_mappings_per_row = 20;
@ -700,15 +717,28 @@ static constexpr Array<CodePointMapping, @size@> s_@name@_mappings { {
for (auto const& data : unicode_data.code_point_data) { for (auto const& data : unicode_data.code_point_data) {
auto mapping = mapping_getter(data); auto mapping = mapping_getter(data);
if (!mapping.has_value())
continue; if constexpr (IsSame<decltype(mapping), Optional<u32>>) {
if (!mapping.has_value())
continue;
} else {
if (mapping.is_empty())
continue;
}
if (mappings_in_current_row++ > 0) if (mappings_in_current_row++ > 0)
generator.append(" "); generator.append(" ");
generator.set("code_point", String::formatted("{:#x}", data.code_point)); generator.set("code_point", String::formatted("{:#x}", data.code_point));
generator.set("mapping", String::formatted("{:#x}", *mapping)); generator.append("{ @code_point@");
generator.append("{ @code_point@, @mapping@ },");
if constexpr (IsSame<decltype(mapping), Optional<u32>>) {
generator.set("mapping", String::formatted("{:#x}", *mapping));
generator.append(", @mapping@ },");
} else {
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
generator.append(" },");
}
if (mappings_in_current_row == max_mappings_per_row) { if (mappings_in_current_row == max_mappings_per_row) {
mappings_in_current_row = 0; mappings_in_current_row = 0;
@ -720,8 +750,9 @@ static constexpr Array<CodePointMapping, @size@> s_@name@_mappings { {
)~~~"); )~~~");
}; };
append_code_point_mappings("uppercase"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; });
append_code_point_mappings("lowercase"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
generator.append(R"~~~( generator.append(R"~~~(
struct CodePointRange { struct CodePointRange {
@ -852,7 +883,7 @@ Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
generator.append(R"~~~( generator.append(R"~~~(
u32 @method@(u32 code_point) u32 @method@(u32 code_point)
{ {
auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {}); auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator<CodePointMapping> {});
return mapping ? mapping->mapping : code_point; return mapping ? mapping->mapping : code_point;
} }
)~~~"); )~~~");
@ -861,6 +892,17 @@ u32 @method@(u32 code_point)
append_code_point_mapping_search("simple_uppercase_mapping"sv, "s_uppercase_mappings"sv); append_code_point_mapping_search("simple_uppercase_mapping"sv, "s_uppercase_mappings"sv);
append_code_point_mapping_search("simple_lowercase_mapping"sv, "s_lowercase_mappings"sv); append_code_point_mapping_search("simple_lowercase_mapping"sv, "s_lowercase_mappings"sv);
// Emit the generated special_case_mapping() lookup. The emitted function
// binary-searches s_special_case_mappings by code point; when no entry is
// found it returns an empty Span, otherwise it returns only the first
// special_casing_size entries of that entry's special_casing array.
generator.append(R"~~~(
Span<SpecialCasing const* const> special_case_mapping(u32 code_point)
{
auto const* mapping = binary_search(s_special_case_mappings, code_point, nullptr, CodePointComparator<SpecialCaseMapping> {});
if (mapping == nullptr)
return {};
return mapping->special_casing.span().slice(0, mapping->special_casing_size);
}
)~~~");
auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) { auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {
generator.set("enum_title", enum_title); generator.set("enum_title", enum_title);
generator.set("enum_snake", enum_snake); generator.set("enum_snake", enum_snake);

View file

@ -159,7 +159,7 @@ static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t in
return false; return false;
} }
static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length, UnicodeData const& unicode_data) static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
{ {
auto requested_locale = Locale::None; auto requested_locale = Locale::None;
@ -168,9 +168,9 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, O
requested_locale = *maybe_locale; requested_locale = *maybe_locale;
} }
for (size_t i = 0; i < unicode_data.special_casing_size; ++i) { auto special_casings = Detail::special_case_mapping(code_point);
auto const* special_casing = unicode_data.special_casing[i];
for (auto const* special_casing : special_casings) {
if (special_casing->locale != Locale::None && special_casing->locale != requested_locale) if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
continue; continue;
@ -241,13 +241,7 @@ String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Opti
u32 code_point = *it; u32 code_point = *it;
byte_length = it.underlying_code_point_length_in_bytes(); byte_length = it.underlying_code_point_length_in_bytes();
auto unicode_data = Detail::unicode_data_for_code_point(code_point); auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length);
if (!unicode_data.has_value()) {
builder.append_code_point(code_point);
continue;
}
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
if (!special_casing) { if (!special_casing) {
builder.append_code_point(to_unicode_lowercase(code_point)); builder.append_code_point(to_unicode_lowercase(code_point));
continue; continue;
@ -276,13 +270,7 @@ String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Opti
u32 code_point = *it; u32 code_point = *it;
byte_length = it.underlying_code_point_length_in_bytes(); byte_length = it.underlying_code_point_length_in_bytes();
auto unicode_data = Detail::unicode_data_for_code_point(code_point); auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length);
if (!unicode_data.has_value()) {
builder.append_code_point(code_point);
continue;
}
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
if (!special_casing) { if (!special_casing) {
builder.append_code_point(to_unicode_uppercase(code_point)); builder.append_code_point(to_unicode_uppercase(code_point));
continue; continue;