From 9f837749137038fe7cbd23b0b7e7e963b9f3148e Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sat, 9 Oct 2021 18:52:37 -0400 Subject: [PATCH] LibUnicode: Generate standalone compile-time array for special casing There are only 112 code points with special casing rules, so this array is quite small (compared to the UnicodeData hash map of size 34,626, which also stores this data). Removing all casing rules from UnicodeData will happen in a subsequent commit. --- .../LibUnicode/GenerateUnicodeData.cpp | 64 +++++++++++++++---- .../Libraries/LibUnicode/CharacterTypes.cpp | 22 ++----- 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index b24a8ecb32e..06127b272a2 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -92,6 +92,7 @@ struct UnicodeData { u32 simple_lowercase_mapping_size { 0 }; Vector special_casing; + u32 code_points_with_special_casing { 0 }; u32 largest_casing_transform_size { 0 }; u32 largest_special_casing_size { 0 }; Vector conditions; @@ -428,14 +429,19 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) assigned_code_point_range_start = data.code_point; } + bool has_special_casing { false }; + for (auto const& casing : unicode_data.special_casing) { - if (casing.code_point == data.code_point) + if (casing.code_point == data.code_point) { data.special_casing_indices.append(casing.index); + has_special_casing = true; + } } unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value(); unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value(); + unicode_data.code_points_with_special_casing += has_special_casing; unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, 
data.special_casing_indices.size()); previous_code_point = data.code_point; @@ -490,6 +496,7 @@ enum class @name@ : @underlying@ {)~~~"); #pragma once #include +#include #include #include #include @@ -559,6 +566,7 @@ Optional unicode_data_for_code_point(u32 code_point); u32 simple_uppercase_mapping(u32 code_point); u32 simple_lowercase_mapping(u32 code_point); +Span special_case_mapping(u32 code_point); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); Optional general_category_from_string(StringView const& general_category); @@ -583,6 +591,7 @@ static void generate_unicode_data_implementation(Core::File& file, UnicodeData c StringBuilder builder; SourceGenerator generator { builder }; + generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size)); generator.set("special_casing_size", String::number(unicode_data.special_casing.size())); generator.set("code_point_data_size", String::number(unicode_data.code_point_data.size())); @@ -679,20 +688,28 @@ struct CodePointMapping { u32 mapping { 0 }; }; +struct SpecialCaseMapping { + u32 code_point { 0 }; + Array special_casing {}; + u32 special_casing_size { 0 }; +}; + +template struct CodePointComparator { - constexpr int operator()(u32 code_point, CodePointMapping const& mapping) + constexpr int operator()(u32 code_point, MappingType const& mapping) { return code_point - mapping.code_point; } }; )~~~"); - auto append_code_point_mappings = [&](StringView name, u32 size, auto mapping_getter) { + auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) { generator.set("name", name); + generator.set("mapping_type", mapping_type); generator.set("size", String::number(size)); generator.append(R"~~~( -static constexpr Array s_@name@_mappings { { +static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { )~~~"); constexpr size_t max_mappings_per_row = 20; @@ -700,15 +717,28 @@ static 
constexpr Array s_@name@_mappings { { for (auto const& data : unicode_data.code_point_data) { auto mapping = mapping_getter(data); - if (!mapping.has_value()) - continue; + + if constexpr (IsSame>) { + if (!mapping.has_value()) + continue; + } else { + if (mapping.is_empty()) + continue; + } if (mappings_in_current_row++ > 0) generator.append(" "); generator.set("code_point", String::formatted("{:#x}", data.code_point)); - generator.set("mapping", String::formatted("{:#x}", *mapping)); - generator.append("{ @code_point@, @mapping@ },"); + generator.append("{ @code_point@"); + + if constexpr (IsSame>) { + generator.set("mapping", String::formatted("{:#x}", *mapping)); + generator.append(", @mapping@ },"); + } else { + append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv); + generator.append(" },"); + } if (mappings_in_current_row == max_mappings_per_row) { mappings_in_current_row = 0; @@ -720,8 +750,9 @@ static constexpr Array s_@name@_mappings { { )~~~"); }; - append_code_point_mappings("uppercase"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); - append_code_point_mappings("lowercase"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); + append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); + append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); + append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; }); generator.append(R"~~~( struct CodePointRange { @@ -852,7 +883,7 @@ Optional unicode_data_for_code_point(u32 code_point) generator.append(R"~~~( u32 @method@(u32 
code_point) { - auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {}); + auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {}); return mapping ? mapping->mapping : code_point; } )~~~"); @@ -861,6 +892,17 @@ u32 @method@(u32 code_point) append_code_point_mapping_search("simple_uppercase_mapping"sv, "s_uppercase_mappings"sv); append_code_point_mapping_search("simple_lowercase_mapping"sv, "s_lowercase_mappings"sv); + generator.append(R"~~~( +Span special_case_mapping(u32 code_point) +{ + auto const* mapping = binary_search(s_special_case_mappings, code_point, nullptr, CodePointComparator {}); + if (mapping == nullptr) + return {}; + + return mapping->special_casing.span().slice(0, mapping->special_casing_size); +} +)~~~"); + auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) { generator.set("enum_title", enum_title); generator.set("enum_snake", enum_snake); diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 0398c92bf31..83350e8a751 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -159,7 +159,7 @@ static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t in return false; } -static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional locale, size_t index, size_t byte_length, UnicodeData const& unicode_data) +static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional locale, size_t index, size_t byte_length) { auto requested_locale = Locale::None; @@ -168,9 +168,9 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, O requested_locale = *maybe_locale; } - for (size_t i = 0; i < unicode_data.special_casing_size; ++i) { - auto const* special_casing = unicode_data.special_casing[i]; + auto 
special_casings = Detail::special_case_mapping(code_point); + for (auto const* special_casing : special_casings) { if (special_casing->locale != Locale::None && special_casing->locale != requested_locale) continue; @@ -241,13 +241,7 @@ String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Opti u32 code_point = *it; byte_length = it.underlying_code_point_length_in_bytes(); - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) { - builder.append_code_point(code_point); - continue; - } - - auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data); + auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length); if (!special_casing) { builder.append_code_point(to_unicode_lowercase(code_point)); continue; @@ -276,13 +270,7 @@ String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Opti u32 code_point = *it; byte_length = it.underlying_code_point_length_in_bytes(); - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) { - builder.append_code_point(code_point); - continue; - } - - auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data); + auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length); if (!special_casing) { builder.append_code_point(to_unicode_uppercase(code_point)); continue;