diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index b24a8ecb32e..06127b272a2 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -92,6 +92,7 @@ struct UnicodeData { u32 simple_lowercase_mapping_size { 0 }; Vector special_casing; + u32 code_points_with_special_casing { 0 }; u32 largest_casing_transform_size { 0 }; u32 largest_special_casing_size { 0 }; Vector conditions; @@ -428,14 +429,19 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) assigned_code_point_range_start = data.code_point; } + bool has_special_casing { false }; + for (auto const& casing : unicode_data.special_casing) { - if (casing.code_point == data.code_point) + if (casing.code_point == data.code_point) { data.special_casing_indices.append(casing.index); + has_special_casing = true; + } } unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value(); unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value(); + unicode_data.code_points_with_special_casing += has_special_casing; unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); previous_code_point = data.code_point; @@ -490,6 +496,7 @@ enum class @name@ : @underlying@ {)~~~"); #pragma once #include +#include #include #include #include @@ -559,6 +566,7 @@ Optional unicode_data_for_code_point(u32 code_point); u32 simple_uppercase_mapping(u32 code_point); u32 simple_lowercase_mapping(u32 code_point); +Span special_case_mapping(u32 code_point); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); Optional general_category_from_string(StringView const& general_category); @@ -583,6 +591,7 @@ static void generate_unicode_data_implementation(Core::File& file, UnicodeData c StringBuilder builder; SourceGenerator generator { builder }; + generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size)); generator.set("special_casing_size", String::number(unicode_data.special_casing.size())); generator.set("code_point_data_size", String::number(unicode_data.code_point_data.size())); @@ -679,20 +688,28 @@ struct CodePointMapping { u32 mapping { 0 }; }; +struct SpecialCaseMapping { + u32 code_point { 0 }; + Array special_casing {}; + u32 special_casing_size { 0 }; +}; + +template struct CodePointComparator { - constexpr int operator()(u32 code_point, CodePointMapping const& mapping) + constexpr int operator()(u32 code_point, MappingType const& mapping) { return code_point - mapping.code_point; } }; )~~~"); - auto append_code_point_mappings = [&](StringView name, u32 size, auto mapping_getter) { + auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) { generator.set("name", name); + generator.set("mapping_type", mapping_type); generator.set("size", String::number(size)); generator.append(R"~~~( -static constexpr Array s_@name@_mappings { { +static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { )~~~"); constexpr size_t max_mappings_per_row = 20; @@ -700,15 +717,28 @@ static constexpr Array s_@name@_mappings { { for (auto const& data : unicode_data.code_point_data) { auto mapping = mapping_getter(data); - if (!mapping.has_value()) - continue; + + if constexpr (IsSame>) { + if (!mapping.has_value()) + continue; + } else { + if (mapping.is_empty()) + continue; + } if (mappings_in_current_row++ > 0) generator.append(" "); generator.set("code_point", String::formatted("{:#x}", data.code_point)); - generator.set("mapping", String::formatted("{:#x}", *mapping)); - generator.append("{ @code_point@, @mapping@ },"); + generator.append("{ @code_point@"); + + if constexpr (IsSame>) { + generator.set("mapping", String::formatted("{:#x}", *mapping)); + generator.append(", @mapping@ },"); + } else { + append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv); + generator.append(" },"); + } if (mappings_in_current_row == max_mappings_per_row) { mappings_in_current_row = 0; @@ -720,8 +750,9 @@ static constexpr Array s_@name@_mappings { { )~~~"); }; - append_code_point_mappings("uppercase"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); - append_code_point_mappings("lowercase"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); + append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); + append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); + append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; }); generator.append(R"~~~( struct CodePointRange { @@ -852,7 +883,7 @@ Optional unicode_data_for_code_point(u32 code_point) generator.append(R"~~~( u32 @method@(u32 code_point) { - auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {}); + auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {}); return mapping ? mapping->mapping : code_point; } )~~~"); @@ -861,6 +892,17 @@ u32 @method@(u32 code_point) append_code_point_mapping_search("simple_uppercase_mapping"sv, "s_uppercase_mappings"sv); append_code_point_mapping_search("simple_lowercase_mapping"sv, "s_lowercase_mappings"sv); + generator.append(R"~~~( +Span special_case_mapping(u32 code_point) +{ + auto const* mapping = binary_search(s_special_case_mappings, code_point, nullptr, CodePointComparator {}); + if (mapping == nullptr) + return {}; + + return mapping->special_casing.span().slice(0, mapping->special_casing_size); +} +)~~~"); + auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) { generator.set("enum_title", enum_title); generator.set("enum_snake", enum_snake); diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 0398c92bf31..83350e8a751 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -159,7 +159,7 @@ static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t in return false; } -static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional locale, size_t index, size_t byte_length, UnicodeData const& unicode_data) +static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional locale, size_t index, size_t byte_length) { auto requested_locale = Locale::None; @@ -168,9 +168,9 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, O requested_locale = *maybe_locale; } - for (size_t i = 0; i < unicode_data.special_casing_size; ++i) { - auto const* special_casing = unicode_data.special_casing[i]; + auto special_casings = Detail::special_case_mapping(code_point); + for (auto const* special_casing : special_casings) { if (special_casing->locale != Locale::None && special_casing->locale != requested_locale) continue; @@ -241,13 +241,7 @@ String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Opti u32 code_point = *it; byte_length = it.underlying_code_point_length_in_bytes(); - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) { - builder.append_code_point(code_point); - continue; - } - - auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data); + auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length); if (!special_casing) { builder.append_code_point(to_unicode_lowercase(code_point)); continue; @@ -276,13 +270,7 @@ String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Opti u32 code_point = *it; byte_length = it.underlying_code_point_length_in_bytes(); - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) { - builder.append_code_point(code_point); - continue; - } - - auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data); + auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length); if (!special_casing) { builder.append_code_point(to_unicode_uppercase(code_point)); continue;