LibUnicode: Generate code point abbreviations

This commit is contained in:
Timothy Flynn 2022-01-18 08:29:47 -05:00 committed by Linus Groh
parent 444b2d9ec2
commit 701b7810ba
Notes: sideshowbarker 2024-07-17 20:40:36 +09:00
3 changed files with 33 additions and 6 deletions

View file

@ -72,6 +72,7 @@ struct CodePointName {
struct CodePointData { struct CodePointData {
u32 code_point { 0 }; u32 code_point { 0 };
String name; String name;
Optional<StringView> abbreviation;
u8 canonical_combining_class { 0 }; u8 canonical_combining_class { 0 };
String bidi_class; String bidi_class;
String decomposition_type; String decomposition_type;
@ -101,6 +102,7 @@ struct UnicodeData {
Vector<CodePointData> code_point_data; Vector<CodePointData> code_point_data;
HashMap<u32, String> code_point_abbreviations;
HashMap<u32, String> code_point_display_name_aliases; HashMap<u32, String> code_point_display_name_aliases;
Vector<CodePointName> code_point_display_names; Vector<CodePointName> code_point_display_names;
@ -302,11 +304,12 @@ static void parse_name_aliases(Core::File& file, UnicodeData& unicode_data)
auto alias = segments[1].trim_whitespace(); auto alias = segments[1].trim_whitespace();
auto reason = segments[2].trim_whitespace(); auto reason = segments[2].trim_whitespace();
if (!reason.is_one_of("correction"sv, "control"sv)) if (reason == "abbreviation"sv) {
continue; unicode_data.code_point_abbreviations.set(*code_point, alias);
} else if (reason.is_one_of("correction"sv, "control"sv)) {
if (!unicode_data.code_point_display_name_aliases.contains(*code_point)) if (!unicode_data.code_point_display_name_aliases.contains(*code_point))
unicode_data.code_point_display_name_aliases.set(*code_point, alias); unicode_data.code_point_display_name_aliases.set(*code_point, alias);
}
} }
} }
@ -476,6 +479,9 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]); data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]); data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
if (auto abbreviation = unicode_data.code_point_abbreviations.get(data.code_point); abbreviation.has_value())
data.abbreviation = *abbreviation;
if (!assigned_code_point_range_start.has_value()) if (!assigned_code_point_range_start.has_value())
assigned_code_point_range_start = data.code_point; assigned_code_point_range_start = data.code_point;
@ -683,6 +689,11 @@ struct SpecialCaseMapping {
u32 special_casing_size { 0 }; u32 special_casing_size { 0 };
}; };
struct CodePointAbbreviation {
u32 code_point { 0 };
StringView abbreviation {};
};
template<typename MappingType> template<typename MappingType>
struct CodePointComparator { struct CodePointComparator {
constexpr int operator()(u32 code_point, MappingType const& mapping) constexpr int operator()(u32 code_point, MappingType const& mapping)
@ -707,7 +718,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
for (auto const& data : unicode_data.code_point_data) { for (auto const& data : unicode_data.code_point_data) {
auto mapping = mapping_getter(data); auto mapping = mapping_getter(data);
if constexpr (IsSame<decltype(mapping), Optional<u32>>) { if constexpr (requires { mapping.has_value(); }) {
if (!mapping.has_value()) if (!mapping.has_value())
continue; continue;
} else { } else {
@ -724,6 +735,9 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
if constexpr (IsSame<decltype(mapping), Optional<u32>>) { if constexpr (IsSame<decltype(mapping), Optional<u32>>) {
generator.set("mapping", String::formatted("{:#x}", *mapping)); generator.set("mapping", String::formatted("{:#x}", *mapping));
generator.append(", @mapping@ },"); generator.append(", @mapping@ },");
} else if constexpr (IsSame<decltype(mapping), Optional<StringView>>) {
generator.set("mapping", String::formatted("{}", *mapping));
generator.append(", \"@mapping@\"sv },");
} else { } else {
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv); append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
generator.append(" },"); generator.append(" },");
@ -748,6 +762,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; });
append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; }); append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
generator.append(R"~~~( generator.append(R"~~~(
struct CodePointRange { struct CodePointRange {
@ -891,6 +906,15 @@ Span<SpecialCasing const* const> special_case_mapping(u32 code_point)
return mapping->special_casing.span().slice(0, mapping->special_casing_size); return mapping->special_casing.span().slice(0, mapping->special_casing_size);
} }
Optional<StringView> code_point_abbreviation(u32 code_point)
{
auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
if (mapping == nullptr)
return {};
return mapping->abbreviation;
}
)~~~"); )~~~");
auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) { auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {

View file

@ -22,6 +22,7 @@
namespace Unicode { namespace Unicode {
Optional<String> __attribute__((weak)) code_point_display_name(u32) { return {}; } Optional<String> __attribute__((weak)) code_point_display_name(u32) { return {}; }
// Weak default definition of code_point_abbreviation(): ignores its argument and
// always returns an empty Optional.
// NOTE(review): presumably superseded at link time by the generated (strong)
// definition when the Unicode data is built — confirm against the build setup.
Optional<StringView> __attribute__((weak)) code_point_abbreviation(u32) { return {}; }
u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; } u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
Span<SpecialCasing const* const> __attribute__((weak)) special_case_mapping(u32) { return {}; } Span<SpecialCasing const* const> __attribute__((weak)) special_case_mapping(u32) { return {}; }

View file

@ -16,6 +16,8 @@
namespace Unicode { namespace Unicode {
Optional<String> code_point_display_name(u32 code_point); Optional<String> code_point_display_name(u32 code_point);
// Looks up the abbreviation associated with `code_point`.
// Returns an empty Optional when no abbreviation is known for that code point.
// NOTE(review): the returned StringView presumably refers to static generated
// string data, so it should not need to be copied by callers — confirm.
Optional<StringView> code_point_abbreviation(u32 code_point);
u32 canonical_combining_class(u32 code_point); u32 canonical_combining_class(u32 code_point);
Span<SpecialCasing const* const> special_case_mapping(u32 code_point); Span<SpecialCasing const* const> special_case_mapping(u32 code_point);