mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-26 01:20:25 +00:00
LibUnicode: Generate code point abbreviations
This commit is contained in:
parent
444b2d9ec2
commit
701b7810ba
Notes:
sideshowbarker
2024-07-17 20:40:36 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/701b7810ba0 Pull-request: https://github.com/SerenityOS/serenity/pull/11993
3 changed files with 33 additions and 6 deletions
|
@ -72,6 +72,7 @@ struct CodePointName {
|
||||||
struct CodePointData {
|
struct CodePointData {
|
||||||
u32 code_point { 0 };
|
u32 code_point { 0 };
|
||||||
String name;
|
String name;
|
||||||
|
Optional<StringView> abbreviation;
|
||||||
u8 canonical_combining_class { 0 };
|
u8 canonical_combining_class { 0 };
|
||||||
String bidi_class;
|
String bidi_class;
|
||||||
String decomposition_type;
|
String decomposition_type;
|
||||||
|
@ -101,6 +102,7 @@ struct UnicodeData {
|
||||||
|
|
||||||
Vector<CodePointData> code_point_data;
|
Vector<CodePointData> code_point_data;
|
||||||
|
|
||||||
|
HashMap<u32, String> code_point_abbreviations;
|
||||||
HashMap<u32, String> code_point_display_name_aliases;
|
HashMap<u32, String> code_point_display_name_aliases;
|
||||||
Vector<CodePointName> code_point_display_names;
|
Vector<CodePointName> code_point_display_names;
|
||||||
|
|
||||||
|
@ -302,11 +304,12 @@ static void parse_name_aliases(Core::File& file, UnicodeData& unicode_data)
|
||||||
auto alias = segments[1].trim_whitespace();
|
auto alias = segments[1].trim_whitespace();
|
||||||
auto reason = segments[2].trim_whitespace();
|
auto reason = segments[2].trim_whitespace();
|
||||||
|
|
||||||
if (!reason.is_one_of("correction"sv, "control"sv))
|
if (reason == "abbreviation"sv) {
|
||||||
continue;
|
unicode_data.code_point_abbreviations.set(*code_point, alias);
|
||||||
|
} else if (reason.is_one_of("correction"sv, "control"sv)) {
|
||||||
if (!unicode_data.code_point_display_name_aliases.contains(*code_point))
|
if (!unicode_data.code_point_display_name_aliases.contains(*code_point))
|
||||||
unicode_data.code_point_display_name_aliases.set(*code_point, alias);
|
unicode_data.code_point_display_name_aliases.set(*code_point, alias);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -476,6 +479,9 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
|
||||||
data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
|
data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
|
||||||
data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
|
data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
|
||||||
|
|
||||||
|
if (auto abbreviation = unicode_data.code_point_abbreviations.get(data.code_point); abbreviation.has_value())
|
||||||
|
data.abbreviation = *abbreviation;
|
||||||
|
|
||||||
if (!assigned_code_point_range_start.has_value())
|
if (!assigned_code_point_range_start.has_value())
|
||||||
assigned_code_point_range_start = data.code_point;
|
assigned_code_point_range_start = data.code_point;
|
||||||
|
|
||||||
|
@ -683,6 +689,11 @@ struct SpecialCaseMapping {
|
||||||
u32 special_casing_size { 0 };
|
u32 special_casing_size { 0 };
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct CodePointAbbreviation {
|
||||||
|
u32 code_point { 0 };
|
||||||
|
StringView abbreviation {};
|
||||||
|
};
|
||||||
|
|
||||||
template<typename MappingType>
|
template<typename MappingType>
|
||||||
struct CodePointComparator {
|
struct CodePointComparator {
|
||||||
constexpr int operator()(u32 code_point, MappingType const& mapping)
|
constexpr int operator()(u32 code_point, MappingType const& mapping)
|
||||||
|
@ -707,7 +718,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
|
||||||
for (auto const& data : unicode_data.code_point_data) {
|
for (auto const& data : unicode_data.code_point_data) {
|
||||||
auto mapping = mapping_getter(data);
|
auto mapping = mapping_getter(data);
|
||||||
|
|
||||||
if constexpr (IsSame<decltype(mapping), Optional<u32>>) {
|
if constexpr (requires { mapping.has_value(); }) {
|
||||||
if (!mapping.has_value())
|
if (!mapping.has_value())
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
|
@ -724,6 +735,9 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
|
||||||
if constexpr (IsSame<decltype(mapping), Optional<u32>>) {
|
if constexpr (IsSame<decltype(mapping), Optional<u32>>) {
|
||||||
generator.set("mapping", String::formatted("{:#x}", *mapping));
|
generator.set("mapping", String::formatted("{:#x}", *mapping));
|
||||||
generator.append(", @mapping@ },");
|
generator.append(", @mapping@ },");
|
||||||
|
} else if constexpr (IsSame<decltype(mapping), Optional<StringView>>) {
|
||||||
|
generator.set("mapping", String::formatted("{}", *mapping));
|
||||||
|
generator.append(", \"@mapping@\"sv },");
|
||||||
} else {
|
} else {
|
||||||
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
|
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
|
||||||
generator.append(" },");
|
generator.append(" },");
|
||||||
|
@ -748,6 +762,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
|
||||||
append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; });
|
append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; });
|
||||||
append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
|
append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
|
||||||
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
|
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
|
||||||
|
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
|
||||||
|
|
||||||
generator.append(R"~~~(
|
generator.append(R"~~~(
|
||||||
struct CodePointRange {
|
struct CodePointRange {
|
||||||
|
@ -891,6 +906,15 @@ Span<SpecialCasing const* const> special_case_mapping(u32 code_point)
|
||||||
|
|
||||||
return mapping->special_casing.span().slice(0, mapping->special_casing_size);
|
return mapping->special_casing.span().slice(0, mapping->special_casing_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Optional<StringView> code_point_abbreviation(u32 code_point)
|
||||||
|
{
|
||||||
|
auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
|
||||||
|
if (mapping == nullptr)
|
||||||
|
return {};
|
||||||
|
|
||||||
|
return mapping->abbreviation;
|
||||||
|
}
|
||||||
)~~~");
|
)~~~");
|
||||||
|
|
||||||
auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {
|
auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
namespace Unicode {
|
namespace Unicode {
|
||||||
|
|
||||||
Optional<String> __attribute__((weak)) code_point_display_name(u32) { return {}; }
|
Optional<String> __attribute__((weak)) code_point_display_name(u32) { return {}; }
|
||||||
|
Optional<StringView> __attribute__((weak)) code_point_abbreviation(u32) { return {}; }
|
||||||
u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
|
u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
|
||||||
Span<SpecialCasing const* const> __attribute__((weak)) special_case_mapping(u32) { return {}; }
|
Span<SpecialCasing const* const> __attribute__((weak)) special_case_mapping(u32) { return {}; }
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,8 @@
|
||||||
namespace Unicode {
|
namespace Unicode {
|
||||||
|
|
||||||
Optional<String> code_point_display_name(u32 code_point);
|
Optional<String> code_point_display_name(u32 code_point);
|
||||||
|
Optional<StringView> code_point_abbreviation(u32 code_point);
|
||||||
|
|
||||||
u32 canonical_combining_class(u32 code_point);
|
u32 canonical_combining_class(u32 code_point);
|
||||||
Span<SpecialCasing const* const> special_case_mapping(u32 code_point);
|
Span<SpecialCasing const* const> special_case_mapping(u32 code_point);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue