LibUnicode: Generate data for bidirectional character types
This lets us examine code points to determine the direction (right-to-left or left-to-right) of a piece of text.
This commit is contained in:
parent
ef6133337e
commit
0d021a63c7
Notes:
sideshowbarker
2024-07-17 09:49:33 +09:00
Author: https://github.com/AtkinsSJ Commit: https://github.com/SerenityOS/serenity/commit/0d021a63c7 Pull-request: https://github.com/SerenityOS/serenity/pull/20532 Reviewed-by: https://github.com/trflynn89 ✅
5 changed files with 95 additions and 0 deletions
|
@ -125,6 +125,11 @@ struct CodePointTables {
|
|||
Vector<PropertyType> unique_properties;
|
||||
};
|
||||
|
||||
// One entry of the bidirectional-class table collected while parsing the Unicode data file.
// `code_point_range` is the inclusive first/last code point span the entry covers (a single
// code point is stored with first == last), and `bidi_class` is the class name as a string;
// the generator later emits it verbatim as the `BidirectionalClass::<name>` enumerator.
// Member order matters: entries are built with positional brace-initialisation.
struct CodePointBidiClass {
|
||||
Unicode::CodePointRange code_point_range;
|
||||
DeprecatedString bidi_class;
|
||||
};
|
||||
|
||||
struct UnicodeData {
|
||||
UniqueStringStorage unique_strings;
|
||||
|
||||
|
@ -184,6 +189,9 @@ struct UnicodeData {
|
|||
CodePointTables<PropertyTable> grapheme_break_tables;
|
||||
CodePointTables<PropertyTable> word_break_tables;
|
||||
CodePointTables<PropertyTable> sentence_break_tables;
|
||||
|
||||
HashTable<DeprecatedString> bidirectional_classes;
|
||||
Vector<CodePointBidiClass> code_point_bidirectional_classes;
|
||||
};
|
||||
|
||||
static DeprecatedString sanitize_entry(DeprecatedString const& entry)
|
||||
|
@ -725,8 +733,10 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
|
|||
code_point_range_start.clear();
|
||||
|
||||
add_canonical_code_point_name(code_point_range, data.name, unicode_data);
|
||||
unicode_data.code_point_bidirectional_classes.append({ code_point_range, data.bidi_class });
|
||||
} else {
|
||||
add_canonical_code_point_name({ data.code_point, data.code_point }, data.name, unicode_data);
|
||||
unicode_data.code_point_bidirectional_classes.append({ { data.code_point, data.code_point }, data.bidi_class });
|
||||
|
||||
if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) {
|
||||
VERIFY(assigned_code_point_range_start.has_value());
|
||||
|
@ -748,6 +758,8 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
|
|||
|
||||
unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value();
|
||||
|
||||
unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep);
|
||||
|
||||
previous_code_point = data.code_point;
|
||||
unicode_data.code_point_data.append(move(data));
|
||||
}
|
||||
|
@ -818,6 +830,7 @@ namespace Unicode {
|
|||
generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
|
||||
generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
|
||||
generate_enum("CompatibilityFormattingTag"sv, "Canonical"sv, unicode_data.compatibility_tags);
|
||||
generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
|
||||
|
||||
generator.append(R"~~~(
|
||||
struct SpecialCasing {
|
||||
|
@ -1003,6 +1016,19 @@ struct CodePointNameComparator : public CodePointRangeComparator {
|
|||
return CodePointRangeComparator::operator()(code_point, name.code_point_range);
|
||||
}
|
||||
};
|
||||
|
||||
struct BidiClassData {
|
||||
CodePointRange code_point_range {};
|
||||
BidirectionalClass bidi_class {};
|
||||
};
|
||||
|
||||
struct CodePointBidiClassComparator : public CodePointRangeComparator {
|
||||
constexpr int operator()(u32 code_point, BidiClassData const& bidi_class)
|
||||
{
|
||||
return CodePointRangeComparator::operator()(code_point, bidi_class.code_point_range);
|
||||
}
|
||||
};
|
||||
|
||||
)~~~");
|
||||
|
||||
generator.set("decomposition_mappings_size", DeprecatedString::number(unicode_data.decomposition_mappings.size()));
|
||||
|
@ -1212,6 +1238,33 @@ static constexpr Array<@type@, @size@> @name@ { {
|
|||
append_code_point_display_names("BlockNameData"sv, "s_block_display_names"sv, unicode_data.block_display_names);
|
||||
append_code_point_display_names("CodePointName"sv, "s_code_point_display_names"sv, unicode_data.code_point_display_names);
|
||||
|
||||
{
|
||||
constexpr size_t max_bidi_classes_per_row = 20;
|
||||
size_t bidi_classes_in_current_row = 0;
|
||||
|
||||
generator.set("size"sv, DeprecatedString::number(unicode_data.code_point_bidirectional_classes.size()));
|
||||
generator.append(R"~~~(
|
||||
static constexpr Array<BidiClassData, @size@> s_bidirectional_classes { {
|
||||
)~~~");
|
||||
for (auto const& data : unicode_data.code_point_bidirectional_classes) {
|
||||
if (bidi_classes_in_current_row++ > 0)
|
||||
generator.append(", ");
|
||||
|
||||
generator.set("first", DeprecatedString::formatted("{:#x}", data.code_point_range.first));
|
||||
generator.set("last", DeprecatedString::formatted("{:#x}", data.code_point_range.last));
|
||||
generator.set("bidi_class", data.bidi_class);
|
||||
generator.append("{ { @first@, @last@ }, BidirectionalClass::@bidi_class@ }");
|
||||
|
||||
if (bidi_classes_in_current_row == max_bidi_classes_per_row) {
|
||||
bidi_classes_in_current_row = 0;
|
||||
generator.append(",\n ");
|
||||
}
|
||||
}
|
||||
generator.append(R"~~~(
|
||||
} };
|
||||
)~~~");
|
||||
}
|
||||
|
||||
generator.append(R"~~~(
|
||||
Optional<StringView> code_point_block_display_name(u32 code_point)
|
||||
{
|
||||
|
@ -1335,6 +1388,14 @@ Optional<CodePointDecomposition const> code_point_decomposition_by_index(size_t
|
|||
auto const& mapping = s_decomposition_mappings[index];
|
||||
return CodePointDecomposition { mapping.code_point, mapping.tag, ReadonlySpan<u32> { s_decomposition_mappings_data.data() + mapping.decomposition_index, mapping.decomposition_count } };
|
||||
}
|
||||
|
||||
Optional<BidirectionalClass> bidirectional_class(u32 code_point)
|
||||
{
|
||||
if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {}))
|
||||
return entry->bidi_class;
|
||||
|
||||
return {};
|
||||
}
|
||||
)~~~");
|
||||
|
||||
auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) -> ErrorOr<void> {
|
||||
|
@ -1396,6 +1457,8 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
|
|||
TRY(append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv));
|
||||
TRY(append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv));
|
||||
|
||||
TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {}));
|
||||
|
||||
generator.append(R"~~~(
|
||||
}
|
||||
)~~~");
|
||||
|
|
|
@ -848,3 +848,28 @@ TEST_CASE(code_point_display_name)
|
|||
EXPECT_EQ(code_point_display_name(0x2f802), "CJK COMPATIBILITY IDEOGRAPH-2F802"sv);
|
||||
EXPECT_EQ(code_point_display_name(0x2fa1d), "CJK COMPATIBILITY IDEOGRAPH-2FA1D"sv);
|
||||
}
|
||||
|
||||
// Spot-checks Unicode::bidirectional_class() for a handful of well-known code points,
// comparing each result against the class obtained by name from
// Unicode::bidirectional_class_from_string().
TEST_CASE(code_point_bidirectional_character_type)
{
    // Looks up a class by its short name; the name must be known.
    auto class_named = [](StringView name) {
        auto maybe_class = Unicode::bidirectional_class_from_string(name);
        VERIFY(maybe_class.has_value());
        return maybe_class.release_value();
    };

    // Looks up the class assigned to a code point; the code point must have one.
    auto class_of = [](u32 code_point) {
        auto maybe_class = Unicode::bidirectional_class(code_point);
        VERIFY(maybe_class.has_value());
        return maybe_class.release_value();
    };

    // Latin letters are left-to-right.
    EXPECT_EQ(class_of('A'), class_named("L"sv));
    EXPECT_EQ(class_of('z'), class_named("L"sv));

    // ASCII digits are European numbers.
    EXPECT_EQ(class_of('7'), class_named("EN"sv));

    // The space character is whitespace.
    EXPECT_EQ(class_of(' '), class_named("WS"sv));

    // U+FEB4 ARABIC LETTER SEEN MEDIAL FORM is an Arabic right-to-left letter.
    EXPECT_EQ(class_of(0xFEB4), class_named("AL"sv));
}
|
||||
|
|
|
@ -147,4 +147,7 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB
|
|||
bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
|
||||
bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
|
||||
|
||||
// Weak fallback definition: ignores its argument and always returns an empty Optional.
// Being weak, it yields to any non-weak definition of the same function at link time.
Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; }
|
||||
// Weak fallback definition: ignores the code point and always returns an empty Optional.
// Being weak, it yields to any non-weak definition of the same function at link time.
Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class(u32) { return {}; }
|
||||
|
||||
}
|
||||
|
|
|
@ -68,4 +68,7 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert
|
|||
bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
|
||||
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
|
||||
|
||||
// Maps a bidirectional class name (e.g. "L", "EN", "WS", "AL") to its BidirectionalClass value.
// Returns an empty Optional when no value is available for the given string.
Optional<BidirectionalClass> bidirectional_class_from_string(StringView);
|
||||
// Returns the bidirectional class recorded for `code_point`, or an empty Optional when no
// table entry covers it (or when no Unicode data is available).
Optional<BidirectionalClass> bidirectional_class(u32 code_point);
|
||||
|
||||
}
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
|
||||
namespace Unicode {
|
||||
|
||||
enum class BidirectionalClass : u8;
|
||||
enum class Block : u16;
|
||||
enum class EmojiGroup : u8;
|
||||
enum class GeneralCategory : u8;
|
||||
|
|
Loading…
Add table
Reference in a new issue