LibUnicode: Replace code point general categories with ICU

This commit is contained in:
Timothy Flynn 2024-06-21 11:24:00 -04:00 committed by Andreas Kling
parent c804bda5fd
commit 986ff984cc
Notes: sideshowbarker 2024-07-17 16:23:06 +09:00
9 changed files with 90 additions and 68 deletions

View file

@ -13,9 +13,6 @@ set(UCD_ZIP_PATH "${UCD_PATH}/UCD.zip")
set(UNICODE_DATA_SOURCE "UnicodeData.txt")
set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}")
set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt")
set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}")
set(PROP_VALUE_ALIAS_SOURCE "PropertyValueAliases.txt")
set(PROP_VALUE_ALIAS_PATH "${UCD_PATH}/${PROP_VALUE_ALIAS_SOURCE}")
@ -39,7 +36,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
if (ENABLE_NETWORK_DOWNLOADS)
download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_VALUE_ALIAS_SOURCE}" "${PROP_VALUE_ALIAS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPTS_SOURCE}" "${SCRIPTS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPT_EXTENSIONS_SOURCE}" "${SCRIPT_EXTENSIONS_PATH}")
@ -67,7 +63,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
"${UCD_VERSION_FILE}"
"${UNICODE_DATA_HEADER}"
"${UNICODE_DATA_IMPLEMENTATION}"
arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}"
arguments -u "${UNICODE_DATA_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}"
)
invoke_generator(
"EmojiData"

View file

@ -60,17 +60,12 @@ struct CodePointBidiClass {
struct UnicodeData {
Vector<CodePointData> code_point_data;
// https://www.unicode.org/reports/tr44/#General_Category_Values
PropList general_categories;
Vector<Alias> general_category_aliases;
PropList script_list {
{ "Unknown"sv, {} },
};
Vector<Alias> script_aliases;
PropList script_extensions;
CodePointTables<PropertyTable> general_category_tables;
CodePointTables<PropertyTable> script_tables;
CodePointTables<PropertyTable> script_extension_tables;
@ -290,7 +285,6 @@ enum class @name@ : @underlying@ {)~~~");
namespace Unicode {
)~~~");
generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
@ -405,7 +399,6 @@ static constexpr Array<@type@, @size@> @name@ { {
return {};
};
TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables, append_property_table));
TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table));
TRY(append_code_point_tables("s_script_extensions"sv, unicode_data.script_extension_tables, append_property_table));
@ -489,9 +482,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
return {};
};
TRY(append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv));
TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases));
TRY(append_prop_search("Script"sv, "script"sv, "s_scripts"sv));
TRY(append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv));
TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases));
@ -561,29 +551,6 @@ static void sort_and_merge_code_point_ranges(Vector<Unicode::CodePointRange>& co
code_points = form_code_point_ranges(all_code_points);
}
// Adds the union General Category values (LC, L, M, N, P, S, Z, C) to `general_categories`, each
// built by concatenating the code point ranges of its member categories and then sorting/merging.
static void populate_general_category_unions(PropList& general_categories)
{
    // Several General Category values defined by the Unicode standard do not appear in any UCD
    // file; each of them is just the union of other, concrete values.
    // https://www.unicode.org/reports/tr44/#GC_Values_Table
    auto build_union = [&](auto union_name, auto members) {
        // Create (or fetch) the entry for the union before reading any member entries.
        auto& merged_ranges = general_categories.ensure(union_name);

        for (auto const& member : members) {
            // NOTE(review): the lookup result is dereferenced unchecked, so every member
            // category is expected to already be present in the list.
            auto const& member_ranges = general_categories.find(member)->value;
            merged_ranges.extend(member_ranges);
        }

        sort_and_merge_code_point_ranges(merged_ranges);
    };

    build_union("LC"sv, Array { "Ll"sv, "Lu"sv, "Lt"sv });
    build_union("L"sv, Array { "Lu"sv, "Ll"sv, "Lt"sv, "Lm"sv, "Lo"sv });
    build_union("M"sv, Array { "Mn"sv, "Mc"sv, "Me"sv });
    build_union("N"sv, Array { "Nd"sv, "Nl"sv, "No"sv });
    build_union("P"sv, Array { "Pc"sv, "Pd"sv, "Ps"sv, "Pe"sv, "Pi"sv, "Pf"sv, "Po"sv });
    build_union("S"sv, Array { "Sm"sv, "Sc"sv, "Sk"sv, "So"sv });
    build_union("Z"sv, Array { "Zs"sv, "Zl"sv, "Zp"sv });
    build_union("C"sv, Array { "Cc"sv, "Cf"sv, "Cs"sv, "Co"sv, "Cn"sv });
}
static ErrorOr<void> normalize_script_extensions(PropList& script_extensions, PropList const& script_list, Vector<Alias> const& script_aliases)
{
// The ScriptExtensions UCD file lays out its code point ranges rather uniquely compared to
@ -755,12 +722,10 @@ static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
return {};
};
auto general_category_metadata = TRY(PropertyMetadata::create(unicode_data.general_categories));
auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list));
auto script_extension_metadata = TRY(PropertyMetadata::create(unicode_data.script_extensions));
for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) {
TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata));
TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata));
TRY(update_property_tables(code_point, unicode_data.script_extension_tables, script_extension_metadata));
}
@ -773,7 +738,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
StringView generated_header_path;
StringView generated_implementation_path;
StringView unicode_data_path;
StringView derived_general_category_path;
StringView prop_value_alias_path;
StringView scripts_path;
StringView script_extensions_path;
@ -782,7 +746,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path");
args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
@ -791,19 +754,15 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read));
auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::File::OpenMode::Read));
auto prop_value_alias_file = TRY(open_file(prop_value_alias_path, Core::File::OpenMode::Read));
auto scripts_file = TRY(open_file(scripts_path, Core::File::OpenMode::Read));
auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read));
UnicodeData unicode_data {};
TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
TRY(parse_prop_list(*scripts_file, unicode_data.script_list));
TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true));
populate_general_category_unions(unicode_data.general_categories);
TRY(parse_unicode_data(*unicode_data_file, unicode_data));
TRY(parse_value_alias_list(*prop_value_alias_file, "gc"sv, unicode_data.general_categories.keys(), unicode_data.general_category_aliases));
TRY(parse_value_alias_list(*prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), unicode_data.script_aliases, false));
TRY(normalize_script_extensions(unicode_data.script_extensions, unicode_data.script_list, unicode_data.script_aliases));

View file

@ -421,11 +421,7 @@ bool Lexer::is_whitespace() const
auto code_point = current_code_point();
if (code_point == NO_BREAK_SPACE || code_point == ZERO_WIDTH_NO_BREAK_SPACE)
return true;
static auto space_separator_category = Unicode::general_category_from_string("Space_Separator"sv);
if (space_separator_category.has_value())
return Unicode::code_point_has_general_category(code_point, *space_separator_category);
return false;
return Unicode::code_point_has_space_separator_general_category(code_point);
}
// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence

View file

@ -780,15 +780,11 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
bool OpCode_Compare::matches_character_class(CharClass character_class, u32 ch, bool insensitive)
{
constexpr auto is_space_or_line_terminator = [](u32 code_point) {
static auto space_separator = Unicode::general_category_from_string("Space_Separator"sv);
if (!space_separator.has_value())
return is_ascii_space(code_point);
if ((code_point == 0x0a) || (code_point == 0x0d) || (code_point == 0x2028) || (code_point == 0x2029))
return true;
if ((code_point == 0x09) || (code_point == 0x0b) || (code_point == 0x0c) || (code_point == 0xfeff))
return true;
return Unicode::code_point_has_general_category(code_point, *space_separator);
return Unicode::code_point_has_space_separator_general_category(code_point);
};
switch (character_class) {

View file

@ -1647,7 +1647,7 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
},
[&](Unicode::GeneralCategory general_category) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
},
[&](Script script) {
if (script.is_extension)
@ -1998,7 +1998,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
if (atom.is_property)
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property.value()) });
else if (atom.is_general_category)
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category.value()) });
else if (atom.is_script)
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) });
else if (atom.is_script_extension)
@ -2335,7 +2335,7 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
},
[&](Unicode::GeneralCategory general_category) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
},
[&](Script script) {
if (script.is_extension)
@ -2422,7 +2422,7 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
},
[&](Unicode::GeneralCategory general_category) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
},
[&](Script script) {
if (script.is_extension)

View file

@ -37,8 +37,84 @@ struct AK::Traits<Unicode::PropertyName<PropertyType>> {
namespace Unicode {
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
// Pseudo general categories for the union values (LC, L, M, N, P, S, Z, C). They are numbered
// past ICU's U_CHAR_CATEGORY_COUNT so they cannot collide with a real UCharCategory value; the
// lookup functions below translate each of them to the matching U_GC_*_MASK.
// Note that numbering starts at +1, so the value U_CHAR_CATEGORY_COUNT itself is not assigned.
static constexpr GeneralCategory GENERAL_CATEGORY_CASED_LETTER = U_CHAR_CATEGORY_COUNT + 1;
static constexpr GeneralCategory GENERAL_CATEGORY_LETTER = U_CHAR_CATEGORY_COUNT + 2;
static constexpr GeneralCategory GENERAL_CATEGORY_MARK = U_CHAR_CATEGORY_COUNT + 3;
static constexpr GeneralCategory GENERAL_CATEGORY_NUMBER = U_CHAR_CATEGORY_COUNT + 4;
static constexpr GeneralCategory GENERAL_CATEGORY_PUNCTUATION = U_CHAR_CATEGORY_COUNT + 5;
static constexpr GeneralCategory GENERAL_CATEGORY_SYMBOL = U_CHAR_CATEGORY_COUNT + 6;
static constexpr GeneralCategory GENERAL_CATEGORY_SEPARATOR = U_CHAR_CATEGORY_COUNT + 7;
static constexpr GeneralCategory GENERAL_CATEGORY_OTHER = U_CHAR_CATEGORY_COUNT + 8;
// One past the last pseudo category; used as the size of the name lookup table.
static constexpr GeneralCategory GENERAL_CATEGORY_LIMIT = U_CHAR_CATEGORY_COUNT + 9;
// Resolves a General Category name to its GeneralCategory value.
//
// The candidate names are the long, short and "additional" property value names reported by ICU
// for every UCharCategory, plus those of the union categories (LC, L, M, N, P, S, Z, C).
// Returns an empty Optional when no table entry matches.
// NOTE(review): how an entry is compared against the string is decided by find_index and the
// PropertyName traits, which are not shown in this hunk.
Optional<GeneralCategory> general_category_from_string(StringView general_category)
{
    // Built once, on first call. The slot index of an entry is the GeneralCategory value it names.
    static auto name_table = []() {
        Array<PropertyName<GeneralCategory>, GENERAL_CATEGORY_LIMIT.value()> table;

        // Stores whichever of the three ICU name flavors exist for (property, icu_value) in `slot`.
        // A flavor ICU does not provide leaves the corresponding field untouched.
        auto record_names = [&](auto property, auto slot, auto icu_value) {
            auto& entry = table[slot.value()];

            if (char const* long_name = u_getPropertyValueName(property, icu_value, U_LONG_PROPERTY_NAME))
                entry.long_name = StringView { long_name, strlen(long_name) };
            if (char const* short_name = u_getPropertyValueName(property, icu_value, U_SHORT_PROPERTY_NAME))
                entry.short_name = StringView { short_name, strlen(short_name) };
            if (char const* additional_name = u_getPropertyValueName(property, icu_value, ADDITIONAL_NAME))
                entry.additional_name = StringView { additional_name, strlen(additional_name) };
        };

        // Concrete categories: the slot index equals the ICU UCharCategory value.
        for (GeneralCategory category = 0; category < U_CHAR_CATEGORY_COUNT; ++category)
            record_names(UCHAR_GENERAL_CATEGORY, category, static_cast<UCharCategory>(category.value()));

        // Union categories: named through ICU's mask property, stored in the pseudo-category slots.
        struct UnionCategory {
            GeneralCategory slot;
            u32 mask;
        };

        UnionCategory const union_categories[] = {
            { GENERAL_CATEGORY_CASED_LETTER, U_GC_LC_MASK },
            { GENERAL_CATEGORY_LETTER, U_GC_L_MASK },
            { GENERAL_CATEGORY_MARK, U_GC_M_MASK },
            { GENERAL_CATEGORY_NUMBER, U_GC_N_MASK },
            { GENERAL_CATEGORY_PUNCTUATION, U_GC_P_MASK },
            { GENERAL_CATEGORY_SYMBOL, U_GC_S_MASK },
            { GENERAL_CATEGORY_SEPARATOR, U_GC_Z_MASK },
            { GENERAL_CATEGORY_OTHER, U_GC_C_MASK },
        };

        for (auto const& union_category : union_categories)
            record_names(UCHAR_GENERAL_CATEGORY_MASK, union_category.slot, union_category.mask);

        return table;
    }();

    auto const position = find_index(name_table.begin(), name_table.end(), general_category);

    // find_index yields the table size when nothing matched.
    if (position == name_table.size())
        return {};

    return static_cast<GeneralCategory>(position);
}
// Returns whether `code_point` belongs to `general_category`.
//
// The union pseudo categories (cased letter, letter, mark, number, punctuation, symbol, separator,
// other) are tested by intersecting the code point's category mask with the matching ICU
// U_GC_*_MASK. Any other value is compared directly against the category ICU reports.
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category)
{
    auto const icu_code_point = static_cast<UChar32>(code_point);

    // True when the code point's own category is one of the categories contained in `mask`.
    auto is_in_mask = [&](auto mask) {
        return (U_GET_GC_MASK(icu_code_point) & mask) != 0;
    };

    if (general_category == GENERAL_CATEGORY_CASED_LETTER)
        return is_in_mask(U_GC_LC_MASK);
    if (general_category == GENERAL_CATEGORY_LETTER)
        return is_in_mask(U_GC_L_MASK);
    if (general_category == GENERAL_CATEGORY_MARK)
        return is_in_mask(U_GC_M_MASK);
    if (general_category == GENERAL_CATEGORY_NUMBER)
        return is_in_mask(U_GC_N_MASK);
    if (general_category == GENERAL_CATEGORY_PUNCTUATION)
        return is_in_mask(U_GC_P_MASK);
    if (general_category == GENERAL_CATEGORY_SYMBOL)
        return is_in_mask(U_GC_S_MASK);
    if (general_category == GENERAL_CATEGORY_SEPARATOR)
        return is_in_mask(U_GC_Z_MASK);
    if (general_category == GENERAL_CATEGORY_OTHER)
        return is_in_mask(U_GC_C_MASK);

    // Not a union: the value is treated as a plain ICU UCharCategory.
    auto const icu_general_category = static_cast<UCharCategory>(general_category.value());
    return u_charType(icu_code_point) == icu_general_category;
}
// Convenience check for ICU's U_CONTROL_CHAR category, sparing callers a lookup by name.
bool code_point_has_control_general_category(u32 code_point)
{
    GeneralCategory const control_category = U_CONTROL_CHAR;
    return code_point_has_general_category(code_point, control_category);
}
// Convenience check for ICU's U_SPACE_SEPARATOR category, sparing callers a lookup by name.
bool code_point_has_space_separator_general_category(u32 code_point)
{
    GeneralCategory const space_separator_category = U_SPACE_SEPARATOR;
    return code_point_has_general_category(code_point, space_separator_category);
}
static constexpr Property PROPERTY_ANY = UCHAR_BINARY_LIMIT + 1;
static constexpr Property PROPERTY_ASCII = UCHAR_BINARY_LIMIT + 2;

View file

@ -29,6 +29,9 @@ struct CodePointRangeComparator {
// Looks up a General Category by name; yields an empty Optional when the name is not recognized.
Optional<GeneralCategory> general_category_from_string(StringView);
// Returns whether `code_point` belongs to `general_category`.
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
// Shorthands for frequently needed categories, so callers need not resolve the category by name.
bool code_point_has_control_general_category(u32 code_point);
bool code_point_has_space_separator_general_category(u32 code_point);
Optional<Property> property_from_string(StringView);
bool code_point_has_property(u32 code_point, Property property);

View file

@ -13,12 +13,12 @@ namespace Unicode {
enum class BidirectionalClass : u8;
enum class EmojiGroup : u8;
enum class GeneralCategory : u8;
enum class Script : u8;
struct CurrencyCode;
struct Emoji;
AK_TYPEDEF_DISTINCT_NUMERIC_GENERAL(u32, GeneralCategory, CastToUnderlying, Comparison, Increment);
AK_TYPEDEF_DISTINCT_NUMERIC_GENERAL(u32, Property, CastToUnderlying, Comparison, Increment);
}

View file

@ -246,11 +246,7 @@ static ErrorOr<Optional<String>> get_event_key_string(u32 code_point)
auto is_non_control_character = [&]() {
// A non-control character is any valid Unicode character except those that are part of the "Other, Control"
// ("Cc") General Category.
static auto control_general_category = Unicode::general_category_from_string("Cc"sv);
if (!control_general_category.has_value())
return true;
return !Unicode::code_point_has_general_category(code_point, *control_general_category);
return !Unicode::code_point_has_control_general_category(code_point);
};
// A key string is a string containing a 0 or 1 non-control characters ("base" characters) followed by 0 or more