/*
 * Copyright (c) 2021-2024, Tim Flynn
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

// NOTE(review): in the text under review, everything that would sit inside angle brackets is
// absent: the targets of the eight includes below, template parameter/argument lists, and the
// target types of every static_cast. Confirm against version control before building.
#include
#include
#include
#include
#include
#include
#include
#include

namespace Unicode {

// The names under which one property (or property value) may be spelled.
// Each of the three names is optional.
template
struct PropertyName {
    Optional long_name;
    Optional short_name;
    Optional additional_name;
};

// From uchar.h:
// Unicode allows for additional names, beyond the long and short name, which would be indicated by U_LONG_PROPERTY_NAME + i
// Only the first additional name (i = 1) is queried by this file.
static constexpr auto ADDITIONAL_NAME = static_cast(U_LONG_PROPERTY_NAME + 1);

}

// Equality used when searching a table of PropertyName entries for a string:
// a candidate matches when the string equals its long, short, or additional name.
template
struct AK::Traits> {
    static constexpr bool equals(Unicode::PropertyName const& candidate, StringView property)
    {
        return property == candidate.long_name || property == candidate.short_name || property == candidate.additional_name;
    }
};

namespace Unicode {

// Grouped pseudo-categories, numbered after U_CHAR_CATEGORY_COUNT. They index the name table
// built in general_category_from_string() and are special-cased in
// code_point_has_general_category(). GENERAL_CATEGORY_LIMIT is the size of that table.
// No constant here uses the value U_CHAR_CATEGORY_COUNT + 0.
static constexpr GeneralCategory GENERAL_CATEGORY_CASED_LETTER = U_CHAR_CATEGORY_COUNT + 1;
static constexpr GeneralCategory GENERAL_CATEGORY_LETTER = U_CHAR_CATEGORY_COUNT + 2;
static constexpr GeneralCategory GENERAL_CATEGORY_MARK = U_CHAR_CATEGORY_COUNT + 3;
static constexpr GeneralCategory GENERAL_CATEGORY_NUMBER = U_CHAR_CATEGORY_COUNT + 4;
static constexpr GeneralCategory GENERAL_CATEGORY_PUNCTUATION = U_CHAR_CATEGORY_COUNT + 5;
static constexpr GeneralCategory GENERAL_CATEGORY_SYMBOL = U_CHAR_CATEGORY_COUNT + 6;
static constexpr GeneralCategory GENERAL_CATEGORY_SEPARATOR = U_CHAR_CATEGORY_COUNT + 7;
static constexpr GeneralCategory GENERAL_CATEGORY_OTHER = U_CHAR_CATEGORY_COUNT + 8;
static constexpr GeneralCategory GENERAL_CATEGORY_LIMIT = U_CHAR_CATEGORY_COUNT + 9;

// Maps a general category name to its GeneralCategory value.
//
// The lookup table is a function-local static, filled once by the lambda below. The returned
// value is the table index of the first matching entry; an empty Optional is returned when
// find_index() reports the table size (no match).
Optional general_category_from_string(StringView general_category)
{
    static auto general_category_names = []() {
        Array, GENERAL_CATEGORY_LIMIT.value()> names;

        // Stores whichever of the long / short / additional names ICU returns (non-null) for
        // `general_category` under `property` into the table slot `index`.
        auto set_names = [&](auto property, auto index, auto general_category) {
            if (char const* name = u_getPropertyValueName(property, general_category, U_LONG_PROPERTY_NAME))
                names[index.value()].long_name = StringView { name, strlen(name) };
            if (char const* name = u_getPropertyValueName(property, general_category, U_SHORT_PROPERTY_NAME))
                names[index.value()].short_name = StringView { name, strlen(name) };
            if (char const* name = u_getPropertyValueName(property, general_category, ADDITIONAL_NAME))
                names[index.value()].additional_name = StringView { name, strlen(name) };
        };

        // Individual categories: the table index equals the category value.
        for (GeneralCategory general_category = 0; general_category < U_CHAR_CATEGORY_COUNT; ++general_category)
            set_names(UCHAR_GENERAL_CATEGORY, general_category, static_cast(general_category.value()));

        // Grouped categories: names are looked up by mask, stored at the pseudo-category index.
        set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_CASED_LETTER, U_GC_LC_MASK);
        set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_LETTER, U_GC_L_MASK);
        set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_MARK, U_GC_M_MASK);
        set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_NUMBER, U_GC_N_MASK);
        set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_PUNCTUATION, U_GC_P_MASK);
        set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_SYMBOL, U_GC_S_MASK);
        set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_SEPARATOR, U_GC_Z_MASK);
        set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_OTHER, U_GC_C_MASK);

        return names;
    }();

    // NOTE(review): presumably find_index() compares via the AK::Traits specialization in this
    // file - confirm against the find_index() implementation.
    if (auto index = find_index(general_category_names.begin(), general_category_names.end(), general_category); index != general_category_names.size())
        return static_cast(index);
    return {};
}

// Returns whether `code_point` belongs to `general_category`.
//
// The eight grouped pseudo-categories are answered by AND-ing the code point's category mask
// with the corresponding U_GC_*_MASK; every other value is compared against u_charType().
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category)
{
    auto icu_code_point = static_cast(code_point);
    auto icu_general_category = static_cast(general_category.value());

    if (general_category == GENERAL_CATEGORY_CASED_LETTER)
        return (U_GET_GC_MASK(icu_code_point) & U_GC_LC_MASK) != 0;
    if (general_category == GENERAL_CATEGORY_LETTER)
        return (U_GET_GC_MASK(icu_code_point) & U_GC_L_MASK) != 0;
    if (general_category == GENERAL_CATEGORY_MARK)
        return (U_GET_GC_MASK(icu_code_point) & U_GC_M_MASK) != 0;
    if (general_category == GENERAL_CATEGORY_NUMBER)
        return (U_GET_GC_MASK(icu_code_point) & U_GC_N_MASK) != 0;
    if (general_category == GENERAL_CATEGORY_PUNCTUATION)
        return (U_GET_GC_MASK(icu_code_point) & U_GC_P_MASK) != 0;
    if (general_category == GENERAL_CATEGORY_SYMBOL)
        return (U_GET_GC_MASK(icu_code_point) & U_GC_S_MASK) != 0;
    if (general_category == GENERAL_CATEGORY_SEPARATOR)
        return (U_GET_GC_MASK(icu_code_point) & U_GC_Z_MASK) != 0;
    if (general_category == GENERAL_CATEGORY_OTHER)
        return (U_GET_GC_MASK(icu_code_point) & U_GC_C_MASK) != 0;

    return u_charType(icu_code_point) == icu_general_category;
}

// Returns the result of ICU's u_isprint() for `code_point`.
bool code_point_is_printable(u32 code_point)
{
    return static_cast(u_isprint(static_cast(code_point)));
}

// Convenience wrappers: each forwards to code_point_has_general_category() with one fixed
// category (an ICU category value or one of the GENERAL_CATEGORY_* pseudo-categories).
bool code_point_has_control_general_category(u32 code_point)
{
    return code_point_has_general_category(code_point, U_CONTROL_CHAR);
}

bool code_point_has_letter_general_category(u32 code_point)
{
    return code_point_has_general_category(code_point, GENERAL_CATEGORY_LETTER);
}

bool code_point_has_number_general_category(u32 code_point)
{
    return code_point_has_general_category(code_point, GENERAL_CATEGORY_NUMBER);
}

bool code_point_has_punctuation_general_category(u32 code_point)
{
    return code_point_has_general_category(code_point, GENERAL_CATEGORY_PUNCTUATION);
}

bool code_point_has_separator_general_category(u32 code_point)
{
    return code_point_has_general_category(code_point, GENERAL_CATEGORY_SEPARATOR);
}

bool code_point_has_space_separator_general_category(u32 code_point)
{
    return code_point_has_general_category(code_point, U_SPACE_SEPARATOR);
}

bool code_point_has_symbol_general_category(u32 code_point)
{
    return code_point_has_general_category(code_point, GENERAL_CATEGORY_SYMBOL);
}

// Pseudo-properties numbered after UCHAR_BINARY_LIMIT. They index the name table built in
// property_from_string() and are special-cased in code_point_has_property() and
// is_ecma262_property(). PROPERTY_LIMIT is the size of that table.
// No constant here uses the value UCHAR_BINARY_LIMIT + 0.
static constexpr Property PROPERTY_ANY = UCHAR_BINARY_LIMIT + 1;
static constexpr Property PROPERTY_ASCII = UCHAR_BINARY_LIMIT + 2;
static constexpr Property PROPERTY_ASSIGNED = UCHAR_BINARY_LIMIT + 3;
static constexpr Property PROPERTY_LIMIT = UCHAR_BINARY_LIMIT + 4;

// Maps a property name to its Property value.
//
// The lookup table is a function-local static, filled once by the lambda below. The returned
// value is the table index of the first matching entry; an empty Optional is returned when
// find_index() reports the table size (no match).
Optional property_from_string(StringView property)
{
    static auto property_names = []() {
        Array, PROPERTY_LIMIT.value()> names;

        // Properties below UCHAR_BINARY_LIMIT: record whichever names ICU returns (non-null).
        for (Property property = 0; property < UCHAR_BINARY_LIMIT; ++property) {
            auto icu_property = static_cast(property.value());

            if (char const* name = u_getPropertyName(icu_property, U_LONG_PROPERTY_NAME))
                names[property.value()].long_name = StringView { name, strlen(name) };
            if (char const* name = u_getPropertyName(icu_property, U_SHORT_PROPERTY_NAME))
                names[property.value()].short_name = StringView { name, strlen(name) };
            if (char const* name = u_getPropertyName(icu_property, ADDITIONAL_NAME))
                names[property.value()].additional_name = StringView { name, strlen(name) };
        }

        // The pseudo-properties get a hard-coded long name only.
        names[PROPERTY_ANY.value()] = { "Any"sv, {}, {} };
        names[PROPERTY_ASCII.value()] = { "ASCII"sv, {}, {} };
        names[PROPERTY_ASSIGNED.value()] = { "Assigned"sv, {}, {} };

        return names;
    }();

    // NOTE(review): presumably find_index() compares via the AK::Traits specialization in this
    // file - confirm against the find_index() implementation.
    if (auto index = find_index(property_names.begin(), property_names.end(), property); index != property_names.size())
        return static_cast(index);
    return {};
}

// Returns whether `code_point` has `property`.
//
// The three pseudo-properties are answered by is_unicode(), is_ascii() and u_isdefined()
// respectively; every other value is passed to u_hasBinaryProperty().
bool code_point_has_property(u32 code_point, Property property)
{
    auto icu_code_point = static_cast(code_point);
    auto icu_property = static_cast(property.value());

    if (property == PROPERTY_ANY)
        return is_unicode(code_point);
    if (property == PROPERTY_ASCII)
        return is_ascii(code_point);
    if (property == PROPERTY_ASSIGNED)
        return u_isdefined(icu_code_point);

    return static_cast(u_hasBinaryProperty(icu_code_point, icu_property));
}

// Convenience wrappers: each forwards to code_point_has_property() with one fixed ICU
// property, except the two identifier checks, which call u_isIDStart() / u_isIDPart() directly.
bool code_point_has_emoji_property(u32 code_point)
{
    return code_point_has_property(code_point, UCHAR_EMOJI);
}

bool code_point_has_emoji_modifier_base_property(u32 code_point)
{
    return code_point_has_property(code_point, UCHAR_EMOJI_MODIFIER_BASE);
}

bool code_point_has_emoji_presentation_property(u32 code_point)
{
    return code_point_has_property(code_point, UCHAR_EMOJI_PRESENTATION);
}

bool code_point_has_identifier_start_property(u32 code_point)
{
    return u_isIDStart(static_cast(code_point));
}

bool code_point_has_identifier_continue_property(u32 code_point)
{
    return u_isIDPart(static_cast(code_point));
}

bool code_point_has_regional_indicator_property(u32 code_point)
{
    return code_point_has_property(code_point, UCHAR_REGIONAL_INDICATOR);
}

bool code_point_has_variation_selector_property(u32 code_point)
{
    return code_point_has_property(code_point, UCHAR_VARIATION_SELECTOR);
}

bool code_point_has_white_space_property(u32 code_point)
{
    return code_point_has_property(code_point, UCHAR_WHITE_SPACE);
}

// https://tc39.es/ecma262/#table-binary-unicode-properties
// Returns true for the three pseudo-properties and for every property listed in the switch
// below; false for anything else.
bool is_ecma262_property(Property property)
{
    if (property == PROPERTY_ANY || property == PROPERTY_ASCII || property == PROPERTY_ASSIGNED)
        return true;

    switch (property.value()) {
    case UCHAR_ASCII_HEX_DIGIT:
    case UCHAR_ALPHABETIC:
    case UCHAR_BIDI_CONTROL:
    case UCHAR_BIDI_MIRRORED:
    case UCHAR_CASE_IGNORABLE:
    case UCHAR_CASED:
    case UCHAR_CHANGES_WHEN_CASEFOLDED:
    case UCHAR_CHANGES_WHEN_CASEMAPPED:
    case UCHAR_CHANGES_WHEN_LOWERCASED:
    case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
    case UCHAR_CHANGES_WHEN_TITLECASED:
    case UCHAR_CHANGES_WHEN_UPPERCASED:
    case UCHAR_DASH:
    case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
    case UCHAR_DEPRECATED:
    case UCHAR_DIACRITIC:
    case UCHAR_EMOJI:
    case UCHAR_EMOJI_COMPONENT:
    case UCHAR_EMOJI_MODIFIER:
    case UCHAR_EMOJI_MODIFIER_BASE:
    case UCHAR_EMOJI_PRESENTATION:
    case UCHAR_EXTENDED_PICTOGRAPHIC:
    case UCHAR_EXTENDER:
    case UCHAR_GRAPHEME_BASE:
    case UCHAR_GRAPHEME_EXTEND:
    case UCHAR_HEX_DIGIT:
    case UCHAR_IDS_BINARY_OPERATOR:
    case UCHAR_IDS_TRINARY_OPERATOR:
    case UCHAR_ID_CONTINUE:
    case UCHAR_ID_START:
    case UCHAR_IDEOGRAPHIC:
    case UCHAR_JOIN_CONTROL:
    case UCHAR_LOGICAL_ORDER_EXCEPTION:
    case UCHAR_LOWERCASE:
    case UCHAR_MATH:
    case UCHAR_NONCHARACTER_CODE_POINT:
    case UCHAR_PATTERN_SYNTAX:
    case UCHAR_PATTERN_WHITE_SPACE:
    case UCHAR_QUOTATION_MARK:
    case UCHAR_RADICAL:
    case UCHAR_REGIONAL_INDICATOR:
    case UCHAR_S_TERM:
    case UCHAR_SOFT_DOTTED:
    case UCHAR_TERMINAL_PUNCTUATION:
    case UCHAR_UNIFIED_IDEOGRAPH:
    case UCHAR_UPPERCASE:
    case UCHAR_VARIATION_SELECTOR:
    case UCHAR_WHITE_SPACE:
    case UCHAR_XID_CONTINUE:
    case UCHAR_XID_START:
        return true;
    default:
        return false;
    }
}

Optional