3 years ago · 2d50c08f34
--- a/Meta/CMake/unicode_data.cmake
+++ b/Meta/CMake/unicode_data.cmake
@@ -48,6 +48,15 @@ set(EMOJI_DATA_PATH "${UCD_PATH}/emoji-data.txt")
 
															 set(NORM_PROPS_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/DerivedNormalizationProps.txt")
														
 
															 set(NORM_PROPS_PATH "${UCD_PATH}/DerivedNormalizationProps.txt")
														
 
															+set(GRAPHEME_BREAK_PROP_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/auxiliary/GraphemeBreakProperty.txt") # Remote file; lives under the UCD "auxiliary" directory for ${UCD_VERSION}.
														
 
															+set(GRAPHEME_BREAK_PROP_PATH "${UCD_PATH}/GraphemeBreakProperty.txt") # Local download target; handed to the UnicodeData generator via -f.
														
 
															+
														
 
															+set(WORD_BREAK_PROP_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/auxiliary/WordBreakProperty.txt") # Remote file; lives under the UCD "auxiliary" directory for ${UCD_VERSION}.
														
 
															+set(WORD_BREAK_PROP_PATH "${UCD_PATH}/WordBreakProperty.txt") # Local download target; handed to the UnicodeData generator via -w.
														
 
															+
														
 
															+set(SENTENCE_BREAK_PROP_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/auxiliary/SentenceBreakProperty.txt") # Remote file; lives under the UCD "auxiliary" directory for ${UCD_VERSION}.
														
 
															+set(SENTENCE_BREAK_PROP_PATH "${UCD_PATH}/SentenceBreakProperty.txt") # Local download target; handed to the UnicodeData generator via -i.
														
 
															+
														
 
															 set(CLDR_ZIP_URL "https://github.com/unicode-org/cldr-json/releases/download/${CLDR_VERSION}/cldr-${CLDR_VERSION}-json-modern.zip")
														
 
															 set(CLDR_ZIP_PATH "${CLDR_PATH}/cldr.zip")
														
@@ -96,6 +105,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
 
															     download_file("${SCRIPT_EXTENSIONS_URL}" "${SCRIPT_EXTENSIONS_PATH}")
														
 
															     download_file("${EMOJI_DATA_URL}" "${EMOJI_DATA_PATH}")
														
 
															     download_file("${NORM_PROPS_URL}" "${NORM_PROPS_PATH}")
														
 
															+    download_file("${GRAPHEME_BREAK_PROP_URL}" "${GRAPHEME_BREAK_PROP_PATH}")
														
 
															+    download_file("${WORD_BREAK_PROP_URL}" "${WORD_BREAK_PROP_PATH}")
														
 
															+    download_file("${SENTENCE_BREAK_PROP_URL}" "${SENTENCE_BREAK_PROP_PATH}")
														
 
															     download_file("${CLDR_ZIP_URL}" "${CLDR_ZIP_PATH}")
														
 
															     extract_cldr_file("${CLDR_CORE_SOURCE}" "${CLDR_CORE_PATH}")
														
@@ -148,7 +160,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
 
															         "${UNICODE_META_TARGET_PREFIX}"
														
 
															         "${UNICODE_DATA_HEADER}"
														
 
															         "${UNICODE_DATA_IMPLEMENTATION}"
														
 
															-        arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}"
														
 
															+        arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
														
 
															     )
														
 
															     invoke_generator(
														
 
															         "UnicodeDateTimeFormat"
														
--- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
@@ -127,6 +127,10 @@ struct UnicodeData {
 
															     // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
														
 
															     NormalizationProps normalization_props;
														
 
															+
														
 
															+    PropList grapheme_break_props;
														
 
															+    PropList word_break_props;
														
 
															+    PropList sentence_break_props;
														
 
															 };
														
 
															 static Vector<u32> parse_code_point_list(StringView list)
														
@@ -591,6 +595,9 @@ namespace Unicode {
 
															     generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
														
 
															     generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
														
 
															     generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
														
 
															+    generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys());
														
 
															+    generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
														
 
															+    generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
														
 
															     generator.append(R"~~~(
														
 
															 struct SpecialCasing {
														
@@ -837,6 +844,9 @@ static constexpr Array<Span<CodePointRange const>, @size@> @name@ { {)~~~");
 
															     append_prop_list("s_properties"sv, "s_property_{}"sv, unicode_data.prop_list);
														
 
															     append_prop_list("s_scripts"sv, "s_script_{}"sv, unicode_data.script_list);
														
 
															     append_prop_list("s_script_extensions"sv, "s_script_extension_{}"sv, unicode_data.script_extensions);
														
 
															+    append_prop_list("s_grapheme_break_properties"sv, "s_grapheme_break_property_{}"sv, unicode_data.grapheme_break_props);
														
 
															+    append_prop_list("s_word_break_properties"sv, "s_word_break_property_{}"sv, unicode_data.word_break_props);
														
 
															+    append_prop_list("s_sentence_break_properties"sv, "s_sentence_break_property_{}"sv, unicode_data.sentence_break_props);
														
 
															     generator.append(R"~~~(
														
 
															 struct CodePointName {
														
@@ -955,6 +965,10 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
 
															     append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv);
														
 
															     append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases);
														
 
															+    append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv);
														
 
															+    append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv);
														
 
															+    append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv);
														
 
															+
														
 
															     generator.append(R"~~~(
														
 
															 }
														
 
															 )~~~");
														
@@ -1104,6 +1118,9 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
 
															     StringView script_extensions_path;
														
 
															     StringView emoji_data_path;
														
 
															     StringView normalization_path;
														
 
															+    StringView grapheme_break_path;
														
 
															+    StringView word_break_path;
														
 
															+    StringView sentence_break_path;
														
 
															     Core::ArgsParser args_parser;
														
 
															     args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
														
@@ -1121,6 +1138,9 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
 
															     args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
														
 
															     args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
														
 
															     args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path");
														
 
															+    args_parser.add_option(grapheme_break_path, "Path to GraphemeBreakProperty.txt file", "grapheme-break-path", 'f', "grapheme-break-path");
														
 
															+    args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
														
 
															+    args_parser.add_option(sentence_break_path, "Path to SentenceBreakProperty.txt file", "sentence-break-path", 'i', "sentence-break-path");
														
 
															     args_parser.parse(arguments);
														
 
															     auto open_file = [&](StringView path, Core::OpenMode mode = Core::OpenMode::ReadOnly) -> ErrorOr<NonnullRefPtr<Core::File>> {
														
@@ -1147,6 +1167,9 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
 
															     auto script_extensions_file = TRY(open_file(script_extensions_path));
														
 
															     auto emoji_data_file = TRY(open_file(emoji_data_path));
														
 
															     auto normalization_file = TRY(open_file(normalization_path));
														
 
															+    auto grapheme_break_file = TRY(open_file(grapheme_break_path));
														
 
															+    auto word_break_file = TRY(open_file(word_break_path));
														
 
															+    auto sentence_break_file = TRY(open_file(sentence_break_path));
														
 
															     UnicodeData unicode_data {};
														
 
															     parse_special_casing(special_casing_file, unicode_data);
														
@@ -1160,6 +1183,9 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
 
															     parse_prop_list(scripts_file, unicode_data.script_list);
														
 
															     parse_prop_list(script_extensions_file, unicode_data.script_extensions, true);
														
 
															     parse_name_aliases(name_alias_file, unicode_data);
														
 
															+    parse_prop_list(grapheme_break_file, unicode_data.grapheme_break_props);
														
 
															+    parse_prop_list(word_break_file, unicode_data.word_break_props);
														
 
															+    parse_prop_list(sentence_break_file, unicode_data.sentence_break_props);
														
 
															     populate_general_category_unions(unicode_data.general_categories);
														
 
															     parse_unicode_data(unicode_data_file, unicode_data);
														
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -353,4 +353,8 @@ Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {
 
															 bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
														
 
															 bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
														
 
															+bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; } // Weak default: ignores its arguments and returns false. GenerateUnicodeData emits a same-named definition via append_prop_search.
														
 
															+bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; } // Weak default: ignores its arguments and returns false. GenerateUnicodeData emits a same-named definition via append_prop_search.
														
 
															+bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; } // Weak default: ignores its arguments and returns false. GenerateUnicodeData emits a same-named definition via append_prop_search.
														
 
															+
														
 
															 }
														
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -40,4 +40,8 @@ Optional<Script> script_from_string(StringView);
 
															 bool code_point_has_script(u32 code_point, Script script);
														
 
															 bool code_point_has_script_extension(u32 code_point, Script script);
														
 
															+bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakProperty property); // NOTE(review): presumably true when code_point is listed under `property` in GraphemeBreakProperty.txt - confirm against the generated lookup.
														
 
															+bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property); // NOTE(review): presumably true when code_point is listed under `property` in WordBreakProperty.txt - confirm against the generated lookup.
														
 
															+bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property); // NOTE(review): presumably true when code_point is listed under `property` in SentenceBreakProperty.txt - confirm against the generated lookup.
														
 
															+
														
 
															 }
														
--- a/Userland/Libraries/LibUnicode/Forward.h
+++ b/Userland/Libraries/LibUnicode/Forward.h
@@ -22,6 +22,7 @@ enum class DateField : u8;
 
															 enum class DayPeriod : u8;
														
 
															 enum class Era : u8;
														
 
															 enum class GeneralCategory : u8;
														
 
															+enum class GraphemeBreakProperty : u8;
														
 
															 enum class HourCycle : u8;
														
 
															 enum class HourCycleRegion : u8;
														
 
															 enum class Key : u8;
														
@@ -35,6 +36,7 @@ enum class NumericSymbol : u8;
 
															 enum class Property : u8;
														
 
															 enum class Script : u8;
														
 
															 enum class ScriptTag : u8;
														
 
															+enum class SentenceBreakProperty : u8;
														
 
															 enum class StandardNumberFormatType : u8;
														
 
															 enum class Style : u8;
														
 
															 enum class Territory : u8;