LibUnicode: Download and parse the word break property list UCD file

Note that unlike the main property list, where a code point may have
several properties, each code point has exactly one word break property.
Code points that are not listed in WordBreakProperty.txt are assigned the
property "Other".
This commit is contained in:
Timothy Flynn 2021-07-27 18:24:39 -04:00 committed by Andreas Kling
parent c45a014645
commit 12fb3ae033
Notes: sideshowbarker 2024-07-18 07:57:14 +09:00
2 changed files with 60 additions and 5 deletions

View file

@ -65,6 +65,7 @@ struct CodePointData {
Optional<u32> simple_titlecase_mapping;
Vector<u32> special_casing_indices;
Vector<StringView> prop_list;
StringView word_break_property;
};
struct UnicodeData {
@ -81,6 +82,8 @@ struct UnicodeData {
PropList prop_list;
u32 largest_prop_list_size { 0 };
PropList word_break_prop_list;
};
static constexpr auto s_desired_fields = Array {
@ -152,7 +155,7 @@ static void parse_special_casing(Core::File& file, UnicodeData& unicode_data)
quick_sort(unicode_data.conditions);
}
static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
static void parse_prop_list(Core::File& file, PropList& prop_list)
{
while (file.can_read_line()) {
auto line = file.read_line();
@ -169,7 +172,7 @@ static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
auto property = segments[1].trim_whitespace().to_string();
property.replace("_", "", true);
auto& code_points = unicode_data.prop_list.ensure(property);
auto& code_points = prop_list.ensure(property);
if (code_point_range.contains(".."sv)) {
segments = code_point_range.split_view(".."sv);
@ -253,6 +256,19 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
}
}
for (auto const& property : unicode_data.word_break_prop_list) {
for (auto const& range : property.value) {
if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
data.word_break_property = property.key;
break;
}
}
if (!data.word_break_property.is_empty())
break;
}
if (data.word_break_property.is_empty())
data.word_break_property = "Other"sv;
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
unicode_data.largest_prop_list_size = max(unicode_data.largest_prop_list_size, data.prop_list.size());
@ -333,6 +349,21 @@ enum class Property {)~~~");
generator.append(R"~~~(
};
enum class WordBreakProperty {
Other,)~~~");
properties = unicode_data.word_break_prop_list.keys();
quick_sort(properties);
for (auto const& property : properties) {
generator.set("property", property);
generator.append(R"~~~(
@property@,)~~~");
}
generator.append(R"~~~(
};
struct SpecialCasing {
u32 code_point { 0 };
@ -385,6 +416,8 @@ struct UnicodeData {
Property prop_list[@prop_list_size@] {};
u32 prop_list_size { 0 };
WordBreakProperty word_break_property { WordBreakProperty::Other };
};
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
@ -484,6 +517,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
append_field("simple_titlecase_mapping", String::formatted("{:#x}", data.simple_titlecase_mapping.value_or(data.code_point)));
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
append_list_and_size(data.prop_list, "Property::{}"sv);
generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
generator.append(" },");
}
@ -540,6 +574,7 @@ int main(int argc, char** argv)
char const* unicode_data_path = nullptr;
char const* special_casing_path = nullptr;
char const* prop_list_path = nullptr;
char const* word_break_path = nullptr;
Core::ArgsParser args_parser;
args_parser.add_option(generate_header, "Generate the Unicode Data header file", "generate-header", 'h');
@ -547,6 +582,7 @@ int main(int argc, char** argv)
args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
args_parser.parse(argc, argv);
if (!generate_header && !generate_implementation) {
@ -569,6 +605,11 @@ int main(int argc, char** argv)
args_parser.print_usage(stderr, argv[0]);
return 1;
}
if (!word_break_path) {
warnln("-w/--word-break-path is required");
args_parser.print_usage(stderr, argv[0]);
return 1;
}
auto unicode_data_file_or_error = Core::File::open(unicode_data_path, Core::OpenMode::ReadOnly);
if (unicode_data_file_or_error.is_error()) {
@ -588,9 +629,16 @@ int main(int argc, char** argv)
return 1;
}
auto word_break_file_or_error = Core::File::open(word_break_path, Core::OpenMode::ReadOnly);
if (word_break_file_or_error.is_error()) {
warnln("Failed to open {}: {}", word_break_path, word_break_file_or_error.release_error());
return 1;
}
UnicodeData unicode_data {};
parse_special_casing(special_casing_file_or_error.value(), unicode_data);
parse_prop_list(prop_list_file_or_error.value(), unicode_data);
parse_prop_list(prop_list_file_or_error.value(), unicode_data.prop_list);
parse_prop_list(word_break_file_or_error.value(), unicode_data.word_break_prop_list);
parse_unicode_data(unicode_data_file_or_error.value(), unicode_data);
if (generate_header)

View file

@ -9,6 +9,9 @@ set(SPECIAL_CASING_PATH ${CMAKE_BINARY_DIR}/UCD/SpecialCasing.txt)
set(PROP_LIST_URL https://www.unicode.org/Public/13.0.0/ucd/PropList.txt)
set(PROP_LIST_PATH ${CMAKE_BINARY_DIR}/UCD/PropList.txt)
set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
if (NOT EXISTS ${UNICODE_DATA_PATH})
message(STATUS "Downloading UCD UnicodeData.txt from ${UNICODE_DATA_URL}...")
@ -22,6 +25,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
message(STATUS "Downloading UCD PropList.txt from ${PROP_LIST_URL}...")
file(DOWNLOAD ${PROP_LIST_URL} ${PROP_LIST_PATH} INACTIVITY_TIMEOUT 10)
endif()
if (NOT EXISTS ${WORD_BREAK_PATH})
message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
endif()
set(UNICODE_GENERATOR CodeGenerators/GenerateUnicodeData)
set(UNICODE_DATA_HEADER UnicodeData.h)
@ -39,7 +46,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
add_custom_command(
OUTPUT ${UNICODE_DATA_HEADER}
COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} ${UNICODE_GENERATOR} -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH}
COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} ${UNICODE_GENERATOR} -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH}
VERBATIM
DEPENDS GenerateUnicodeData
MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}
@ -47,7 +54,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
add_custom_command(
OUTPUT ${UNICODE_DATA_IMPLEMENTATION}
COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} ${UNICODE_GENERATOR} -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH}
COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} ${UNICODE_GENERATOR} -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH}
VERBATIM
DEPENDS GenerateUnicodeData
MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}