/* * Copyright (c) 2021, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ #include "GeneratorUtil.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // https://www.unicode.org/reports/tr44/#PropList.txt using PropList = HashMap>; // https://www.unicode.org/reports/tr44/#UnicodeData.txt struct CodePointData { u32 code_point { 0 }; ByteString name; ByteString bidi_class; Optional numeric_value_decimal; Optional numeric_value_digit; Optional numeric_value_numeric; bool bidi_mirrored { false }; ByteString unicode_1_name; ByteString iso_comment; }; struct CodePointBidiClass { Unicode::CodePointRange code_point_range; ByteString bidi_class; }; struct UnicodeData { Vector code_point_data; HashTable bidirectional_classes; Vector code_point_bidirectional_classes; }; static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeData& unicode_data) { Optional code_point_range_start; Array buffer; while (TRY(file.can_read_line())) { auto line = TRY(file.read_line(buffer)); if (line.is_empty()) continue; auto segments = line.split_view(';', SplitBehavior::KeepEmpty); VERIFY(segments.size() == 15); CodePointData data {}; data.code_point = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); data.name = segments[1]; data.bidi_class = segments[4]; data.numeric_value_decimal = AK::StringUtils::convert_to_int(segments[6]); data.numeric_value_digit = AK::StringUtils::convert_to_int(segments[7]); data.numeric_value_numeric = AK::StringUtils::convert_to_int(segments[8]); data.bidi_mirrored = segments[9] == "Y"sv; data.unicode_1_name = segments[10]; data.iso_comment = segments[11]; if (data.name.starts_with("<"sv) && data.name.ends_with(", First>"sv)) { VERIFY(!code_point_range_start.has_value()); code_point_range_start = data.code_point; data.name = data.name.substring(1, data.name.length() - 9); } else if (data.name.starts_with("<"sv) && data.name.ends_with(", Last>"sv)) { VERIFY(code_point_range_start.has_value()); Unicode::CodePointRange code_point_range { *code_point_range_start, data.code_point }; data.name = data.name.substring(1, data.name.length() - 8); code_point_range_start.clear(); unicode_data.code_point_bidirectional_classes.append({ code_point_range, data.bidi_class }); } else { unicode_data.code_point_bidirectional_classes.append({ { data.code_point, data.code_point }, data.bidi_class }); } unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep); unicode_data.code_point_data.append(move(data)); } return {}; } static ErrorOr generate_unicode_data_header(Core::InputBufferedFile& file, UnicodeData& unicode_data) { StringBuilder builder; SourceGenerator generator { builder }; auto generate_enum = [&](StringView name, StringView default_, auto values, Vector aliases = {}) { quick_sort(values); quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; }); generator.set("name", name); generator.set("underlying", ByteString::formatted("{}UnderlyingType", name)); generator.set("type", ((values.size() + !default_.is_empty()) < 256) ? "u8"sv : "u16"sv); generator.append(R"~~~( using @underlying@ = @type@; enum class @name@ : @underlying@ {)~~~"); if (!default_.is_empty()) { generator.set("default", default_); generator.append(R"~~~( @default@,)~~~"); } for (auto const& value : values) { generator.set("value", value); generator.append(R"~~~( @value@,)~~~"); } for (auto const& alias : aliases) { generator.set("alias", alias.alias); generator.set("value", alias.name); generator.append(R"~~~( @alias@ = @value@,)~~~"); } generator.append(R"~~~( }; )~~~"); }; generator.append(R"~~~( #pragma once #include #include namespace Unicode { )~~~"); generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values()); generator.append(R"~~~( } )~~~"); TRY(file.write_until_depleted(generator.as_string_view().bytes())); return {}; } static ErrorOr generate_unicode_data_implementation(Core::InputBufferedFile& file, UnicodeData const& unicode_data) { StringBuilder builder; SourceGenerator generator { builder }; generator.append(R"~~~( #include #include #include #include #include #include #include #include #include namespace Unicode { )~~~"); generator.append(R"~~~( struct BidiClassData { CodePointRange code_point_range {}; BidirectionalClass bidi_class {}; }; struct CodePointBidiClassComparator : public CodePointRangeComparator { constexpr int operator()(u32 code_point, BidiClassData const& bidi_class) { return CodePointRangeComparator::operator()(code_point, bidi_class.code_point_range); } }; )~~~"); { constexpr size_t max_bidi_classes_per_row = 20; size_t bidi_classes_in_current_row = 0; generator.set("size"sv, ByteString::number(unicode_data.code_point_bidirectional_classes.size())); generator.append(R"~~~( static constexpr Array s_bidirectional_classes { { )~~~"); for (auto const& data : unicode_data.code_point_bidirectional_classes) { if (bidi_classes_in_current_row++ > 0) generator.append(", "); generator.set("first", ByteString::formatted("{:#x}", data.code_point_range.first)); generator.set("last", ByteString::formatted("{:#x}", data.code_point_range.last)); generator.set("bidi_class", data.bidi_class); generator.append("{ { @first@, @last@ }, BidirectionalClass::@bidi_class@ }"); if (bidi_classes_in_current_row == max_bidi_classes_per_row) { bidi_classes_in_current_row = 0; generator.append(",\n "); } } generator.append(R"~~~( } }; )~~~"); } generator.append(R"~~~( Optional bidirectional_class(u32 code_point) { if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {})) return entry->bidi_class; return {}; } )~~~"); auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& prop_list, Vector const& aliases) -> ErrorOr { HashValueMap hashes; TRY(hashes.try_ensure_capacity(prop_list.size() + aliases.size())); ValueFromStringOptions options {}; for (auto const& prop : prop_list) { if constexpr (IsSame, ByteString>) { hashes.set(CaseInsensitiveASCIIStringViewTraits::hash(prop), prop); options.sensitivity = CaseSensitivity::CaseInsensitive; } else { hashes.set(prop.key.hash(), prop.key); } } for (auto const& alias : aliases) hashes.set(alias.alias.hash(), alias.alias); generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes), options); return {}; }; TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {})); generator.append(R"~~~( } )~~~"); TRY(file.write_until_depleted(generator.as_string_view().bytes())); return {}; } ErrorOr serenity_main(Main::Arguments arguments) { StringView generated_header_path; StringView generated_implementation_path; StringView unicode_data_path; Core::ArgsParser args_parser; args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); args_parser.parse(arguments); auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write)); auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write)); auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read)); UnicodeData unicode_data {}; TRY(parse_unicode_data(*unicode_data_file, unicode_data)); TRY(generate_unicode_data_header(*generated_header_file, unicode_data)); TRY(generate_unicode_data_implementation(*generated_implementation_file, unicode_data)); return 0; }