From 3fd53baa25f1d6b8522bcb2025bf7728cd3c1ad6 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 15 Dec 2021 18:42:51 -0500 Subject: [PATCH] LibUnicode: Dynamically load the generated UnicodeData symbols The generated data for libunicodedata.so is quite large, and loading it is a price paid by nearly every application by way of depending on LibRegex. In order to defer this cost until an application actually uses one of the surrounding APIs, dynamically load the generated symbols. To be able to load the symbols dynamically, the generated methods must have unmangled names. Typically, this is accomplished with `extern "C"` blocks. The clang toolchain complains about this here because the types returned from the generators are strictly C++ types. So to keep the names unmangled, we use the asm() compiler directive to manually define a symbol name; the caveat is that we *must* be sure the symbols are unique. As an extra precaution, we prefix each symbol name with "unicode_". For more details, see: https://gcc.gnu.org/onlinedocs/gcc/Asm-Labels.html The symbol loader used in this implementation provides the additional benefit of removing many [[maybe_unused]] attributes from the LibUnicode methods. Internally, if ENABLE_UNICODE_DATABASE_DOWNLOAD is OFF, the loader is able to stub out the function pointers it returns. Note that as of this commit, LibUnicode is still directly linked against LibUnicodeData. This commit is just a first step towards removing that. 
--- Meta/Lagom/CMakeLists.txt | 1 + .../LibUnicode/GenerateUnicodeData.cpp | 38 ++---- .../CodeGenerators/LibUnicode/GeneratorUtil.h | 54 ++++++++ Userland/Libraries/LibUnicode/CMakeLists.txt | 1 + .../Libraries/LibUnicode/CharacterTypes.cpp | 125 ++++++++---------- .../Libraries/LibUnicode/UnicodeSymbols.cpp | 94 +++++++++++++ .../Libraries/LibUnicode/UnicodeSymbols.h | 44 ++++++ 7 files changed, 256 insertions(+), 101 deletions(-) create mode 100644 Userland/Libraries/LibUnicode/UnicodeSymbols.cpp create mode 100644 Userland/Libraries/LibUnicode/UnicodeSymbols.h diff --git a/Meta/Lagom/CMakeLists.txt b/Meta/Lagom/CMakeLists.txt index 174110e07d4..6fe3516a013 100644 --- a/Meta/Lagom/CMakeLists.txt +++ b/Meta/Lagom/CMakeLists.txt @@ -427,6 +427,7 @@ if (BUILD_LAGOM) SOURCES ${LIBUNICODE_SOURCES} ${UNICODE_DATA_SOURCES} ) target_compile_definitions(LagomUnicode PRIVATE ENABLE_UNICODE_DATA=$) + target_link_libraries(LagomUnicode -ldl) # WASM file(GLOB LIBWASM_SOURCES CONFIGURE_DEPENDS "../../Userland/Libraries/LibWasm/*/*.cpp") diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index c34ecf4b102..53cc9fb2e1b 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -574,8 +574,6 @@ enum class @name@ : @underlying@ {)~~~"); generator.append(R"~~~( #pragma once -#include -#include #include #include #include @@ -605,28 +603,6 @@ struct SpecialCasing { Condition condition { Condition::None }; }; -namespace Detail { - -Optional code_point_display_name(u32 code_point); - -u32 canonical_combining_class(u32 code_point); - -u32 simple_uppercase_mapping(u32 code_point); -u32 simple_lowercase_mapping(u32 code_point); -Span special_case_mapping(u32 code_point); - -bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); -Optional 
general_category_from_string(StringView general_category); - -bool code_point_has_property(u32 code_point, Property property); -Optional property_from_string(StringView property); - -bool code_point_has_script(u32 code_point, Script script); -bool code_point_has_script_extension(u32 code_point, Script script); -Optional