diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index 8afbbdc5b4a..65aa4ca5d2e 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -464,4 +464,26 @@ TEST_CASE(script_extension) EXPECT(!Unicode::code_point_has_script(0x1dc1, script_greek)); EXPECT(Unicode::code_point_has_script_extension(0x1dc1, script_greek)); + + auto script_common = script("Common"sv); + auto script_zyyy = script("Zyyy"sv); + EXPECT_EQ(script_common, script_zyyy); + + EXPECT(Unicode::code_point_has_script(0x202f, script_common)); + EXPECT(!Unicode::code_point_has_script_extension(0x202f, script_common)); + + EXPECT(Unicode::code_point_has_script(0x3000, script_common)); + EXPECT(Unicode::code_point_has_script_extension(0x3000, script_common)); + + auto script_inherited = script("Inherited"sv); + auto script_qaai = script("Qaai"sv); + auto script_zinh = script("Zinh"sv); + EXPECT_EQ(script_inherited, script_qaai); + EXPECT_EQ(script_inherited, script_zinh); + + EXPECT(Unicode::code_point_has_script(0x1ced, script_inherited)); + EXPECT(!Unicode::code_point_has_script_extension(0x1ced, script_inherited)); + + EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited)); + EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited)); } diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp index 0446e6cbf27..5d1c289336c 100644 --- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -883,6 +883,33 @@ static void normalize_script_extensions(PropList& script_extensions, PropList co sort_and_merge_code_point_ranges(code_points); } + + // Lastly, the Common and Inherited script extensions are special. 
They must not contain any + // code points which appear in other script extensions. The ScriptExtensions UCD file does not + // list these extensions, therefore this peculiarity must be handled programmatically. + // https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values + auto code_point_has_other_extension = [&](StringView key, u32 code_point) { + for (auto const& extension : extensions) { + if (extension.key == key) + continue; + if (any_of(extension.value, [&](auto const& r) { return (r.first <= code_point) && (code_point <= r.last); })) + return true; + } + + return false; + }; + + auto get_code_points_without_other_extensions = [&](StringView key) { + auto code_points = flatten_code_point_ranges(script_list.find(key)->value); + code_points.remove_all_matching([&](u32 c) { return code_point_has_other_extension(key, c); }); + return code_points; + }; + + auto common_code_points = get_code_points_without_other_extensions("Common"sv); + script_extensions.set("Common"sv, form_code_point_ranges(common_code_points)); + + auto inherited_code_points = get_code_points_without_other_extensions("Inherited"sv); + script_extensions.set("Inherited"sv, form_code_point_ranges(inherited_code_points)); } int main(int argc, char** argv)