瀏覽代碼

LibUnicode: Add public methods to compare and lookup Unicode properties

Adds methods to retrieve a Unicode property from a string and to check
if a code point matches a Unicode property.

Also adds a <LibUnicode/Forward.h> header.
Timothy Flynn 4 年之前
父節點
當前提交
f1809db994

+ 32 - 8
Userland/Libraries/LibUnicode/CharacterTypes.cpp

@@ -4,6 +4,7 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/CharacterTypes.h>
 #include <AK/Platform.h>
 #include <AK/StringBuilder.h>
 #include <AK/Types.h>
@@ -12,8 +13,6 @@
 
 #if ENABLE_UNICODE_DATA
 #    include <LibUnicode/UnicodeData.h>
-#else
-#    include <AK/CharacterTypes.h>
 #endif
 
 // For details on the algorithms used here, see Section 3.13 Default Case Algorithms
@@ -41,7 +40,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
     size_t cased_letter_count = 0;
 
     for (auto code_point : preceding_view) {
-        auto unicode_data = unicode_data_for_code_point(code_point);
+        auto unicode_data = Detail::unicode_data_for_code_point(code_point);
         if (!unicode_data.has_value())
             return false;
 
@@ -58,7 +57,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
         return false;
 
     for (auto code_point : following_view) {
-        auto unicode_data = unicode_data_for_code_point(code_point);
+        auto unicode_data = Detail::unicode_data_for_code_point(code_point);
         if (!unicode_data.has_value())
             return false;
 
@@ -107,7 +106,7 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, s
 u32 to_unicode_lowercase(u32 code_point)
 {
 #if ENABLE_UNICODE_DATA
-    auto unicode_data = unicode_data_for_code_point(code_point);
+    auto unicode_data = Detail::unicode_data_for_code_point(code_point);
     if (unicode_data.has_value())
         return unicode_data->simple_lowercase_mapping;
     return code_point;
@@ -119,7 +118,7 @@ u32 to_unicode_lowercase(u32 code_point)
 u32 to_unicode_uppercase(u32 code_point)
 {
 #if ENABLE_UNICODE_DATA
-    auto unicode_data = unicode_data_for_code_point(code_point);
+    auto unicode_data = Detail::unicode_data_for_code_point(code_point);
     if (unicode_data.has_value())
         return unicode_data->simple_uppercase_mapping;
     return code_point;
@@ -139,7 +138,7 @@ String to_unicode_lowercase_full(StringView const& string)
         u32 code_point = *it;
         size_t byte_length = it.underlying_code_point_length_in_bytes();
 
-        auto unicode_data = unicode_data_for_code_point(code_point);
+        auto unicode_data = Detail::unicode_data_for_code_point(code_point);
         if (!unicode_data.has_value()) {
             builder.append_code_point(code_point);
             index += byte_length;
@@ -174,7 +173,7 @@ String to_unicode_uppercase_full(StringView const& string)
         u32 code_point = *it;
         size_t byte_length = it.underlying_code_point_length_in_bytes();
 
-        auto unicode_data = unicode_data_for_code_point(code_point);
+        auto unicode_data = Detail::unicode_data_for_code_point(code_point);
         if (!unicode_data.has_value()) {
             builder.append_code_point(code_point);
             index += byte_length;
@@ -198,4 +197,29 @@ String to_unicode_uppercase_full(StringView const& string)
 #endif
 }
 
+// Public entry point for resolving a Unicode property name to a Property value.
+// The actual lookup is performed by the generated Detail::property_from_string();
+// when Unicode data is compiled out there is nothing to look up, so the result
+// is always an empty Optional.
+Optional<Property> property_from_string([[maybe_unused]] StringView const& property)
+{
+#if !ENABLE_UNICODE_DATA
+    return {};
+#else
+    return Detail::property_from_string(property);
+#endif
+}
+
+// Reports whether code_point carries the given Unicode property.
+//
+// Property::Any matches every valid Unicode code point. For any other property
+// the generated per-code-point data is consulted; a code point without data has
+// no properties. Values that are not valid Unicode code points never match.
+// Always returns false when Unicode data is compiled out.
+bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
+{
+#if ENABLE_UNICODE_DATA
+    // Detail::unicode_data_for_code_point() VERIFYs that its argument is a valid
+    // Unicode code point, so reject anything else here rather than crashing. This
+    // keeps the answer for out-of-range values consistent ("false") across all
+    // properties, which is what Property::Any already reported.
+    if (!is_unicode(code_point))
+        return false;
+
+    if (property == Property::Any)
+        return true;
+
+    auto unicode_data = Detail::unicode_data_for_code_point(code_point);
+    if (!unicode_data.has_value())
+        return false;
+
+    return has_property(*unicode_data, property);
+#else
+    return false;
+#endif
+}
+
 }

+ 4 - 0
Userland/Libraries/LibUnicode/CharacterTypes.h

@@ -9,6 +9,7 @@
 #include <AK/Forward.h>
 #include <AK/String.h>
 #include <AK/Types.h>
+#include <LibUnicode/Forward.h>
 
 namespace Unicode {
 
@@ -20,4 +21,7 @@ u32 to_unicode_uppercase(u32 code_point);
 String to_unicode_lowercase_full(StringView const&);
 String to_unicode_uppercase_full(StringView const&);
 
+Optional<Property> property_from_string(StringView const&); // Empty when the name matches no property, or when Unicode data is disabled.
+bool code_point_has_property(u32 code_point, Property property); // Always false when Unicode data is disabled.
+
 }

+ 34 - 1
Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp

@@ -404,6 +404,7 @@ constexpr @name@ operator|(@name@ value1, @name@ value2)
 
 #include <AK/Optional.h>
 #include <AK/Types.h>
+#include <LibUnicode/Forward.h>
 
 namespace Unicode {
 )~~~");
@@ -411,7 +412,7 @@ namespace Unicode {
     generate_enum("Locale"sv, "None"sv, move(unicode_data.locales));
     generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
     generate_enum("GeneralCategory"sv, {}, move(unicode_data.general_categories));
-    generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), move(unicode_data.prop_aliases), true);
+    generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), unicode_data.prop_aliases, true);
     generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
 
     generator.append(R"~~~(
@@ -469,7 +470,12 @@ struct UnicodeData {
     WordBreakProperty word_break_property { WordBreakProperty::Other };
 };
 
+namespace Detail {
+
 Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
+Optional<Property> property_from_string(StringView const& property);
+
+}
 
 })~~~");
 
@@ -489,6 +495,7 @@ static void generate_unicode_data_implementation(UnicodeData unicode_data)
 #include <AK/Array.h>
 #include <AK/CharacterTypes.h>
 #include <AK/Find.h>
+#include <AK/StringView.h>
 #include <LibUnicode/UnicodeData.h>
 
 namespace Unicode {
@@ -597,6 +604,8 @@ static Optional<u32> index_of_code_point_in_range(u32 code_point)
     return {};
 }
 
+namespace Detail {
+
 Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
 {
     VERIFY(is_unicode(code_point));
@@ -618,6 +627,30 @@ Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
     return {};
 }
 
+Optional<Property> property_from_string(StringView const& property)
+{
+    if (property == "Assigned"sv)
+        return Property::Assigned;)~~~");
+
+    for (auto const& property : unicode_data.prop_list) {
+        generator.set("property", property.key);
+        generator.append(R"~~~(
+    if (property == "@property@"sv)
+        return Property::@property@;)~~~");
+    }
+    for (auto const& alias : unicode_data.prop_aliases) {
+        generator.set("property", alias.alias);
+        generator.append(R"~~~(
+    if (property == "@property@"sv)
+        return Property::@property@;)~~~");
+    }
+
+    generator.append(R"~~~(
+    return {};
+}
+
+}
+
 })~~~");
 
     outln("{}", generator.as_string_view());

+ 22 - 0
Userland/Libraries/LibUnicode/Forward.h

@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Types.h>
+
+namespace Unicode { // Forward declarations only; the definitions live elsewhere in LibUnicode.
+
+enum class Condition;
+enum class GeneralCategory;
+enum class Locale;
+enum class Property : u64; // Opaque declaration with a fixed underlying type; the definition must also specify u64.
+enum class WordBreakProperty;
+
+struct SpecialCasing;
+struct UnicodeData;
+
+}