瀏覽代碼

LibUnicode: Implement locale-aware AFTER_I special casing

Timothy Flynn 3 年之前
父節點
當前提交
0053d48c41

+ 43 - 0
Tests/LibUnicode/TestUnicodeCharacterTypes.cpp

@@ -155,6 +155,49 @@ TEST_CASE(to_unicode_lowercase_special_casing_sigma)
     EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
     EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
 }
 }
 
 
+// Exercises full lowercasing of LATIN CAPITAL LETTER I and its dotted variants, comparing the "en"
+// locale against the "az" and "tr" locales, which are expected to produce different results.
+TEST_CASE(to_unicode_lowercase_special_casing_i)
+{
+    // LATIN CAPITAL LETTER I
+    // "en" expects a plain "i"; "az" and "tr" expect U+0131 (LATIN SMALL LETTER DOTLESS I).
+    auto result = Unicode::to_unicode_lowercase_full("I"sv, "en"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I"sv, "az"sv);
+    EXPECT_EQ(result, "\u0131"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I"sv, "tr"sv);
+    EXPECT_EQ(result, "\u0131"sv);
+
+    // LATIN CAPITAL LETTER I WITH DOT ABOVE
+    // "en" expects the two code point sequence "i" + U+0307 (COMBINING DOT ABOVE); "az" and "tr"
+    // expect a single plain "i".
+    result = Unicode::to_unicode_lowercase_full("\u0130"sv, "en"sv);
+    EXPECT_EQ(result, "\u0069\u0307"sv);
+
+    result = Unicode::to_unicode_lowercase_full("\u0130"sv, "az"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    result = Unicode::to_unicode_lowercase_full("\u0130"sv, "tr"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
+    // "en" keeps the combining dot; "az" and "tr" expect the pair to collapse to a single plain "i"
+    // (this is the case that depends on the dot being recognized as directly following an 'I').
+    result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv);
+    EXPECT_EQ(result, "i\u0307"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE
+    // The 'A' sits between the 'I' and the dot, so the dot is expected to survive in every locale;
+    // "az" and "tr" still expect the 'I' itself to become U+0131.
+    result = Unicode::to_unicode_lowercase_full("IA\u0307"sv, "en"sv);
+    EXPECT_EQ(result, "ia\u0307"sv);
+
+    result = Unicode::to_unicode_lowercase_full("IA\u0307"sv, "az"sv);
+    EXPECT_EQ(result, "\u0131a\u0307"sv);
+
+    result = Unicode::to_unicode_lowercase_full("IA\u0307"sv, "tr"sv);
+    EXPECT_EQ(result, "\u0131a\u0307"sv);
+}
+
 TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
 TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
 {
 {
     // LATIN SMALL LETTER SHARP S
     // LATIN SMALL LETTER SHARP S

+ 48 - 10
Userland/Libraries/LibUnicode/CharacterTypes.cpp

@@ -10,6 +10,7 @@
 #include <AK/Types.h>
 #include <AK/Types.h>
 #include <AK/Utf8View.h>
 #include <AK/Utf8View.h>
 #include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/CharacterTypes.h>
+#include <LibUnicode/Locale.h>
 
 
 #if ENABLE_UNICODE_DATA
 #if ENABLE_UNICODE_DATA
 #    include <LibUnicode/UnicodeData.h>
 #    include <LibUnicode/UnicodeData.h>
@@ -22,6 +23,32 @@ namespace Unicode {
 
 
 #if ENABLE_UNICODE_DATA
 #if ENABLE_UNICODE_DATA
 
 
+// Evaluates the "After_I" casing context for the code point C located at `index` in `string`:
+// there is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
+// Returns true only when the most recent 'I' in string[0, index) has not been followed by such a
+// blocking code point.
+static bool is_after_uppercase_i(Utf8View const& string, size_t index)
+{
+    auto preceding_view = string.substring_view(0, index);
+    bool found_uppercase_i = false;
+
+    // FIXME: Would be better if Utf8View supported reverse iteration.
+    // Until then, scan forwards: remember each 'I' and forget it as soon as a blocking code point follows.
+    for (auto code_point : preceding_view) {
+        if (code_point == 'I') {
+            found_uppercase_i = true;
+            continue;
+        }
+
+        auto unicode_data = Detail::unicode_data_for_code_point(code_point);
+
+        // A code point without data is handled like combining class 0 (the Unicode default for code
+        // points with no explicit class): it blocks any earlier 'I'. We must keep scanning rather than
+        // bail out, because a later 'I' may still directly precede C.
+        if (!unicode_data.has_value()) {
+            found_uppercase_i = false;
+            continue;
+        }
+
+        auto combining_class = unicode_data->canonical_combining_class;
+        if (combining_class == 0 || combining_class == 230)
+            found_uppercase_i = false;
+    }
+
+    return found_uppercase_i;
+}
+
 static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
 static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
 {
 {
     // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
     // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
@@ -62,19 +89,30 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
     return true;
     return true;
 }
 }
 
 
-static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
+static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length, UnicodeData const& unicode_data)
 {
 {
+    auto requested_locale = Locale::None;
+
+    if (locale.has_value()) {
+        if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
+            requested_locale = *maybe_locale;
+    }
+
     for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
     for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
         auto const* special_casing = unicode_data.special_casing[i];
         auto const* special_casing = unicode_data.special_casing[i];
 
 
-        if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
-            return special_casing;
-
-        // FIXME: Handle locale.
-        if (special_casing->locale != Locale::None)
+        if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
             continue;
             continue;
 
 
         switch (special_casing->condition) {
         switch (special_casing->condition) {
+        case Condition::None:
+            return special_casing;
+
+        case Condition::AfterI:
+            if (is_after_uppercase_i(string, index))
+                return special_casing;
+            break;
+
         case Condition::FinalSigma:
         case Condition::FinalSigma:
             if (is_final_code_point(string, index, byte_length))
             if (is_final_code_point(string, index, byte_length))
                 return special_casing;
                 return special_casing;
@@ -114,7 +152,7 @@ u32 to_unicode_uppercase(u32 code_point)
 #endif
 #endif
 }
 }
 
 
-String to_unicode_lowercase_full(StringView const& string)
+String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
 {
 {
 #if ENABLE_UNICODE_DATA
 #if ENABLE_UNICODE_DATA
     Utf8View view { string };
     Utf8View view { string };
@@ -133,7 +171,7 @@ String to_unicode_lowercase_full(StringView const& string)
             continue;
             continue;
         }
         }
 
 
-        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
+        auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
         if (!special_casing) {
         if (!special_casing) {
             builder.append_code_point(unicode_data->simple_lowercase_mapping);
             builder.append_code_point(unicode_data->simple_lowercase_mapping);
             continue;
             continue;
@@ -149,7 +187,7 @@ String to_unicode_lowercase_full(StringView const& string)
 #endif
 #endif
 }
 }
 
 
-String to_unicode_uppercase_full(StringView const& string)
+String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
 {
 {
 #if ENABLE_UNICODE_DATA
 #if ENABLE_UNICODE_DATA
     Utf8View view { string };
     Utf8View view { string };
@@ -168,7 +206,7 @@ String to_unicode_uppercase_full(StringView const& string)
             continue;
             continue;
         }
         }
 
 
-        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
+        auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
         if (!special_casing) {
         if (!special_casing) {
             builder.append_code_point(unicode_data->simple_uppercase_mapping);
             builder.append_code_point(unicode_data->simple_uppercase_mapping);
             continue;
             continue;

+ 3 - 2
Userland/Libraries/LibUnicode/CharacterTypes.h

@@ -7,6 +7,7 @@
 #pragma once
 #pragma once
 
 
 #include <AK/Forward.h>
 #include <AK/Forward.h>
+#include <AK/Optional.h>
 #include <AK/String.h>
 #include <AK/String.h>
 #include <AK/Types.h>
 #include <AK/Types.h>
 #include <LibUnicode/Forward.h>
 #include <LibUnicode/Forward.h>
@@ -18,8 +19,8 @@ namespace Unicode {
 u32 to_unicode_lowercase(u32 code_point);
 u32 to_unicode_lowercase(u32 code_point);
 u32 to_unicode_uppercase(u32 code_point);
 u32 to_unicode_uppercase(u32 code_point);
 
 
-String to_unicode_lowercase_full(StringView const&);
-String to_unicode_uppercase_full(StringView const&);
+String to_unicode_lowercase_full(StringView const&, Optional<StringView> locale = {});
+String to_unicode_uppercase_full(StringView const&, Optional<StringView> locale = {});
 
 
 Optional<GeneralCategory> general_category_from_string(StringView const&);
 Optional<GeneralCategory> general_category_from_string(StringView const&);
 bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
 bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);