4 gadi atpakaļ · 39f971e42b
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -6,6 +6,7 @@
 
				 
			
 
				 #include <LibTest/TestCase.h>
			
 
				 
			
 
				+#include <AK/StringView.h>
			
 
				 #include <LibUnicode/CharacterTypes.h>
			
 
				 #include <ctype.h>
			
 
				 
			
@@ -48,3 +49,151 @@ TEST_CASE(to_unicode_uppercase)
 
				     EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
			
 
				     EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
			
 
				 }
			
 
				+
			
 
				+TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
			
 
				+{
			
 
				+    // LATIN SMALL LETTER SHARP S
			
 
				+    auto result = Unicode::to_unicode_lowercase_full("\u00DF"sv);
			
 
				+    EXPECT_EQ(result, "\u00DF");
			
 
				+
			
 
				+    // LATIN CAPITAL LETTER I WITH DOT ABOVE
			
 
				+    result = Unicode::to_unicode_lowercase_full("\u0130"sv);
			
 
				+    EXPECT_EQ(result, "\u0069\u0307");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FF
			
 
				+    result = Unicode::to_unicode_lowercase_full("\uFB00"sv);
			
 
				+    EXPECT_EQ(result, "\uFB00");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FI
			
 
				+    result = Unicode::to_unicode_lowercase_full("\uFB01"sv);
			
 
				+    EXPECT_EQ(result, "\uFB01");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FL
			
 
				+    result = Unicode::to_unicode_lowercase_full("\uFB02"sv);
			
 
				+    EXPECT_EQ(result, "\uFB02");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FFI
			
 
				+    result = Unicode::to_unicode_lowercase_full("\uFB03"sv);
			
 
				+    EXPECT_EQ(result, "\uFB03");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FFL
			
 
				+    result = Unicode::to_unicode_lowercase_full("\uFB04"sv);
			
 
				+    EXPECT_EQ(result, "\uFB04");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE LONG S T
			
 
				+    result = Unicode::to_unicode_lowercase_full("\uFB05"sv);
			
 
				+    EXPECT_EQ(result, "\uFB05");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE ST
			
 
				+    result = Unicode::to_unicode_lowercase_full("\uFB06"sv);
			
 
				+    EXPECT_EQ(result, "\uFB06");
			
 
				+
			
 
				+    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
			
 
				+    result = Unicode::to_unicode_lowercase_full("\u1FB7"sv);
			
 
				+    EXPECT_EQ(result, "\u1FB7");
			
 
				+
			
 
				+    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
			
 
				+    result = Unicode::to_unicode_lowercase_full("\u1FC7"sv);
			
 
				+    EXPECT_EQ(result, "\u1FC7");
			
 
				+
			
 
				+    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
			
 
				+    result = Unicode::to_unicode_lowercase_full("\u1FF7"sv);
			
 
				+    EXPECT_EQ(result, "\u1FF7");
			
 
				+}
			
 
				+
			
 
				+TEST_CASE(to_unicode_lowercase_special_casing_sigma)
			
 
				+{
			
 
				+    auto result = Unicode::to_unicode_lowercase_full("ABCI"sv);
			
 
				+    EXPECT_EQ(result, "abci");
			
 
				+
			
 
				+    // Sigma preceded by A
			
 
				+    result = Unicode::to_unicode_lowercase_full("A\u03A3"sv);
			
 
				+    EXPECT_EQ(result, "a\u03C2");
			
 
				+
			
 
				+    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR
			
 
				+    result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3"sv);
			
 
				+    EXPECT_EQ(result, "a\u180E\u03C2");
			
 
				+
			
 
				+    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B
			
 
				+    result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3B"sv);
			
 
				+    EXPECT_EQ(result, "a\u180E\u03C3b");
			
 
				+
			
 
				+    // Sigma followed by A
			
 
				+    result = Unicode::to_unicode_lowercase_full("\u03A3A"sv);
			
 
				+    EXPECT_EQ(result, "\u03C3a");
			
 
				+
			
 
				+    // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR
			
 
				+    result = Unicode::to_unicode_lowercase_full("A\u03A3\u180E"sv);
			
 
				+    EXPECT_EQ(result, "a\u03C2\u180E");
			
 
				+
			
 
				+    // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B
			
 
				+    result = Unicode::to_unicode_lowercase_full("A\u03A3\u180EB"sv);
			
 
				+    EXPECT_EQ(result, "a\u03C3\u180Eb");
			
 
				+
			
 
				+    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR
			
 
				+    result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180E"sv);
			
 
				+    EXPECT_EQ(result, "a\u180E\u03C2\u180E");
			
 
				+
			
 
				+    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B
			
 
				+    result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180EB"sv);
			
 
				+    EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
			
 
				+}
			
 
				+
			
 
				+TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
			
 
				+{
			
 
				+    // LATIN SMALL LETTER SHARP S
			
 
				+    auto result = Unicode::to_unicode_uppercase_full("\u00DF"sv);
			
 
				+    EXPECT_EQ(result, "\u0053\u0053");
			
 
				+
			
 
				+    // LATIN CAPITAL LETTER I WITH DOT ABOVE
			
 
				+    result = Unicode::to_unicode_uppercase_full("\u0130"sv);
			
 
				+    EXPECT_EQ(result, "\u0130");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FF
			
 
				+    result = Unicode::to_unicode_uppercase_full("\uFB00"sv);
			
 
				+    EXPECT_EQ(result, "\u0046\u0046");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FI
			
 
				+    result = Unicode::to_unicode_uppercase_full("\uFB01"sv);
			
 
				+    EXPECT_EQ(result, "\u0046\u0049");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FL
			
 
				+    result = Unicode::to_unicode_uppercase_full("\uFB02"sv);
			
 
				+    EXPECT_EQ(result, "\u0046\u004C");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FFI
			
 
				+    result = Unicode::to_unicode_uppercase_full("\uFB03"sv);
			
 
				+    EXPECT_EQ(result, "\u0046\u0046\u0049");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE FFL
			
 
				+    result = Unicode::to_unicode_uppercase_full("\uFB04"sv);
			
 
				+    EXPECT_EQ(result, "\u0046\u0046\u004C");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE LONG S T
			
 
				+    result = Unicode::to_unicode_uppercase_full("\uFB05"sv);
			
 
				+    EXPECT_EQ(result, "\u0053\u0054");
			
 
				+
			
 
				+    // LATIN SMALL LIGATURE ST
			
 
				+    result = Unicode::to_unicode_uppercase_full("\uFB06"sv);
			
 
				+    EXPECT_EQ(result, "\u0053\u0054");
			
 
				+
			
 
				+    // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
			
 
				+    result = Unicode::to_unicode_uppercase_full("\u0390"sv);
			
 
				+    EXPECT_EQ(result, "\u0399\u0308\u0301");
			
 
				+
			
 
				+    // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
			
 
				+    result = Unicode::to_unicode_uppercase_full("\u03B0"sv);
			
 
				+    EXPECT_EQ(result, "\u03A5\u0308\u0301");
			
 
				+
			
 
				+    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
			
 
				+    result = Unicode::to_unicode_uppercase_full("\u1FB7"sv);
			
 
				+    EXPECT_EQ(result, "\u0391\u0342\u0399");
			
 
				+
			
 
				+    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
			
 
				+    result = Unicode::to_unicode_uppercase_full("\u1FC7"sv);
			
 
				+    EXPECT_EQ(result, "\u0397\u0342\u0399");
			
 
				+
			
 
				+    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
			
 
				+    result = Unicode::to_unicode_uppercase_full("\u1FF7"sv);
			
 
				+    EXPECT_EQ(result, "\u03A9\u0342\u0399");
			
 
				+}
			
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -5,7 +5,9 @@
 
				  */
			
 
				 
			
 
				 #include <AK/Platform.h>
			
 
				+#include <AK/StringBuilder.h>
			
 
				 #include <AK/Types.h>
			
 
				+#include <AK/Utf8View.h>
			
 
				 #include <LibUnicode/CharacterTypes.h>
			
 
				 
			
 
				 #if ENABLE_UNICODE_DATA
			
@@ -14,8 +16,116 @@
 
				 #    include <AK/CharacterTypes.h>
			
 
				 #endif
			
 
				 
			
 
				+// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
			
 
				+// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
			
 
				+
			
 
				 namespace Unicode {
			
 
				 
			
 
				+#if ENABLE_UNICODE_DATA
			
 
				+
			
 
				+static bool is_cased_letter(UnicodeData const& unicode_data)
			
 
				+{
			
 
				+    // A character C is defined to be cased if and only if C has the Lowercase or Uppercase property
			
 
				+    // or has a General_Category value of Titlecase_Letter.
			
 
				+    switch (unicode_data.general_category) {
			
 
				+    case GeneralCategory::Ll: // FIXME: Should be Ll + Other_Lowercase (PropList.txt).
			
 
				+    case GeneralCategory::Lu: // FIXME: Should be Lu + Other_Uppercase (PropList.txt).
			
 
				+    case GeneralCategory::Lt:
			
 
				+        return true;
			
 
				+    default:
			
 
				+        return false;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+static bool is_case_ignorable(UnicodeData const& unicode_data)
			
 
				+{
			
 
				+    // A character C is defined to be case-ignorable if C has the value MidLetter (ML),
			
 
				+    // MidNumLet (MB), or Single_Quote (SQ) for the Word_Break property or its General_Category is
			
 
				+    // one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or
			
 
				+    // Modifier_Symbol (Sk).
			
 
				+    switch (unicode_data.general_category) {
			
 
				+    case GeneralCategory::Mn:
			
 
				+    case GeneralCategory::Me:
			
 
				+    case GeneralCategory::Cf:
			
 
				+    case GeneralCategory::Lm:
			
 
				+    case GeneralCategory::Sk:
			
 
				+        return true;
			
 
				+    default:
			
 
				+        // FIXME: Handle word break properties (auxiliary/WordBreakProperty.txt).
			
 
				+        return false;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
			
 
				+{
			
 
				+    // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
			
 
				+    // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
			
 
				+    // characters and then a cased letter.
			
 
				+    auto preceding_view = string.substring_view(0, index);
			
 
				+    auto following_view = ((index + byte_length) < string.byte_length())
			
 
				+        ? string.substring_view(index + byte_length)
			
 
				+        : Utf8View {};
			
 
				+
			
 
				+    size_t cased_letter_count = 0;
			
 
				+
			
 
				+    for (auto code_point : preceding_view) {
			
 
				+        auto unicode_data = unicode_data_for_code_point(code_point);
			
 
				+        if (!unicode_data.has_value())
			
 
				+            return false;
			
 
				+
			
 
				+        if (is_cased_letter(*unicode_data))
			
 
				+            ++cased_letter_count;
			
 
				+        else if (!is_case_ignorable(*unicode_data))
			
 
				+            cased_letter_count = 0;
			
 
				+    }
			
 
				+
			
 
				+    if (cased_letter_count == 0)
			
 
				+        return false;
			
 
				+
			
 
				+    for (auto code_point : following_view) {
			
 
				+        auto unicode_data = unicode_data_for_code_point(code_point);
			
 
				+        if (!unicode_data.has_value())
			
 
				+            return false;
			
 
				+
			
 
				+        if (is_case_ignorable(*unicode_data))
			
 
				+            continue;
			
 
				+        if (is_cased_letter(*unicode_data))
			
 
				+            return false;
			
 
				+
			
 
				+        break;
			
 
				+    }
			
 
				+
			
 
				+    return true;
			
 
				+}
			
 
				+
			
 
				+static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
			
 
				+{
			
 
				+    for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
			
 
				+        auto const* special_casing = unicode_data.special_casing[i];
			
 
				+
			
 
				+        if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
			
 
				+            return special_casing;
			
 
				+
			
 
				+        // FIXME: Handle locale.
			
 
				+        if (special_casing->locale != Locale::None)
			
 
				+            continue;
			
 
				+
			
 
				+        switch (special_casing->condition) {
			
 
				+        case Condition::FinalSigma:
			
 
				+            if (is_final_code_point(string, index, byte_length))
			
 
				+                return special_casing;
			
 
				+            break;
			
 
				+
			
 
				+        default:
			
 
				+            break;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    return nullptr;
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				 u32 to_unicode_lowercase(u32 code_point)
			
 
				 {
			
 
				 #if ENABLE_UNICODE_DATA
			
@@ -40,4 +150,74 @@ u32 to_unicode_uppercase(u32 code_point)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+String to_unicode_lowercase_full(StringView const& string)
			
 
				+{
			
 
				+#if ENABLE_UNICODE_DATA
			
 
				+    Utf8View view { string };
			
 
				+    StringBuilder builder;
			
 
				+
			
 
				+    size_t index = 0;
			
 
				+    for (auto it = view.begin(); it != view.end(); ++it) {
			
 
				+        u32 code_point = *it;
			
 
				+        size_t byte_length = it.underlying_code_point_length_in_bytes();
			
 
				+
			
 
				+        auto unicode_data = unicode_data_for_code_point(code_point);
			
 
				+        if (!unicode_data.has_value()) {
			
 
				+            builder.append_code_point(code_point);
			
 
				+            index += byte_length;
			
 
				+            continue;
			
 
				+        }
			
 
				+
			
 
				+        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
			
 
				+        if (!special_casing) {
			
 
				+            builder.append_code_point(unicode_data->simple_lowercase_mapping);
			
 
				+            index += byte_length;
			
 
				+            continue;
			
 
				+        }
			
 
				+
			
 
				+        for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
			
 
				+            builder.append_code_point(special_casing->lowercase_mapping[i]);
			
 
				+    }
			
 
				+
			
 
				+    return builder.build();
			
 
				+#else
			
 
				+    return string.to_lowercase_string();
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+String to_unicode_uppercase_full(StringView const& string)
			
 
				+{
			
 
				+#if ENABLE_UNICODE_DATA
			
 
				+    Utf8View view { string };
			
 
				+    StringBuilder builder;
			
 
				+
			
 
				+    size_t index = 0;
			
 
				+    for (auto it = view.begin(); it != view.end(); ++it) {
			
 
				+        u32 code_point = *it;
			
 
				+        size_t byte_length = it.underlying_code_point_length_in_bytes();
			
 
				+
			
 
				+        auto unicode_data = unicode_data_for_code_point(code_point);
			
 
				+        if (!unicode_data.has_value()) {
			
 
				+            builder.append_code_point(code_point);
			
 
				+            index += byte_length;
			
 
				+            continue;
			
 
				+        }
			
 
				+
			
 
				+        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
			
 
				+        if (!special_casing) {
			
 
				+            builder.append_code_point(unicode_data->simple_uppercase_mapping);
			
 
				+            index += byte_length;
			
 
				+            continue;
			
 
				+        }
			
 
				+
			
 
				+        for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
			
 
				+            builder.append_code_point(special_casing->uppercase_mapping[i]);
			
 
				+    }
			
 
				+
			
 
				+    return builder.build();
			
 
				+#else
			
 
				+    return string.to_uppercase_string();
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				 }
			
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -6,11 +6,18 @@
 
				 
			
 
				 #pragma once
			
 
				 
			
 
				+#include <AK/Forward.h>
			
 
				+#include <AK/String.h>
			
 
				 #include <AK/Types.h>
			
 
				 
			
 
				 namespace Unicode {
			
 
				 
			
 
				+// Note: The single code point case conversions only perform simple case mapping.
				+// Use the full-string transformations for full case mapping (including special casing).
			
 
				 u32 to_unicode_lowercase(u32 code_point);
			
 
				 u32 to_unicode_uppercase(u32 code_point);
			
 
				 
			
 
				+String to_unicode_lowercase_full(StringView const&);
			
 
				+String to_unicode_uppercase_full(StringView const&);
			
 
				+
			
 
				 }