mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-22 15:40:19 +00:00
LibUnicode: Implement locale-aware AFTER_I special casing
This commit is contained in:
parent
68b2680040
commit
0053d48c41
Notes:
sideshowbarker
2024-07-18 04:36:42 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/0053d48c41c Pull-request: https://github.com/SerenityOS/serenity/pull/9841 Reviewed-by: https://github.com/IdanHo Reviewed-by: https://github.com/linusg ✅
3 changed files with 94 additions and 12 deletions
|
@ -155,6 +155,49 @@ TEST_CASE(to_unicode_lowercase_special_casing_sigma)
|
|||
EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
|
||||
}
|
||||
|
||||
TEST_CASE(to_unicode_lowercase_special_casing_i)
{
    // Each row lowercases `input` under `locale` and states the exact string expected back.
    struct Expectation {
        StringView input;
        StringView locale;
        StringView expected;
    };

    Expectation const expectations[] = {
        // LATIN CAPITAL LETTER I
        { "I"sv, "en"sv, "i"sv },
        { "I"sv, "az"sv, "\u0131"sv },
        { "I"sv, "tr"sv, "\u0131"sv },

        // LATIN CAPITAL LETTER I WITH DOT ABOVE
        { "\u0130"sv, "en"sv, "\u0069\u0307"sv },
        { "\u0130"sv, "az"sv, "i"sv },
        { "\u0130"sv, "tr"sv, "i"sv },

        // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
        { "I\u0307"sv, "en"sv, "i\u0307"sv },
        { "I\u0307"sv, "az"sv, "i"sv },
        { "I\u0307"sv, "tr"sv, "i"sv },

        // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE
        { "IA\u0307"sv, "en"sv, "ia\u0307"sv },
        { "IA\u0307"sv, "az"sv, "\u0131a\u0307"sv },
        { "IA\u0307"sv, "tr"sv, "\u0131a\u0307"sv },
    };

    // The rows are checked in the order listed above.
    for (auto const& expectation : expectations) {
        auto lowercased = Unicode::to_unicode_lowercase_full(expectation.input, expectation.locale);
        EXPECT_EQ(lowercased, expectation.expected);
    }
}
|
||||
|
||||
TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
|
||||
{
|
||||
// LATIN SMALL LETTER SHARP S
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#include <AK/Types.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <LibUnicode/Locale.h>
|
||||
|
||||
#if ENABLE_UNICODE_DATA
|
||||
# include <LibUnicode/UnicodeData.h>
|
||||
|
@ -22,6 +23,32 @@ namespace Unicode {
|
|||
|
||||
#if ENABLE_UNICODE_DATA
|
||||
|
||||
// Evaluates the "After_I" casing context for the code point C that begins where the prefix
// `string.substring_view(0, index)` ends:
//
//     There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
//
// Returns true when the condition holds, false otherwise (including when the prefix is empty).
static bool is_after_uppercase_i(Utf8View const& string, size_t index)
{
    auto preceding_view = string.substring_view(0, index);
    bool found_uppercase_i = false;

    // FIXME: Would be better if Utf8View supported reverse iteration.
    // Because we can only walk forwards, remember the most recent 'I' and forget it again as soon
    // as a code point that breaks the sequence is seen; whatever is remembered at the end is the answer.
    for (auto code_point : preceding_view) {
        if (code_point == 'I') {
            found_uppercase_i = true;
            continue;
        }

        auto unicode_data = Detail::unicode_data_for_code_point(code_point);

        // A code point we have no data for is handled like combining class 0 (the Unicode default
        // for code points without an explicit class): it ends any pending 'I' sequence, but must
        // not abort the scan, since a later 'I' may still directly precede C.
        bool const breaks_sequence = !unicode_data.has_value()
            || unicode_data->canonical_combining_class == 0
            || unicode_data->canonical_combining_class == 230;

        if (breaks_sequence)
            found_uppercase_i = false;
    }

    return found_uppercase_i;
}
|
||||
|
||||
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
|
||||
{
|
||||
// C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
|
||||
|
@ -62,19 +89,30 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
|
|||
return true;
|
||||
}
|
||||
|
||||
static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
|
||||
static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length, UnicodeData const& unicode_data)
|
||||
{
|
||||
auto requested_locale = Locale::None;
|
||||
|
||||
if (locale.has_value()) {
|
||||
if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
|
||||
requested_locale = *maybe_locale;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
|
||||
auto const* special_casing = unicode_data.special_casing[i];
|
||||
|
||||
if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
|
||||
return special_casing;
|
||||
|
||||
// FIXME: Handle locale.
|
||||
if (special_casing->locale != Locale::None)
|
||||
if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
|
||||
continue;
|
||||
|
||||
switch (special_casing->condition) {
|
||||
case Condition::None:
|
||||
return special_casing;
|
||||
|
||||
case Condition::AfterI:
|
||||
if (is_after_uppercase_i(string, index))
|
||||
return special_casing;
|
||||
break;
|
||||
|
||||
case Condition::FinalSigma:
|
||||
if (is_final_code_point(string, index, byte_length))
|
||||
return special_casing;
|
||||
|
@ -114,7 +152,7 @@ u32 to_unicode_uppercase(u32 code_point)
|
|||
#endif
|
||||
}
|
||||
|
||||
String to_unicode_lowercase_full(StringView const& string)
|
||||
String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
Utf8View view { string };
|
||||
|
@ -133,7 +171,7 @@ String to_unicode_lowercase_full(StringView const& string)
|
|||
continue;
|
||||
}
|
||||
|
||||
auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
|
||||
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
|
||||
if (!special_casing) {
|
||||
builder.append_code_point(unicode_data->simple_lowercase_mapping);
|
||||
continue;
|
||||
|
@ -149,7 +187,7 @@ String to_unicode_lowercase_full(StringView const& string)
|
|||
#endif
|
||||
}
|
||||
|
||||
String to_unicode_uppercase_full(StringView const& string)
|
||||
String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
Utf8View view { string };
|
||||
|
@ -168,7 +206,7 @@ String to_unicode_uppercase_full(StringView const& string)
|
|||
continue;
|
||||
}
|
||||
|
||||
auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
|
||||
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
|
||||
if (!special_casing) {
|
||||
builder.append_code_point(unicode_data->simple_uppercase_mapping);
|
||||
continue;
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/Types.h>
|
||||
#include <LibUnicode/Forward.h>
|
||||
|
@ -18,8 +19,8 @@ namespace Unicode {
|
|||
u32 to_unicode_lowercase(u32 code_point);
|
||||
u32 to_unicode_uppercase(u32 code_point);
|
||||
|
||||
String to_unicode_lowercase_full(StringView const&);
|
||||
String to_unicode_uppercase_full(StringView const&);
|
||||
String to_unicode_lowercase_full(StringView const&, Optional<StringView> locale = {});
|
||||
String to_unicode_uppercase_full(StringView const&, Optional<StringView> locale = {});
|
||||
|
||||
Optional<GeneralCategory> general_category_from_string(StringView const&);
|
||||
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
|
||||
|
|
Loading…
Reference in a new issue