LibUnicode: Implement locale-aware AFTER_I special casing

This commit is contained in:
Timothy Flynn 2021-09-05 14:39:20 -04:00 committed by Linus Groh
parent 68b2680040
commit 0053d48c41
Notes: sideshowbarker 2024-07-18 04:36:42 +09:00
3 changed files with 94 additions and 12 deletions

View file

@ -155,6 +155,49 @@ TEST_CASE(to_unicode_lowercase_special_casing_sigma)
EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
}
TEST_CASE(to_unicode_lowercase_special_casing_i)
{
    // Exercises the locale-sensitive lowercase mappings of "I": in Turkish (tr) and Azerbaijani (az),
    // U+0049 lowercases to dotless U+0131 and U+0130 lowercases to plain "i"; other locales (here "en")
    // use the default mappings.
    auto lowercase_for = [](StringView text, StringView locale) {
        return Unicode::to_unicode_lowercase_full(text, locale);
    };

    // LATIN CAPITAL LETTER I
    EXPECT_EQ(lowercase_for("I"sv, "en"sv), "i"sv);
    EXPECT_EQ(lowercase_for("I"sv, "az"sv), "\u0131"sv);
    EXPECT_EQ(lowercase_for("I"sv, "tr"sv), "\u0131"sv);

    // LATIN CAPITAL LETTER I WITH DOT ABOVE
    EXPECT_EQ(lowercase_for("\u0130"sv, "en"sv), "\u0069\u0307"sv);
    EXPECT_EQ(lowercase_for("\u0130"sv, "az"sv), "i"sv);
    EXPECT_EQ(lowercase_for("\u0130"sv, "tr"sv), "i"sv);

    // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE: tr/az drop the combining dot.
    EXPECT_EQ(lowercase_for("I\u0307"sv, "en"sv), "i\u0307"sv);
    EXPECT_EQ(lowercase_for("I\u0307"sv, "az"sv), "i"sv);
    EXPECT_EQ(lowercase_for("I\u0307"sv, "tr"sv), "i"sv);

    // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE: the intervening
    // class 0 character ("A") means the dot is no longer "after I", so it must be preserved.
    EXPECT_EQ(lowercase_for("IA\u0307"sv, "en"sv), "ia\u0307"sv);
    EXPECT_EQ(lowercase_for("IA\u0307"sv, "az"sv), "\u0131a\u0307"sv);
    EXPECT_EQ(lowercase_for("IA\u0307"sv, "tr"sv), "\u0131a\u0307"sv);
}
TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
{
// LATIN SMALL LETTER SHARP S

View file

@ -10,6 +10,7 @@
#include <AK/Types.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/Locale.h>
#if ENABLE_UNICODE_DATA
# include <LibUnicode/UnicodeData.h>
@ -22,6 +23,32 @@ namespace Unicode {
#if ENABLE_UNICODE_DATA
// Evaluates the After_I casing context for the code point C located at `index` within `string`.
// Returns true when an uppercase 'I' precedes C and no code point between that 'I' and C has a
// canonical combining class of 0 or 230 (Above); returns false otherwise.
static bool is_after_uppercase_i(Utf8View const& string, size_t index)
{
    // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
    auto preceding_view = string.substring_view(0, index);
    bool found_uppercase_i = false;

    // FIXME: Would be better if Utf8View supported reverse iteration.
    // Scanning forwards, the flag tracks whether the most recent 'I' is still "unblocked" when C is reached.
    for (auto code_point : preceding_view) {
        if (code_point == 'I') {
            found_uppercase_i = true;
            continue;
        }

        // A code point without Unicode data takes the default Canonical_Combining_Class of 0, so it
        // blocks any earlier 'I' just like an explicit class 0 character. It must not abort the scan:
        // an 'I' appearing later in the string may still directly precede C.
        auto unicode_data = Detail::unicode_data_for_code_point(code_point);
        u32 const combining_class = unicode_data.has_value() ? unicode_data->canonical_combining_class : 0;

        if (combining_class == 0 || combining_class == 230)
            found_uppercase_i = false;
    }

    return found_uppercase_i;
}
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
{
// C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
@ -62,19 +89,30 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
return true;
}
static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length, UnicodeData const& unicode_data)
{
auto requested_locale = Locale::None;
if (locale.has_value()) {
if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
requested_locale = *maybe_locale;
}
for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
auto const* special_casing = unicode_data.special_casing[i];
if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
return special_casing;
// FIXME: Handle locale.
if (special_casing->locale != Locale::None)
if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
continue;
switch (special_casing->condition) {
case Condition::None:
return special_casing;
case Condition::AfterI:
if (is_after_uppercase_i(string, index))
return special_casing;
break;
case Condition::FinalSigma:
if (is_final_code_point(string, index, byte_length))
return special_casing;
@ -114,7 +152,7 @@ u32 to_unicode_uppercase(u32 code_point)
#endif
}
String to_unicode_lowercase_full(StringView const& string)
String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
{
#if ENABLE_UNICODE_DATA
Utf8View view { string };
@ -133,7 +171,7 @@ String to_unicode_lowercase_full(StringView const& string)
continue;
}
auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
if (!special_casing) {
builder.append_code_point(unicode_data->simple_lowercase_mapping);
continue;
@ -149,7 +187,7 @@ String to_unicode_lowercase_full(StringView const& string)
#endif
}
String to_unicode_uppercase_full(StringView const& string)
String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
{
#if ENABLE_UNICODE_DATA
Utf8View view { string };
@ -168,7 +206,7 @@ String to_unicode_uppercase_full(StringView const& string)
continue;
}
auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
if (!special_casing) {
builder.append_code_point(unicode_data->simple_uppercase_mapping);
continue;

View file

@ -7,6 +7,7 @@
#pragma once
#include <AK/Forward.h>
#include <AK/Optional.h>
#include <AK/String.h>
#include <AK/Types.h>
#include <LibUnicode/Forward.h>
@ -18,8 +19,8 @@ namespace Unicode {
u32 to_unicode_lowercase(u32 code_point);
u32 to_unicode_uppercase(u32 code_point);
String to_unicode_lowercase_full(StringView const&);
String to_unicode_uppercase_full(StringView const&);
String to_unicode_lowercase_full(StringView const&, Optional<StringView> locale = {});
String to_unicode_uppercase_full(StringView const&, Optional<StringView> locale = {});
Optional<GeneralCategory> general_category_from_string(StringView const&);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);