mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-22 15:40:19 +00:00
LibUnicode: Implement locale-aware BEFORE_DOT special casing
Note that the algorithm in the Unicode spec is for checking that a code point precedes U+0307, but the special casing condition NotBeforeDot is interested in the inverse of this rule.
This commit is contained in:
parent
436faf9fd9
commit
50158abaf1
Notes:
sideshowbarker
2024-07-18 04:36:30 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/50158abaf19 Pull-request: https://github.com/SerenityOS/serenity/pull/9841 Reviewed-by: https://github.com/IdanHo Reviewed-by: https://github.com/linusg ✅
2 changed files with 50 additions and 1 deletions
|
@ -243,6 +243,29 @@ TEST_CASE(to_unicode_lowercase_special_casing_more_above)
|
|||
EXPECT_EQ(result, "\u012f\u0307\u0300"sv);
|
||||
}
|
||||
|
||||
TEST_CASE(to_unicode_lowercase_special_casing_not_before_dot)
{
    // Exercises the locale-sensitive lowercasing of LATIN CAPITAL LETTER I (U+0049):
    // for "az" and "tr" the result depends on whether a COMBINING DOT ABOVE (U+0307) follows it,
    // whereas "en" takes the ordinary lowercase mapping.

    // LATIN CAPITAL LETTER I on its own: "az"/"tr" expect LATIN SMALL LETTER DOTLESS I (U+0131).
    auto const lone_english = Unicode::to_unicode_lowercase_full("I"sv, "en"sv);
    EXPECT_EQ(lone_english, "i"sv);

    auto const lone_azerbaijani = Unicode::to_unicode_lowercase_full("I"sv, "az"sv);
    EXPECT_EQ(lone_azerbaijani, "\u0131"sv);

    auto const lone_turkish = Unicode::to_unicode_lowercase_full("I"sv, "tr"sv);
    EXPECT_EQ(lone_turkish, "\u0131"sv);

    // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE: "az"/"tr" expect a plain "i"
    // with the combining dot removed, while "en" keeps the dot.
    auto const dotted_english = Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv);
    EXPECT_EQ(dotted_english, "i\u0307"sv);

    auto const dotted_azerbaijani = Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv);
    EXPECT_EQ(dotted_azerbaijani, "i"sv);

    auto const dotted_turkish = Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv);
    EXPECT_EQ(dotted_turkish, "i"sv);
}
|
||||
|
||||
TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
|
||||
{
|
||||
// LATIN SMALL LETTER SHARP S
|
||||
|
|
|
@ -135,6 +135,30 @@ static bool is_followed_by_combining_class_above(Utf8View const& string, size_t
|
|||
return false;
|
||||
}
|
||||
|
||||
// Returns true when the code point occupying [index, index + byte_length) in `string` is followed by
// COMBINING DOT ABOVE (U+0307).
//
// Per the Unicode casing context rules, any run of characters whose canonical combining class is
// neither 0 nor 230 may sit between the current character and the combining dot above. The scan
// therefore gives up (returns false) at the first code point that has combining class 0 or 230, or
// for which no Unicode data is available.
static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
{
    constexpr auto combining_dot_above = 0x307;

    // Nothing follows the current code point, so there is nothing to scan.
    auto const remainder_offset = index + byte_length;
    if (remainder_offset >= string.byte_length())
        return false;

    auto const remainder = string.substring_view(remainder_offset);

    for (auto const next_code_point : remainder) {
        if (next_code_point == combining_dot_above)
            return true;

        auto const data = Detail::unicode_data_for_code_point(next_code_point);

        // Only code points with a known combining class other than 0 and 230 are allowed to intervene.
        bool const may_intervene = data.has_value()
            && data->canonical_combining_class != 0
            && data->canonical_combining_class != 230;
        if (!may_intervene)
            return false;
    }

    // Reached the end of the string without encountering U+0307.
    return false;
}
|
||||
|
||||
static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length, UnicodeData const& unicode_data)
|
||||
{
|
||||
auto requested_locale = Locale::None;
|
||||
|
@ -174,7 +198,9 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, O
|
|||
return special_casing;
|
||||
break;
|
||||
|
||||
default:
|
||||
case Condition::NotBeforeDot:
|
||||
if (!is_followed_by_combining_dot_above(string, index, byte_length))
|
||||
return special_casing;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue