mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-12-05 05:50:31 +00:00
LibUnicode+LibJS: Propagate OOM from Unicode normalization
This commit is contained in:
parent
1ff29afc45
commit
3d22efccca
Notes:
sideshowbarker
2024-07-17 03:03:37 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/3d22efccca Pull-request: https://github.com/SerenityOS/serenity/pull/16909
4 changed files with 74 additions and 82 deletions
|
@ -12,84 +12,84 @@ using namespace Unicode;
|
||||||
|
|
||||||
TEST_CASE(normalize_nfd)
|
TEST_CASE(normalize_nfd)
|
||||||
{
|
{
|
||||||
EXPECT_EQ(normalize(""sv, NormalizationForm::NFD), ""sv);
|
EXPECT_EQ(MUST(normalize(""sv, NormalizationForm::NFD)), ""sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("Hello"sv, NormalizationForm::NFD), "Hello"sv);
|
EXPECT_EQ(MUST(normalize("Hello"sv, NormalizationForm::NFD)), "Hello"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("Amélie"sv, NormalizationForm::NFD), "Ame\u0301lie"sv);
|
EXPECT_EQ(MUST(normalize("Amélie"sv, NormalizationForm::NFD)), "Ame\u0301lie"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("Office"sv, NormalizationForm::NFD), "Office"sv);
|
EXPECT_EQ(MUST(normalize("Office"sv, NormalizationForm::NFD)), "Office"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u1E9B\u0323"sv, NormalizationForm::NFD), "\u017F\u0323\u0307"sv);
|
EXPECT_EQ(MUST(normalize("\u1E9B\u0323"sv, NormalizationForm::NFD)), "\u017F\u0323\u0307"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u0112\u0300"sv, NormalizationForm::NFD), "\u0045\u0304\u0300"sv);
|
EXPECT_EQ(MUST(normalize("\u0112\u0300"sv, NormalizationForm::NFD)), "\u0045\u0304\u0300"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u03D3"sv, NormalizationForm::NFD), "\u03D2\u0301"sv);
|
EXPECT_EQ(MUST(normalize("\u03D3"sv, NormalizationForm::NFD)), "\u03D2\u0301"sv);
|
||||||
EXPECT_EQ(normalize("\u03D4"sv, NormalizationForm::NFD), "\u03D2\u0308"sv);
|
EXPECT_EQ(MUST(normalize("\u03D4"sv, NormalizationForm::NFD)), "\u03D2\u0308"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("닭"sv, NormalizationForm::NFD), "\u1103\u1161\u11B0"sv);
|
EXPECT_EQ(MUST(normalize("닭"sv, NormalizationForm::NFD)), "\u1103\u1161\u11B0"sv);
|
||||||
EXPECT_EQ(normalize("\u1100\uAC00\u11A8"sv, NormalizationForm::NFD), "\u1100\u1100\u1161\u11A8"sv);
|
EXPECT_EQ(MUST(normalize("\u1100\uAC00\u11A8"sv, NormalizationForm::NFD)), "\u1100\u1100\u1161\u11A8"sv);
|
||||||
|
|
||||||
// Composition exclusions.
|
// Composition exclusions.
|
||||||
EXPECT_EQ(normalize("\u0958"sv, NormalizationForm::NFD), "\u0915\u093C"sv);
|
EXPECT_EQ(MUST(normalize("\u0958"sv, NormalizationForm::NFD)), "\u0915\u093C"sv);
|
||||||
EXPECT_EQ(normalize("\u2126"sv, NormalizationForm::NFD), "\u03A9"sv);
|
EXPECT_EQ(MUST(normalize("\u2126"sv, NormalizationForm::NFD)), "\u03A9"sv);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(normalize_nfc)
|
TEST_CASE(normalize_nfc)
|
||||||
{
|
{
|
||||||
EXPECT_EQ(normalize(""sv, NormalizationForm::NFC), ""sv);
|
EXPECT_EQ(MUST(normalize(""sv, NormalizationForm::NFC)), ""sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("Hello"sv, NormalizationForm::NFC), "Hello"sv);
|
EXPECT_EQ(MUST(normalize("Hello"sv, NormalizationForm::NFC)), "Hello"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("Office"sv, NormalizationForm::NFC), "Office"sv);
|
EXPECT_EQ(MUST(normalize("Office"sv, NormalizationForm::NFC)), "Office"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u1E9B\u0323"sv, NormalizationForm::NFC), "\u1E9B\u0323"sv);
|
EXPECT_EQ(MUST(normalize("\u1E9B\u0323"sv, NormalizationForm::NFC)), "\u1E9B\u0323"sv);
|
||||||
EXPECT_EQ(normalize("\u0044\u0307"sv, NormalizationForm::NFC), "\u1E0A"sv);
|
EXPECT_EQ(MUST(normalize("\u0044\u0307"sv, NormalizationForm::NFC)), "\u1E0A"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u0044\u0307\u0323"sv, NormalizationForm::NFC), "\u1E0C\u0307"sv);
|
EXPECT_EQ(MUST(normalize("\u0044\u0307\u0323"sv, NormalizationForm::NFC)), "\u1E0C\u0307"sv);
|
||||||
EXPECT_EQ(normalize("\u0044\u0323\u0307"sv, NormalizationForm::NFC), "\u1E0C\u0307"sv);
|
EXPECT_EQ(MUST(normalize("\u0044\u0323\u0307"sv, NormalizationForm::NFC)), "\u1E0C\u0307"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u0112\u0300"sv, NormalizationForm::NFC), "\u1E14"sv);
|
EXPECT_EQ(MUST(normalize("\u0112\u0300"sv, NormalizationForm::NFC)), "\u1E14"sv);
|
||||||
EXPECT_EQ(normalize("\u1E14\u0304"sv, NormalizationForm::NFC), "\u1E14\u0304"sv);
|
EXPECT_EQ(MUST(normalize("\u1E14\u0304"sv, NormalizationForm::NFC)), "\u1E14\u0304"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F"sv, NormalizationForm::NFC), "\u05B1\u05B8\u05B9\u0591\u05C3\u05B0\u05AC\u059F"sv);
|
EXPECT_EQ(MUST(normalize("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F"sv, NormalizationForm::NFC)), "\u05B1\u05B8\u05B9\u0591\u05C3\u05B0\u05AC\u059F"sv);
|
||||||
EXPECT_EQ(normalize("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD"sv, NormalizationForm::NFC), "\u05B0\u05B7\u05BC\u05A5\u0592\u05C0\u05AD\u05C4"sv);
|
EXPECT_EQ(MUST(normalize("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD"sv, NormalizationForm::NFC)), "\u05B0\u05B7\u05BC\u05A5\u0592\u05C0\u05AD\u05C4"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u03D3"sv, NormalizationForm::NFC), "\u03D3"sv);
|
EXPECT_EQ(MUST(normalize("\u03D3"sv, NormalizationForm::NFC)), "\u03D3"sv);
|
||||||
EXPECT_EQ(normalize("\u03D4"sv, NormalizationForm::NFC), "\u03D4"sv);
|
EXPECT_EQ(MUST(normalize("\u03D4"sv, NormalizationForm::NFC)), "\u03D4"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u0958"sv, NormalizationForm::NFC), "\u0915\u093C"sv);
|
EXPECT_EQ(MUST(normalize("\u0958"sv, NormalizationForm::NFC)), "\u0915\u093C"sv);
|
||||||
EXPECT_EQ(normalize("\u2126"sv, NormalizationForm::NFC), "\u03A9"sv);
|
EXPECT_EQ(MUST(normalize("\u2126"sv, NormalizationForm::NFC)), "\u03A9"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u1103\u1161\u11B0"sv, NormalizationForm::NFC), "닭"sv);
|
EXPECT_EQ(MUST(normalize("\u1103\u1161\u11B0"sv, NormalizationForm::NFC)), "닭"sv);
|
||||||
EXPECT_EQ(normalize("\u1100\uAC00\u11A8"sv, NormalizationForm::NFC), "\u1100\uAC01"sv);
|
EXPECT_EQ(MUST(normalize("\u1100\uAC00\u11A8"sv, NormalizationForm::NFC)), "\u1100\uAC01"sv);
|
||||||
EXPECT_EQ(normalize("\u1103\u1161\u11B0\u11B0"sv, NormalizationForm::NFC), "닭\u11B0");
|
EXPECT_EQ(MUST(normalize("\u1103\u1161\u11B0\u11B0"sv, NormalizationForm::NFC)), "닭\u11B0");
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(normalize_nfkd)
|
TEST_CASE(normalize_nfkd)
|
||||||
{
|
{
|
||||||
EXPECT_EQ(normalize(""sv, NormalizationForm::NFKD), ""sv);
|
EXPECT_EQ(MUST(normalize(""sv, NormalizationForm::NFKD)), ""sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("Office"sv, NormalizationForm::NFKD), "Office"sv);
|
EXPECT_EQ(MUST(normalize("Office"sv, NormalizationForm::NFKD)), "Office"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("¼"sv, NormalizationForm::NFKD), "1\u20444"sv);
|
EXPECT_EQ(MUST(normalize("¼"sv, NormalizationForm::NFKD)), "1\u20444"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u03D3"sv, NormalizationForm::NFKD), "\u03A5\u0301"sv);
|
EXPECT_EQ(MUST(normalize("\u03D3"sv, NormalizationForm::NFKD)), "\u03A5\u0301"sv);
|
||||||
EXPECT_EQ(normalize("\u03D4"sv, NormalizationForm::NFKD), "\u03A5\u0308"sv);
|
EXPECT_EQ(MUST(normalize("\u03D4"sv, NormalizationForm::NFKD)), "\u03A5\u0308"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u0958"sv, NormalizationForm::NFKD), "\u0915\u093C"sv);
|
EXPECT_EQ(MUST(normalize("\u0958"sv, NormalizationForm::NFKD)), "\u0915\u093C"sv);
|
||||||
EXPECT_EQ(normalize("\u2126"sv, NormalizationForm::NFKD), "\u03A9"sv);
|
EXPECT_EQ(MUST(normalize("\u2126"sv, NormalizationForm::NFKD)), "\u03A9"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\uFDFA"sv, NormalizationForm::NFKD), "\u0635\u0644\u0649\u0020\u0627\u0644\u0644\u0647\u0020\u0639\u0644\u064A\u0647\u0020\u0648\u0633\u0644\u0645"sv);
|
EXPECT_EQ(MUST(normalize("\uFDFA"sv, NormalizationForm::NFKD)), "\u0635\u0644\u0649\u0020\u0627\u0644\u0644\u0647\u0020\u0639\u0644\u064A\u0647\u0020\u0648\u0633\u0644\u0645"sv);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(normalize_nfkc)
|
TEST_CASE(normalize_nfkc)
|
||||||
{
|
{
|
||||||
EXPECT_EQ(normalize(""sv, NormalizationForm::NFKC), ""sv);
|
EXPECT_EQ(MUST(normalize(""sv, NormalizationForm::NFKC)), ""sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u03D3"sv, NormalizationForm::NFKC), "\u038E"sv);
|
EXPECT_EQ(MUST(normalize("\u03D3"sv, NormalizationForm::NFKC)), "\u038E"sv);
|
||||||
EXPECT_EQ(normalize("\u03D4"sv, NormalizationForm::NFKC), "\u03AB"sv);
|
EXPECT_EQ(MUST(normalize("\u03D4"sv, NormalizationForm::NFKC)), "\u03AB"sv);
|
||||||
|
|
||||||
EXPECT_EQ(normalize("\u0958"sv, NormalizationForm::NFKC), "\u0915\u093C"sv);
|
EXPECT_EQ(MUST(normalize("\u0958"sv, NormalizationForm::NFKC)), "\u0915\u093C"sv);
|
||||||
EXPECT_EQ(normalize("\u2126"sv, NormalizationForm::NFKC), "\u03A9"sv);
|
EXPECT_EQ(MUST(normalize("\u2126"sv, NormalizationForm::NFKC)), "\u03A9"sv);
|
||||||
}
|
}
|
||||||
|
|
|
@ -499,7 +499,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::normalize)
|
||||||
|
|
||||||
// 6. Let ns be the String value that is the result of normalizing S into the normalization form named by f as specified in https://unicode.org/reports/tr15/.
|
// 6. Let ns be the String value that is the result of normalizing S into the normalization form named by f as specified in https://unicode.org/reports/tr15/.
|
||||||
auto unicode_form = Unicode::normalization_form_from_string(form);
|
auto unicode_form = Unicode::normalization_form_from_string(form);
|
||||||
auto ns = Unicode::normalize(string, unicode_form);
|
auto ns = TRY_OR_THROW_OOM(vm, Unicode::normalize(string, unicode_form));
|
||||||
|
|
||||||
// 7. return ns.
|
// 7. return ns.
|
||||||
return PrimitiveString::create(vm, move(ns));
|
return PrimitiveString::create(vm, move(ns));
|
||||||
|
|
|
@ -88,7 +88,7 @@ ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
|
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
|
||||||
static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
|
static ErrorOr<void> decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
|
||||||
{
|
{
|
||||||
auto const index = code_point - HANGUL_SYLLABLE_BASE;
|
auto const index = code_point - HANGUL_SYLLABLE_BASE;
|
||||||
|
|
||||||
|
@ -100,10 +100,12 @@ static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points
|
||||||
auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
|
auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
|
||||||
auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
|
auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
|
||||||
|
|
||||||
code_points_output.append(leading_part);
|
TRY(code_points_output.try_append(leading_part));
|
||||||
code_points_output.append(vowel_part);
|
TRY(code_points_output.try_append(vowel_part));
|
||||||
if (trailing_index != 0)
|
if (trailing_index != 0)
|
||||||
code_points_output.append(trailing_part);
|
TRY(code_points_output.try_append(trailing_part));
|
||||||
|
|
||||||
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
// L, V and LV, T Hangul Syllable Composition
|
// L, V and LV, T Hangul Syllable Composition
|
||||||
|
@ -150,23 +152,23 @@ enum class UseCompatibility {
|
||||||
No
|
No
|
||||||
};
|
};
|
||||||
|
|
||||||
static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility)
|
static ErrorOr<void> decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility)
|
||||||
{
|
{
|
||||||
if (is_hangul_code_point(code_point)) {
|
if (is_hangul_code_point(code_point))
|
||||||
decompose_hangul_code_point(code_point, code_points_output);
|
return decompose_hangul_code_point(code_point, code_points_output);
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if ENABLE_UNICODE_DATA
|
#if ENABLE_UNICODE_DATA
|
||||||
auto const mapping = Unicode::code_point_decomposition(code_point);
|
auto const mapping = Unicode::code_point_decomposition(code_point);
|
||||||
if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) {
|
if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) {
|
||||||
for (auto code_point : mapping->decomposition) {
|
for (auto code_point : mapping->decomposition) {
|
||||||
decompose_code_point(code_point, code_points_output, use_compatibility);
|
TRY(decompose_code_point(code_point, code_points_output, use_compatibility));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
code_points_output.append(code_point);
|
TRY(code_points_output.try_append(code_point));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
// This can be any sorting algorithm that maintains order (like std::stable_sort),
|
// This can be any sorting algorithm that maintains order (like std::stable_sort),
|
||||||
|
@ -249,51 +251,43 @@ static void canonical_composition_algorithm(Vector<u32>& code_points)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static Vector<u32> normalize_nfd(Utf8View string)
|
static ErrorOr<Vector<u32>> normalize_nfd(Utf8View string)
|
||||||
{
|
{
|
||||||
Vector<u32> result;
|
Vector<u32> result;
|
||||||
|
for (auto const code_point : string)
|
||||||
for (auto const code_point : string) {
|
TRY(decompose_code_point(code_point, result, UseCompatibility::No));
|
||||||
decompose_code_point(code_point, result, UseCompatibility::No);
|
|
||||||
}
|
|
||||||
|
|
||||||
canonical_ordering_algorithm(result);
|
canonical_ordering_algorithm(result);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Vector<u32> normalize_nfc(Utf8View string)
|
static ErrorOr<Vector<u32>> normalize_nfc(Utf8View string)
|
||||||
{
|
{
|
||||||
auto result = normalize_nfd(string);
|
auto result = TRY(normalize_nfd(string));
|
||||||
|
|
||||||
canonical_composition_algorithm(result);
|
canonical_composition_algorithm(result);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Vector<u32> normalize_nfkd(Utf8View string)
|
static ErrorOr<Vector<u32>> normalize_nfkd(Utf8View string)
|
||||||
{
|
{
|
||||||
Vector<u32> result;
|
Vector<u32> result;
|
||||||
|
for (auto const code_point : string)
|
||||||
for (auto const code_point : string) {
|
TRY(decompose_code_point(code_point, result, UseCompatibility::Yes));
|
||||||
decompose_code_point(code_point, result, UseCompatibility::Yes);
|
|
||||||
}
|
|
||||||
|
|
||||||
canonical_ordering_algorithm(result);
|
canonical_ordering_algorithm(result);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Vector<u32> normalize_nfkc(Utf8View string)
|
static ErrorOr<Vector<u32>> normalize_nfkc(Utf8View string)
|
||||||
{
|
{
|
||||||
auto result = normalize_nfkd(string);
|
auto result = TRY(normalize_nfkd(string));
|
||||||
|
|
||||||
canonical_composition_algorithm(result);
|
canonical_composition_algorithm(result);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form)
|
static ErrorOr<Vector<u32>> normalize_implementation(Utf8View string, NormalizationForm form)
|
||||||
{
|
{
|
||||||
switch (form) {
|
switch (form) {
|
||||||
case NormalizationForm::NFD:
|
case NormalizationForm::NFD:
|
||||||
|
@ -308,16 +302,13 @@ static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm f
|
||||||
VERIFY_NOT_REACHED();
|
VERIFY_NOT_REACHED();
|
||||||
}
|
}
|
||||||
|
|
||||||
DeprecatedString normalize(StringView string, NormalizationForm form)
|
ErrorOr<DeprecatedString> normalize(StringView string, NormalizationForm form)
|
||||||
{
|
{
|
||||||
Utf8View const view { string };
|
auto const code_points = TRY(normalize_implementation(Utf8View { string }, form));
|
||||||
|
|
||||||
auto const code_points = normalize_implementation(view, form);
|
|
||||||
|
|
||||||
StringBuilder builder;
|
StringBuilder builder;
|
||||||
for (auto code_point : code_points) {
|
for (auto code_point : code_points)
|
||||||
builder.append_code_point(code_point);
|
TRY(builder.try_append_code_point(code_point));
|
||||||
}
|
|
||||||
|
|
||||||
return builder.to_deprecated_string();
|
return builder.to_deprecated_string();
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <AK/DeprecatedString.h>
|
#include <AK/DeprecatedString.h>
|
||||||
|
#include <AK/Error.h>
|
||||||
#include <AK/Forward.h>
|
#include <AK/Forward.h>
|
||||||
#include <AK/Optional.h>
|
#include <AK/Optional.h>
|
||||||
#include <AK/Span.h>
|
#include <AK/Span.h>
|
||||||
|
@ -28,6 +29,6 @@ enum class NormalizationForm {
|
||||||
NormalizationForm normalization_form_from_string(StringView form);
|
NormalizationForm normalization_form_from_string(StringView form);
|
||||||
StringView normalization_form_to_string(NormalizationForm form);
|
StringView normalization_form_to_string(NormalizationForm form);
|
||||||
|
|
||||||
[[nodiscard]] DeprecatedString normalize(StringView string, NormalizationForm form);
|
ErrorOr<DeprecatedString> normalize(StringView string, NormalizationForm form);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue