Bläddra i källkod

LibUnicode: Replace Unicode string normalization with ICU

In a benchmark, ICU's implementation was over 3x faster than ours.
Timothy Flynn 1 år sedan
förälder
incheckning
83475c5380

+ 0 - 205
Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp

@@ -39,14 +39,6 @@ struct CaseFolding {
     Vector<u32> mapping { 0 };
 };
 
-// https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
-struct CodePointDecomposition {
-    // `tag` is a string since it's used for codegen as an enum value.
-    ByteString tag { "Canonical"sv };
-    size_t decomposition_index { 0 };
-    size_t decomposition_size { 0 };
-};
-
 // https://www.unicode.org/reports/tr44/#PropList.txt
 using PropList = HashMap<ByteString, Vector<Unicode::CodePointRange>>;
 
@@ -89,7 +81,6 @@ struct CodePointData {
     u32 code_point { 0 };
     ByteString name;
     ByteString bidi_class;
-    Optional<CodePointDecomposition> decomposition_mapping;
     Optional<i8> numeric_value_decimal;
     Optional<i8> numeric_value_digit;
     Optional<i8> numeric_value_numeric;
@@ -119,17 +110,7 @@ struct CodePointBidiClass {
     ByteString bidi_class;
 };
 
-struct CodePointComposition {
-    u32 second_code_point { 0 };
-    u32 combined_code_point { 0 };
-};
-
 struct UnicodeData {
-    u32 code_points_with_decomposition_mapping { 0 };
-    Vector<u32> decomposition_mappings;
-    HashMap<u32, Vector<CodePointComposition>> composition_mappings;
-    Vector<ByteString> compatibility_tags;
-
     Vector<SpecialCasing> special_casing;
     u32 largest_special_casing_mapping_size { 0 };
     Vector<ByteString> conditions;
@@ -500,60 +481,11 @@ static ErrorOr<void> parse_normalization_props(Core::InputBufferedFile& file, Un
     return {};
 }
 
-static Optional<CodePointDecomposition> parse_decomposition_mapping(StringView string, UnicodeData& unicode_data)
-{
-    if (string.is_empty())
-        return {};
-
-    CodePointDecomposition mapping;
-
-    auto parts = string.split_view(' ');
-
-    VERIFY(parts.size() > 0);
-
-    if (parts.first().starts_with('<')) {
-        auto const tag = parts.take_first().trim("<>"sv);
-
-        mapping.tag = ByteString::formatted("{:c}{}", to_ascii_uppercase(tag[0]), tag.substring_view(1));
-
-        if (!unicode_data.compatibility_tags.contains_slow(mapping.tag))
-            unicode_data.compatibility_tags.append(mapping.tag);
-    }
-
-    mapping.decomposition_index = unicode_data.decomposition_mappings.size();
-    mapping.decomposition_size = parts.size();
-    for (auto part : parts) {
-        unicode_data.decomposition_mappings.append(AK::StringUtils::convert_to_uint_from_hex<u32>(part).value());
-    }
-
-    return mapping;
-}
-
-static void add_composition_mapping(u32 code_point, CodePointDecomposition& decomposition, UnicodeData& unicode_data, Vector<Unicode::CodePointRange> const& full_composition_exclusion_code_points)
-{
-    if (decomposition.decomposition_size != 2)
-        return;
-    if (decomposition.tag != "Canonical"sv)
-        return;
-    static Unicode::CodePointRangeComparator comparator {};
-    for (auto const& range : full_composition_exclusion_code_points) {
-        auto comparison = comparator(code_point, range);
-        if (comparison == 0)
-            return;
-        if (comparison < 0)
-            break;
-    }
-    u32 const first_code_point = unicode_data.decomposition_mappings[decomposition.decomposition_index];
-    u32 const second_code_point = unicode_data.decomposition_mappings[decomposition.decomposition_index + 1];
-    unicode_data.composition_mappings.ensure(first_code_point).append(CodePointComposition { .second_code_point = second_code_point, .combined_code_point = code_point });
-}
-
 static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeData& unicode_data)
 {
     Optional<u32> code_point_range_start;
 
     auto& assigned_code_points = unicode_data.prop_list.find("Assigned"sv)->value;
-    auto const& full_composition_exclusion_code_points = unicode_data.prop_list.find("Full_Composition_Exclusion"sv)->value;
     Optional<u32> assigned_code_point_range_start = 0;
     u32 previous_code_point = 0;
 
@@ -573,7 +505,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
         data.name = segments[1];
         data.casing.canonical_combining_class = AK::StringUtils::convert_to_uint<u8>(segments[3]).value();
         data.bidi_class = segments[4];
-        data.decomposition_mapping = parse_decomposition_mapping(segments[5], unicode_data);
         data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]);
         data.numeric_value_digit = AK::StringUtils::convert_to_int<i8>(segments[7]);
         data.numeric_value_numeric = AK::StringUtils::convert_to_int<i8>(segments[8]);
@@ -626,10 +557,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
                 data.casing.case_folding_indices.append(i);
         }
 
-        unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value();
-        if (data.decomposition_mapping.has_value())
-            add_composition_mapping(data.code_point, *data.decomposition_mapping, unicode_data, full_composition_exclusion_code_points);
-
         unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep);
 
         previous_code_point = data.code_point;
@@ -701,7 +628,6 @@ namespace Unicode {
     generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys());
     generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
     generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
-    generate_enum("CompatibilityFormattingTag"sv, "Canonical"sv, unicode_data.compatibility_tags);
     generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
 
     generator.append(R"~~~(
@@ -729,25 +655,6 @@ struct CaseFolding {
     u32 mapping_size { 0 };
 };
 
-struct CodePointDecompositionRaw {
-    u32 code_point { 0 };
-    CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
-    size_t decomposition_index { 0 };
-    size_t decomposition_count { 0 };
-};
-
-struct CodePointDecomposition {
-    u32 code_point { 0 };
-    CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
-    ReadonlySpan<u32> decomposition;
-};
-
-struct CodePointCompositionRaw {
-    u32 code_point { 0 };
-    u32 second_code_point { 0 };
-    u32 combined_code_point { 0 };
-};
-
 Optional<Locale> locale_from_string(StringView locale);
 
 ReadonlySpan<SpecialCasing> special_case_mapping(u32 code_point);
@@ -875,96 +782,6 @@ struct CodePointBidiClassComparator : public CodePointRangeComparator {
     }
 };
 
-)~~~");
-
-    generator.set("decomposition_mappings_size", ByteString::number(unicode_data.decomposition_mappings.size()));
-    generator.append("\nstatic constexpr Array<u32, @decomposition_mappings_size@> s_decomposition_mappings_data { ");
-    generator.append(ByteString::join(", "sv, unicode_data.decomposition_mappings, "{:#x}"sv));
-    generator.append(" };\n");
-
-    auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) {
-        generator.set("name", name);
-        generator.set("mapping_type", mapping_type);
-        generator.set("size", ByteString::number(size));
-
-        generator.append(R"~~~(
-static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
-    )~~~");
-
-        constexpr size_t max_mappings_per_row = 20;
-        size_t mappings_in_current_row = 0;
-
-        for (auto const& data : unicode_data.code_point_data) {
-            auto mapping = mapping_getter(data);
-
-            if constexpr (requires { mapping.has_value(); }) {
-                if (!mapping.has_value())
-                    continue;
-            } else {
-                if (mapping.is_empty())
-                    continue;
-            }
-
-            if (mappings_in_current_row++ > 0)
-                generator.append(" ");
-
-            generator.set("code_point", ByteString::formatted("{:#x}", data.code_point));
-            generator.append("{ @code_point@");
-
-            if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<size_t>>) {
-                generator.set("mapping", ByteString::formatted("{:#x}", *mapping));
-                generator.append(", @mapping@ },");
-            } else if constexpr (IsSame<decltype(mapping), Optional<CodePointDecomposition>>) {
-                generator.set("tag", mapping->tag);
-                generator.set("start", ByteString::number(mapping->decomposition_index));
-                generator.set("size", ByteString::number(mapping->decomposition_size));
-                generator.append(", CompatibilityFormattingTag::@tag@, @start@, @size@ },");
-            } else {
-                append_list_and_size(mapping, "&s_@name@[{}]"sv);
-                generator.append(" },");
-            }
-
-            if (mappings_in_current_row == max_mappings_per_row) {
-                mappings_in_current_row = 0;
-                generator.append("\n    ");
-            }
-        }
-        generator.append(R"~~~(
-} };
-)~~~");
-    };
-
-    append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, [](auto const& data) { return data.decomposition_mapping; });
-
-    size_t composition_mappings_size = 0;
-    for (auto const& entry : unicode_data.composition_mappings)
-        composition_mappings_size += entry.value.size();
-    generator.set("composition_mappings_size", ByteString::number(composition_mappings_size));
-    generator.append(R"~~~(
-static constexpr Array<CodePointCompositionRaw, @composition_mappings_size@> s_composition_mappings { {
-    )~~~");
-    constexpr size_t max_mappings_per_row = 40;
-    size_t mappings_in_current_row = 0;
-    auto first_code_points = unicode_data.composition_mappings.keys();
-    quick_sort(first_code_points);
-    for (auto const first_code_point : first_code_points) {
-        for (auto const& mapping : unicode_data.composition_mappings.find(first_code_point)->value) {
-            if (mappings_in_current_row++ > 0)
-                generator.append(" ");
-
-            generator.set("code_point", ByteString::formatted("{:#x}", first_code_point));
-            generator.set("second_code_point", ByteString::formatted("{:#x}", mapping.second_code_point));
-            generator.set("combined_code_point", ByteString::formatted("{:#x}", mapping.combined_code_point));
-            generator.append("{ @code_point@, @second_code_point@, @combined_code_point@ },");
-
-            if (mappings_in_current_row == max_mappings_per_row) {
-                mappings_in_current_row = 0;
-                generator.append("\n    ");
-            }
-        }
-    }
-    generator.append(R"~~~(
-} };
 )~~~");
 
     auto append_casing_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr<void> {
@@ -1167,28 +984,6 @@ ReadonlySpan<CaseFolding> case_folding_mapping(u32 code_point)
     return s_case_folding.span().slice(casing_table.case_folding_start_index, casing_table.case_folding_size);
 }
 
-Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point)
-{
-    auto const* mapping = binary_search(s_decomposition_mappings, code_point, nullptr, CodePointComparator<CodePointDecompositionRaw> {});
-    if (mapping == nullptr)
-        return {};
-    return CodePointDecomposition { mapping->code_point, mapping->tag, ReadonlySpan<u32> { s_decomposition_mappings_data.data() + mapping->decomposition_index, mapping->decomposition_count } };
-}
-
-Optional<u32> code_point_composition(u32 first_code_point, u32 second_code_point)
-{
-    size_t mapping_index;
-    if (!binary_search(s_composition_mappings, first_code_point, &mapping_index, CodePointComparator<CodePointCompositionRaw> {}))
-        return {};
-    while (mapping_index > 0 && s_composition_mappings[mapping_index - 1].code_point == first_code_point)
-        mapping_index--;
-    for (; mapping_index < s_composition_mappings.size() && s_composition_mappings[mapping_index].code_point == first_code_point; ++mapping_index) {
-        if (s_composition_mappings[mapping_index].second_code_point == second_code_point)
-            return s_composition_mappings[mapping_index].combined_code_point;
-    }
-    return {};
-}
-
 Optional<BidirectionalClass> bidirectional_class(u32 code_point)
 {
     if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {}))

+ 4 - 0
Userland/Libraries/LibUnicode/CMakeLists.txt

@@ -16,4 +16,8 @@ set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED})
 
 serenity_lib(LibUnicode unicode)
 
+find_package(ICU REQUIRED COMPONENTS data uc)
+target_include_directories(LibUnicode PRIVATE ${ICU_INCLUDE_DIRS})
+target_link_libraries(LibUnicode PUBLIC ${ICU_LIBRARIES})
+
 target_compile_definitions(LibUnicode PRIVATE ENABLE_UNICODE_DATA=$<BOOL:${ENABLE_UNICODE_DATABASE_DOWNLOAD}>)

+ 27 - 257
Userland/Libraries/LibUnicode/Normalize.cpp

@@ -1,27 +1,20 @@
 /*
  * Copyright (c) 2022, mat
+ * Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
-#include <AK/Find.h>
-#include <AK/QuickSort.h>
-#include <AK/Utf8View.h>
-#include <AK/Vector.h>
-#include <LibUnicode/CharacterTypes.h>
+#define AK_DONT_REPLACE_STD
+
+#include <AK/StringBuilder.h>
+#include <LibLocale/ICU.h>
 #include <LibUnicode/Normalize.h>
 
-#if ENABLE_UNICODE_DATA
-#    include <LibUnicode/UnicodeData.h>
-#else
-struct Unicode::CodePointDecomposition { };
-#endif
+#include <unicode/normalizer2.h>
 
 namespace Unicode {
 
-Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32) { return {}; }
-Optional<u32> __attribute__((weak)) code_point_composition(u32, u32) { return {}; }
-
 NormalizationForm normalization_form_from_string(StringView form)
 {
     if (form == "NFD"sv)
@@ -50,260 +43,37 @@ StringView normalization_form_to_string(NormalizationForm form)
     VERIFY_NOT_REACHED();
 }
 
-ALWAYS_INLINE static bool is_starter(u32 code_point)
-{
-    return Unicode::canonical_combining_class(code_point) == 0;
-}
-
-// From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
-static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00;
-static constexpr u32 HANGUL_LEADING_BASE = 0x1100;
-static constexpr u32 HANGUL_VOWEL_BASE = 0x1161;
-static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7;
-static constexpr u32 HANGUL_LEADING_COUNT = 19;
-static constexpr u32 HANGUL_VOWEL_COUNT = 21;
-static constexpr u32 HANGUL_TRAILING_COUNT = 28;
-// NCount in the standard.
-static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT;
-static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT;
-
-ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point)
-{
-    return code_point >= HANGUL_SYLLABLE_BASE && code_point < HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT;
-}
-
-ALWAYS_INLINE static bool is_hangul_leading(u32 code_point)
-{
-    return code_point >= HANGUL_LEADING_BASE && code_point < HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT;
-}
-
-ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point)
-{
-    return code_point >= HANGUL_VOWEL_BASE && code_point < HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT;
-}
-
-ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
-{
-    return code_point >= HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT;
-}
-
-// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
-static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
-{
-    auto const index = code_point - HANGUL_SYLLABLE_BASE;
-
-    auto const leading_index = index / HANGUL_BLOCK_COUNT;
-    auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT;
-    auto const trailing_index = index % HANGUL_TRAILING_COUNT;
-
-    auto const leading_part = HANGUL_LEADING_BASE + leading_index;
-    auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
-    auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
-
-    code_points_output.append(leading_part);
-    code_points_output.append(vowel_part);
-    if (trailing_index != 0)
-        code_points_output.append(trailing_part);
-}
-
-// L, V and LV, T Hangul Syllable Composition
-// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688
-static u32 combine_hangul_code_points(u32 a, u32 b)
-{
-    if (is_hangul_leading(a) && is_hangul_vowel(b)) {
-        auto const leading_index = a - HANGUL_LEADING_BASE;
-        auto const vowel_index = b - HANGUL_VOWEL_BASE;
-        auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT;
-        return HANGUL_SYLLABLE_BASE + leading_vowel_index;
-    }
-    // LV characters are the first in each "T block", so use this check to avoid combining LVT with T.
-    if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) {
-        return a + b - HANGUL_TRAILING_BASE;
-    }
-    return 0;
-}
-
-static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b)
-{
-#if ENABLE_UNICODE_DATA
-    auto composition = code_point_composition(a, b);
-    if (composition.has_value())
-        return composition.value();
-#endif
-
-    return 0;
-}
-
-enum class UseCompatibility {
-    Yes,
-    No
-};
-
-static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility)
-{
-    if (is_hangul_code_point(code_point))
-        return decompose_hangul_code_point(code_point, code_points_output);
-
-#if ENABLE_UNICODE_DATA
-    auto const mapping = Unicode::code_point_decomposition(code_point);
-    if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) {
-        for (auto code_point : mapping->decomposition) {
-            decompose_code_point(code_point, code_points_output, use_compatibility);
-        }
-    } else {
-        code_points_output.append(code_point);
-    }
-#endif
-}
-
-// This can be any sorting algorithm that maintains order (like std::stable_sort),
-// however bubble sort is easier to implement, so go with it (for now).
-template<typename T, typename LessThan>
-void bubble_sort(Span<T> span, LessThan less_than)
-{
-    for (size_t i = 0; i < span.size() - 1; ++i) {
-        for (size_t j = 0; j < span.size() - 1 - i; ++j) {
-            if (!less_than(span[j], span[j + 1]))
-                swap(span[j], span[j + 1]);
-        }
-    }
-}
-
-// The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
-// See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15
-// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591
-static void canonical_ordering_algorithm(Span<u32> code_points)
-{
-    for (size_t i = 0; i < code_points.size(); ++i) {
-        if (!is_starter(code_points[i])) {
-            auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter);
-            auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i));
-            // Nothing to reorder, so continue.
-            if (span_size <= 1)
-                continue;
-            Span<u32> const span { code_points.data() + i, span_size };
-
-            bubble_sort(span, [](u32 a, u32 b) {
-                // Use <= to keep ordering.
-                return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b);
-            });
-
-            // Skip over span we just sorted.
-            i += span_size - 1;
-        }
-    }
-}
-
-// See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard.
-static bool is_blocked(Span<u32> code_points, size_t a, size_t c)
-{
-    if (a == c - 1)
-        return false;
-    auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]);
-    auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]);
-    return b_combining_class >= c_combining_class;
-}
-
-// The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
-// See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15
-// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628
-static void canonical_composition_algorithm(Vector<u32>& code_points)
-{
-    if (code_points.size() <= 1)
-        return;
-    ssize_t last_starter = is_starter(code_points[0]) ? 0 : -1;
-    for (size_t i = 1; i < code_points.size(); ++i) {
-        auto const current_character = code_points[i];
-        // R1. Seek back (left) to find the last Starter L preceding C in the character sequence
-        if (last_starter == -1) {
-            if (is_starter(current_character))
-                last_starter = i;
-            continue;
-        }
-        // R2. If there is such an L, and C is not blocked from L,
-        //     and there exists a Primary Composite P which is canonically equivalent to <L, C>,
-        //     then replace L by P in the sequence and delete C from the sequence.
-        if (is_blocked(code_points.span(), last_starter, i)) {
-            if (is_starter(current_character))
-                last_starter = i;
-            continue;
-        }
-
-        auto composite = combine_hangul_code_points(code_points[last_starter], current_character);
-
-        if (composite == 0)
-            composite = combine_code_points(code_points[last_starter], current_character);
-
-        if (composite == 0) {
-            if (is_starter(current_character))
-                last_starter = i;
-            continue;
-        }
-
-        code_points[last_starter] = composite;
-        code_points.remove(i);
-        --i;
-    }
-}
-
-static Vector<u32> normalize_nfd(Utf8View string)
-{
-    Vector<u32> result;
-    for (auto const code_point : string)
-        decompose_code_point(code_point, result, UseCompatibility::No);
-
-    canonical_ordering_algorithm(result);
-    return result;
-}
-
-static Vector<u32> normalize_nfc(Utf8View string)
-{
-    auto result = normalize_nfd(string);
-    canonical_composition_algorithm(result);
-
-    return result;
-}
-
-static Vector<u32> normalize_nfkd(Utf8View string)
-{
-    Vector<u32> result;
-    for (auto const code_point : string)
-        decompose_code_point(code_point, result, UseCompatibility::Yes);
-
-    canonical_ordering_algorithm(result);
-    return result;
-}
-
-static Vector<u32> normalize_nfkc(Utf8View string)
+String normalize(StringView string, NormalizationForm form)
 {
-    auto result = normalize_nfkd(string);
-    canonical_composition_algorithm(result);
-
-    return result;
-}
+    UErrorCode status = U_ZERO_ERROR;
+    icu::Normalizer2 const* normalizer = nullptr;
 
-static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form)
-{
     switch (form) {
     case NormalizationForm::NFD:
-        return normalize_nfd(string);
+        normalizer = icu::Normalizer2::getNFDInstance(status);
+        break;
     case NormalizationForm::NFC:
-        return normalize_nfc(string);
+        normalizer = icu::Normalizer2::getNFCInstance(status);
+        break;
     case NormalizationForm::NFKD:
-        return normalize_nfkd(string);
+        normalizer = icu::Normalizer2::getNFKDInstance(status);
+        break;
     case NormalizationForm::NFKC:
-        return normalize_nfkc(string);
+        normalizer = icu::Normalizer2::getNFKCInstance(status);
+        break;
     }
-    VERIFY_NOT_REACHED();
-}
 
-String normalize(StringView string, NormalizationForm form)
-{
-    auto const code_points = normalize_implementation(Utf8View { string }, form);
+    if (Locale::icu_failure(status))
+        return MUST(String::from_utf8(string));
+
+    VERIFY(normalizer);
+
+    StringBuilder builder { string.length() };
+    icu::StringByteSink sink { &builder };
 
-    StringBuilder builder;
-    for (auto code_point : code_points)
-        builder.append_code_point(code_point);
+    normalizer->normalizeUTF8(0, Locale::icu_string_piece(string), sink, nullptr, status);
+    if (Locale::icu_failure(status))
+        return MUST(String::from_utf8(string));
 
     return MUST(builder.to_string());
 }

+ 3 - 11
Userland/Libraries/LibUnicode/Normalize.h

@@ -1,33 +1,25 @@
 /*
  * Copyright (c) 2022, mat
+ * Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
 #pragma once
 
-#include <AK/Error.h>
-#include <AK/Forward.h>
-#include <AK/Optional.h>
-#include <AK/Span.h>
 #include <AK/String.h>
 #include <AK/StringView.h>
-#include <LibUnicode/Forward.h>
 
 namespace Unicode {
 
-Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point);
-Optional<u32> code_point_composition(u32 first_code_point, u32 second_code_point);
-
 enum class NormalizationForm {
     NFD,
     NFC,
     NFKD,
     NFKC
 };
-
-NormalizationForm normalization_form_from_string(StringView form);
-StringView normalization_form_to_string(NormalizationForm form);
+NormalizationForm normalization_form_from_string(StringView);
+StringView normalization_form_to_string(NormalizationForm);
 
 String normalize(StringView string, NormalizationForm form);