mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-25 09:00:22 +00:00
LibUnicode: Replace Unicode string normalization with ICU
In a benchmark, ICU's implementation was over 3x faster than ours.
This commit is contained in:
parent
187349e4db
commit
83475c5380
Notes:
sideshowbarker
2024-07-17 00:37:23 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/83475c5380 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/201
4 changed files with 41 additions and 480 deletions
|
@ -39,14 +39,6 @@ struct CaseFolding {
|
|||
Vector<u32> mapping { 0 };
|
||||
};
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
|
||||
struct CodePointDecomposition {
|
||||
// `tag` is a string since it's used for codegen as an enum value.
|
||||
ByteString tag { "Canonical"sv };
|
||||
size_t decomposition_index { 0 };
|
||||
size_t decomposition_size { 0 };
|
||||
};
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#PropList.txt
|
||||
using PropList = HashMap<ByteString, Vector<Unicode::CodePointRange>>;
|
||||
|
||||
|
@ -89,7 +81,6 @@ struct CodePointData {
|
|||
u32 code_point { 0 };
|
||||
ByteString name;
|
||||
ByteString bidi_class;
|
||||
Optional<CodePointDecomposition> decomposition_mapping;
|
||||
Optional<i8> numeric_value_decimal;
|
||||
Optional<i8> numeric_value_digit;
|
||||
Optional<i8> numeric_value_numeric;
|
||||
|
@ -119,17 +110,7 @@ struct CodePointBidiClass {
|
|||
ByteString bidi_class;
|
||||
};
|
||||
|
||||
struct CodePointComposition {
|
||||
u32 second_code_point { 0 };
|
||||
u32 combined_code_point { 0 };
|
||||
};
|
||||
|
||||
struct UnicodeData {
|
||||
u32 code_points_with_decomposition_mapping { 0 };
|
||||
Vector<u32> decomposition_mappings;
|
||||
HashMap<u32, Vector<CodePointComposition>> composition_mappings;
|
||||
Vector<ByteString> compatibility_tags;
|
||||
|
||||
Vector<SpecialCasing> special_casing;
|
||||
u32 largest_special_casing_mapping_size { 0 };
|
||||
Vector<ByteString> conditions;
|
||||
|
@ -500,60 +481,11 @@ static ErrorOr<void> parse_normalization_props(Core::InputBufferedFile& file, Un
|
|||
return {};
|
||||
}
|
||||
|
||||
// Parses the decomposition-mapping field of one code point record.
//
// The field is a space-separated list of hexadecimal code points, optionally preceded by a
// formatting tag wrapped in angle brackets. Parsed code points are appended to
// `unicode_data.decomposition_mappings`; the returned value records where they start and how
// many there are. Any tag seen for the first time is also added to
// `unicode_data.compatibility_tags`. An empty field yields an empty Optional.
static Optional<CodePointDecomposition> parse_decomposition_mapping(StringView string, UnicodeData& unicode_data)
{
    if (string.is_empty())
        return {};

    auto parts = string.split_view(' ');
    VERIFY(parts.size() > 0);

    CodePointDecomposition decomposition;

    if (parts.first().starts_with('<')) {
        auto const raw_tag = parts.take_first().trim("<>"sv);

        // Upper-case the first letter so the tag can be emitted as an enum value name.
        decomposition.tag = ByteString::formatted("{:c}{}", to_ascii_uppercase(raw_tag[0]), raw_tag.substring_view(1));

        if (!unicode_data.compatibility_tags.contains_slow(decomposition.tag))
            unicode_data.compatibility_tags.append(decomposition.tag);
    }

    // The mapped code points are stored contiguously at the end of the shared table.
    decomposition.decomposition_index = unicode_data.decomposition_mappings.size();
    decomposition.decomposition_size = parts.size();

    for (auto hex_code_point : parts) {
        auto const parsed = AK::StringUtils::convert_to_uint_from_hex<u32>(hex_code_point);
        unicode_data.decomposition_mappings.append(parsed.value());
    }

    return decomposition;
}
|
||||
|
||||
// Records `code_point` as the composition of its two-element canonical decomposition.
//
// Nothing is recorded when the decomposition is not exactly two code points long, is not tagged
// "Canonical", or when `code_point` falls inside one of the given exclusion ranges.
static void add_composition_mapping(u32 code_point, CodePointDecomposition& decomposition, UnicodeData& unicode_data, Vector<Unicode::CodePointRange> const& full_composition_exclusion_code_points)
{
    auto const is_canonical_pair = decomposition.decomposition_size == 2 && decomposition.tag == "Canonical"sv;
    if (!is_canonical_pair)
        return;

    static Unicode::CodePointRangeComparator comparator {};

    for (auto const& excluded_range : full_composition_exclusion_code_points) {
        auto const order = comparator(code_point, excluded_range);

        // Excluded code points never take part in composition.
        if (order == 0)
            return;

        // NOTE(review): stopping early presumably relies on the ranges being sorted in
        // ascending order - confirm against the code that fills the property list.
        if (order < 0)
            break;
    }

    auto const start = decomposition.decomposition_index;
    u32 const first_code_point = unicode_data.decomposition_mappings[start];

    CodePointComposition const composition {
        .second_code_point = unicode_data.decomposition_mappings[start + 1],
        .combined_code_point = code_point,
    };

    unicode_data.composition_mappings.ensure(first_code_point).append(composition);
}
|
||||
|
||||
static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeData& unicode_data)
|
||||
{
|
||||
Optional<u32> code_point_range_start;
|
||||
|
||||
auto& assigned_code_points = unicode_data.prop_list.find("Assigned"sv)->value;
|
||||
auto const& full_composition_exclusion_code_points = unicode_data.prop_list.find("Full_Composition_Exclusion"sv)->value;
|
||||
Optional<u32> assigned_code_point_range_start = 0;
|
||||
u32 previous_code_point = 0;
|
||||
|
||||
|
@ -573,7 +505,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
|
|||
data.name = segments[1];
|
||||
data.casing.canonical_combining_class = AK::StringUtils::convert_to_uint<u8>(segments[3]).value();
|
||||
data.bidi_class = segments[4];
|
||||
data.decomposition_mapping = parse_decomposition_mapping(segments[5], unicode_data);
|
||||
data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]);
|
||||
data.numeric_value_digit = AK::StringUtils::convert_to_int<i8>(segments[7]);
|
||||
data.numeric_value_numeric = AK::StringUtils::convert_to_int<i8>(segments[8]);
|
||||
|
@ -626,10 +557,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
|
|||
data.casing.case_folding_indices.append(i);
|
||||
}
|
||||
|
||||
unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value();
|
||||
if (data.decomposition_mapping.has_value())
|
||||
add_composition_mapping(data.code_point, *data.decomposition_mapping, unicode_data, full_composition_exclusion_code_points);
|
||||
|
||||
unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep);
|
||||
|
||||
previous_code_point = data.code_point;
|
||||
|
@ -701,7 +628,6 @@ namespace Unicode {
|
|||
generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys());
|
||||
generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
|
||||
generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
|
||||
generate_enum("CompatibilityFormattingTag"sv, "Canonical"sv, unicode_data.compatibility_tags);
|
||||
generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
|
||||
|
||||
generator.append(R"~~~(
|
||||
|
@ -729,25 +655,6 @@ struct CaseFolding {
|
|||
u32 mapping_size { 0 };
|
||||
};
|
||||
|
||||
struct CodePointDecompositionRaw {
|
||||
u32 code_point { 0 };
|
||||
CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
|
||||
size_t decomposition_index { 0 };
|
||||
size_t decomposition_count { 0 };
|
||||
};
|
||||
|
||||
struct CodePointDecomposition {
|
||||
u32 code_point { 0 };
|
||||
CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
|
||||
ReadonlySpan<u32> decomposition;
|
||||
};
|
||||
|
||||
struct CodePointCompositionRaw {
|
||||
u32 code_point { 0 };
|
||||
u32 second_code_point { 0 };
|
||||
u32 combined_code_point { 0 };
|
||||
};
|
||||
|
||||
Optional<Locale> locale_from_string(StringView locale);
|
||||
|
||||
ReadonlySpan<SpecialCasing> special_case_mapping(u32 code_point);
|
||||
|
@ -875,96 +782,6 @@ struct CodePointBidiClassComparator : public CodePointRangeComparator {
|
|||
}
|
||||
};
|
||||
|
||||
)~~~");
|
||||
|
||||
generator.set("decomposition_mappings_size", ByteString::number(unicode_data.decomposition_mappings.size()));
|
||||
generator.append("\nstatic constexpr Array<u32, @decomposition_mappings_size@> s_decomposition_mappings_data { ");
|
||||
generator.append(ByteString::join(", "sv, unicode_data.decomposition_mappings, "{:#x}"sv));
|
||||
generator.append(" };\n");
|
||||
|
||||
auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) {
|
||||
generator.set("name", name);
|
||||
generator.set("mapping_type", mapping_type);
|
||||
generator.set("size", ByteString::number(size));
|
||||
|
||||
generator.append(R"~~~(
|
||||
static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
|
||||
)~~~");
|
||||
|
||||
constexpr size_t max_mappings_per_row = 20;
|
||||
size_t mappings_in_current_row = 0;
|
||||
|
||||
for (auto const& data : unicode_data.code_point_data) {
|
||||
auto mapping = mapping_getter(data);
|
||||
|
||||
if constexpr (requires { mapping.has_value(); }) {
|
||||
if (!mapping.has_value())
|
||||
continue;
|
||||
} else {
|
||||
if (mapping.is_empty())
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mappings_in_current_row++ > 0)
|
||||
generator.append(" ");
|
||||
|
||||
generator.set("code_point", ByteString::formatted("{:#x}", data.code_point));
|
||||
generator.append("{ @code_point@");
|
||||
|
||||
if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<size_t>>) {
|
||||
generator.set("mapping", ByteString::formatted("{:#x}", *mapping));
|
||||
generator.append(", @mapping@ },");
|
||||
} else if constexpr (IsSame<decltype(mapping), Optional<CodePointDecomposition>>) {
|
||||
generator.set("tag", mapping->tag);
|
||||
generator.set("start", ByteString::number(mapping->decomposition_index));
|
||||
generator.set("size", ByteString::number(mapping->decomposition_size));
|
||||
generator.append(", CompatibilityFormattingTag::@tag@, @start@, @size@ },");
|
||||
} else {
|
||||
append_list_and_size(mapping, "&s_@name@[{}]"sv);
|
||||
generator.append(" },");
|
||||
}
|
||||
|
||||
if (mappings_in_current_row == max_mappings_per_row) {
|
||||
mappings_in_current_row = 0;
|
||||
generator.append("\n ");
|
||||
}
|
||||
}
|
||||
generator.append(R"~~~(
|
||||
} };
|
||||
)~~~");
|
||||
};
|
||||
|
||||
append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, [](auto const& data) { return data.decomposition_mapping; });
|
||||
|
||||
size_t composition_mappings_size = 0;
|
||||
for (auto const& entry : unicode_data.composition_mappings)
|
||||
composition_mappings_size += entry.value.size();
|
||||
generator.set("composition_mappings_size", ByteString::number(composition_mappings_size));
|
||||
generator.append(R"~~~(
|
||||
static constexpr Array<CodePointCompositionRaw, @composition_mappings_size@> s_composition_mappings { {
|
||||
)~~~");
|
||||
constexpr size_t max_mappings_per_row = 40;
|
||||
size_t mappings_in_current_row = 0;
|
||||
auto first_code_points = unicode_data.composition_mappings.keys();
|
||||
quick_sort(first_code_points);
|
||||
for (auto const first_code_point : first_code_points) {
|
||||
for (auto const& mapping : unicode_data.composition_mappings.find(first_code_point)->value) {
|
||||
if (mappings_in_current_row++ > 0)
|
||||
generator.append(" ");
|
||||
|
||||
generator.set("code_point", ByteString::formatted("{:#x}", first_code_point));
|
||||
generator.set("second_code_point", ByteString::formatted("{:#x}", mapping.second_code_point));
|
||||
generator.set("combined_code_point", ByteString::formatted("{:#x}", mapping.combined_code_point));
|
||||
generator.append("{ @code_point@, @second_code_point@, @combined_code_point@ },");
|
||||
|
||||
if (mappings_in_current_row == max_mappings_per_row) {
|
||||
mappings_in_current_row = 0;
|
||||
generator.append("\n ");
|
||||
}
|
||||
}
|
||||
}
|
||||
generator.append(R"~~~(
|
||||
} };
|
||||
)~~~");
|
||||
|
||||
auto append_casing_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr<void> {
|
||||
|
@ -1167,28 +984,6 @@ ReadonlySpan<CaseFolding> case_folding_mapping(u32 code_point)
|
|||
return s_case_folding.span().slice(casing_table.case_folding_start_index, casing_table.case_folding_size);
|
||||
}
|
||||
|
||||
Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point)
|
||||
{
|
||||
auto const* mapping = binary_search(s_decomposition_mappings, code_point, nullptr, CodePointComparator<CodePointDecompositionRaw> {});
|
||||
if (mapping == nullptr)
|
||||
return {};
|
||||
return CodePointDecomposition { mapping->code_point, mapping->tag, ReadonlySpan<u32> { s_decomposition_mappings_data.data() + mapping->decomposition_index, mapping->decomposition_count } };
|
||||
}
|
||||
|
||||
Optional<u32> code_point_composition(u32 first_code_point, u32 second_code_point)
|
||||
{
|
||||
size_t mapping_index;
|
||||
if (!binary_search(s_composition_mappings, first_code_point, &mapping_index, CodePointComparator<CodePointCompositionRaw> {}))
|
||||
return {};
|
||||
while (mapping_index > 0 && s_composition_mappings[mapping_index - 1].code_point == first_code_point)
|
||||
mapping_index--;
|
||||
for (; mapping_index < s_composition_mappings.size() && s_composition_mappings[mapping_index].code_point == first_code_point; ++mapping_index) {
|
||||
if (s_composition_mappings[mapping_index].second_code_point == second_code_point)
|
||||
return s_composition_mappings[mapping_index].combined_code_point;
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
Optional<BidirectionalClass> bidirectional_class(u32 code_point)
|
||||
{
|
||||
if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {}))
|
||||
|
|
|
@ -16,4 +16,8 @@ set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED})
|
|||
|
||||
serenity_lib(LibUnicode unicode)

# String normalization is delegated to ICU, so locate it and wire it into the library target.
find_package(ICU REQUIRED COMPONENTS data uc)
target_include_directories(LibUnicode PRIVATE ${ICU_INCLUDE_DIRS})
target_link_libraries(LibUnicode PUBLIC ${ICU_LIBRARIES})

# ENABLE_UNICODE_DATA expands to 1 when ENABLE_UNICODE_DATABASE_DOWNLOAD is truthy, else 0.
target_compile_definitions(LibUnicode PRIVATE ENABLE_UNICODE_DATA=$<BOOL:${ENABLE_UNICODE_DATABASE_DOWNLOAD}>)
|
||||
|
|
|
@ -1,27 +1,20 @@
|
|||
/*
|
||||
* Copyright (c) 2022, mat
|
||||
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Find.h>
|
||||
#include <AK/QuickSort.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#define AK_DONT_REPLACE_STD
|
||||
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <LibLocale/ICU.h>
|
||||
#include <LibUnicode/Normalize.h>
|
||||
|
||||
#if ENABLE_UNICODE_DATA
|
||||
# include <LibUnicode/UnicodeData.h>
|
||||
#else
|
||||
struct Unicode::CodePointDecomposition { };
|
||||
#endif
|
||||
#include <unicode/normalizer2.h>
|
||||
|
||||
namespace Unicode {
|
||||
|
||||
// Weak fallback definitions that report "no mapping". Being weak symbols, they give way to a
// strong definition of the same function when one is linked in (presumably the generated
// Unicode data - confirm against the build).
Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32)
{
    return {};
}

Optional<u32> __attribute__((weak)) code_point_composition(u32, u32)
{
    return {};
}
|
||||
|
||||
NormalizationForm normalization_form_from_string(StringView form)
|
||||
{
|
||||
if (form == "NFD"sv)
|
||||
|
@ -50,260 +43,37 @@ StringView normalization_form_to_string(NormalizationForm form)
|
|||
VERIFY_NOT_REACHED();
|
||||
}
|
||||
|
||||
// Returns true when `code_point` is a starter, i.e. its canonical combining class is zero.
ALWAYS_INLINE static bool is_starter(u32 code_point)
{
    auto const combining_class = Unicode::canonical_combining_class(code_point);
    return combining_class == 0;
}
|
||||
|
||||
// From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
|
||||
static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00;
|
||||
static constexpr u32 HANGUL_LEADING_BASE = 0x1100;
|
||||
static constexpr u32 HANGUL_VOWEL_BASE = 0x1161;
|
||||
static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7;
|
||||
static constexpr u32 HANGUL_LEADING_COUNT = 19;
|
||||
static constexpr u32 HANGUL_VOWEL_COUNT = 21;
|
||||
static constexpr u32 HANGUL_TRAILING_COUNT = 28;
|
||||
// NCount in the standard.
|
||||
static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT;
|
||||
static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT;
|
||||
|
||||
// Returns true when `code_point` lies in the precomposed Hangul syllable range
// [HANGUL_SYLLABLE_BASE, HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT).
ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point)
{
    constexpr u32 range_begin = HANGUL_SYLLABLE_BASE;
    constexpr u32 range_end = HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT;
    return range_begin <= code_point && code_point < range_end;
}
|
||||
|
||||
// Returns true when `code_point` lies in the range
// [HANGUL_LEADING_BASE, HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT).
ALWAYS_INLINE static bool is_hangul_leading(u32 code_point)
{
    constexpr u32 range_begin = HANGUL_LEADING_BASE;
    constexpr u32 range_end = HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT;
    return range_begin <= code_point && code_point < range_end;
}
|
||||
|
||||
// Returns true when `code_point` lies in the range
// [HANGUL_VOWEL_BASE, HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT).
ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point)
{
    constexpr u32 range_begin = HANGUL_VOWEL_BASE;
    constexpr u32 range_end = HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT;
    return range_begin <= code_point && code_point < range_end;
}
|
||||
|
||||
// Returns true when `code_point` is a Hangul trailing consonant usable in LV + T composition.
//
// HANGUL_TRAILING_BASE (TBase) is deliberately one less than the first trailing consonant, so
// that a trailing index of zero can mean "no trailing consonant". TBase itself is therefore
// NOT a trailing consonant and must be excluded: accepting it would let LV + TBase "compose"
// into the unchanged LV syllable and silently drop the TBase code point from the text.
ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
{
    return code_point > HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT;
}
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
|
||||
static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
|
||||
{
|
||||
auto const index = code_point - HANGUL_SYLLABLE_BASE;
|
||||
|
||||
auto const leading_index = index / HANGUL_BLOCK_COUNT;
|
||||
auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT;
|
||||
auto const trailing_index = index % HANGUL_TRAILING_COUNT;
|
||||
|
||||
auto const leading_part = HANGUL_LEADING_BASE + leading_index;
|
||||
auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
|
||||
auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
|
||||
|
||||
code_points_output.append(leading_part);
|
||||
code_points_output.append(vowel_part);
|
||||
if (trailing_index != 0)
|
||||
code_points_output.append(trailing_part);
|
||||
}
|
||||
|
||||
// L, V and LV, T Hangul Syllable Composition
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688
|
||||
static u32 combine_hangul_code_points(u32 a, u32 b)
|
||||
{
|
||||
if (is_hangul_leading(a) && is_hangul_vowel(b)) {
|
||||
auto const leading_index = a - HANGUL_LEADING_BASE;
|
||||
auto const vowel_index = b - HANGUL_VOWEL_BASE;
|
||||
auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT;
|
||||
return HANGUL_SYLLABLE_BASE + leading_vowel_index;
|
||||
}
|
||||
// LV characters are the first in each "T block", so use this check to avoid combining LVT with T.
|
||||
if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) {
|
||||
return a + b - HANGUL_TRAILING_BASE;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Looks up the composition of <a, b> via code_point_composition().
// Returns the composed code point, or 0 when there is none or when Unicode data is disabled.
static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b)
{
#if ENABLE_UNICODE_DATA
    return code_point_composition(a, b).value_or(0);
#else
    return 0;
#endif
}
|
||||
|
||||
// Selects whether decomposition mappings with a non-canonical tag are applied in addition to
// canonical ones.
enum class UseCompatibility {
    Yes,
    No,
};
|
||||
|
||||
// Recursively decomposes `code_point`, appending the fully decomposed sequence to
// `code_points_output`.
//
// - Precomposed Hangul syllables are decomposed arithmetically.
// - When Unicode data is available, a code point with a decomposition mapping is expanded if
//   the mapping is canonical, or if `use_compatibility` is Yes; each mapped code point is then
//   decomposed in turn.
// - Every other code point is appended unchanged. This includes the build without Unicode
//   data: previously the append lived inside the #if block, so such builds silently dropped
//   every non-Hangul code point from the output.
static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility)
{
    if (is_hangul_code_point(code_point))
        return decompose_hangul_code_point(code_point, code_points_output);

#if ENABLE_UNICODE_DATA
    auto const mapping = Unicode::code_point_decomposition(code_point);
    auto const should_expand = mapping.has_value()
        && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes);

    if (should_expand) {
        for (auto const mapped_code_point : mapping->decomposition)
            decompose_code_point(mapped_code_point, code_points_output, use_compatibility);
        return;
    }
#endif

    code_points_output.append(code_point);
}
|
||||
|
||||
// This can be any sorting algorithm that maintains order (like std::stable_sort),
|
||||
// however bubble sort is easier to implement, so go with it (for now).
|
||||
template<typename T, typename LessThan>
|
||||
void bubble_sort(Span<T> span, LessThan less_than)
|
||||
{
|
||||
for (size_t i = 0; i < span.size() - 1; ++i) {
|
||||
for (size_t j = 0; j < span.size() - 1 - i; ++j) {
|
||||
if (!less_than(span[j], span[j + 1]))
|
||||
swap(span[j], span[j + 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
|
||||
// See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591
|
||||
static void canonical_ordering_algorithm(Span<u32> code_points)
|
||||
{
|
||||
for (size_t i = 0; i < code_points.size(); ++i) {
|
||||
if (!is_starter(code_points[i])) {
|
||||
auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter);
|
||||
auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i));
|
||||
// Nothing to reorder, so continue.
|
||||
if (span_size <= 1)
|
||||
continue;
|
||||
Span<u32> const span { code_points.data() + i, span_size };
|
||||
|
||||
bubble_sort(span, [](u32 a, u32 b) {
|
||||
// Use <= to keep ordering.
|
||||
return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b);
|
||||
});
|
||||
|
||||
// Skip over span we just sorted.
|
||||
i += span_size - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard.
|
||||
static bool is_blocked(Span<u32> code_points, size_t a, size_t c)
|
||||
{
|
||||
if (a == c - 1)
|
||||
return false;
|
||||
auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]);
|
||||
auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]);
|
||||
return b_combining_class >= c_combining_class;
|
||||
}
|
||||
|
||||
// The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
|
||||
// See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628
|
||||
static void canonical_composition_algorithm(Vector<u32>& code_points)
|
||||
{
|
||||
if (code_points.size() <= 1)
|
||||
return;
|
||||
ssize_t last_starter = is_starter(code_points[0]) ? 0 : -1;
|
||||
for (size_t i = 1; i < code_points.size(); ++i) {
|
||||
auto const current_character = code_points[i];
|
||||
// R1. Seek back (left) to find the last Starter L preceding C in the character sequence
|
||||
if (last_starter == -1) {
|
||||
if (is_starter(current_character))
|
||||
last_starter = i;
|
||||
continue;
|
||||
}
|
||||
// R2. If there is such an L, and C is not blocked from L,
|
||||
// and there exists a Primary Composite P which is canonically equivalent to <L, C>,
|
||||
// then replace L by P in the sequence and delete C from the sequence.
|
||||
if (is_blocked(code_points.span(), last_starter, i)) {
|
||||
if (is_starter(current_character))
|
||||
last_starter = i;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto composite = combine_hangul_code_points(code_points[last_starter], current_character);
|
||||
|
||||
if (composite == 0)
|
||||
composite = combine_code_points(code_points[last_starter], current_character);
|
||||
|
||||
if (composite == 0) {
|
||||
if (is_starter(current_character))
|
||||
last_starter = i;
|
||||
continue;
|
||||
}
|
||||
|
||||
code_points[last_starter] = composite;
|
||||
code_points.remove(i);
|
||||
--i;
|
||||
}
|
||||
}
|
||||
|
||||
// NFD: canonical decomposition of every code point, followed by canonical ordering.
static Vector<u32> normalize_nfd(Utf8View string)
{
    Vector<u32> decomposed;

    for (auto const code_point : string)
        decompose_code_point(code_point, decomposed, UseCompatibility::No);

    canonical_ordering_algorithm(decomposed);
    return decomposed;
}
|
||||
|
||||
// NFC: the NFD form of the input, followed by canonical composition.
static Vector<u32> normalize_nfc(Utf8View string)
{
    auto code_points = normalize_nfd(string);
    canonical_composition_algorithm(code_points);
    return code_points;
}
|
||||
|
||||
// NFKD: compatibility decomposition of every code point, followed by canonical ordering.
static Vector<u32> normalize_nfkd(Utf8View string)
{
    Vector<u32> decomposed;

    for (auto const code_point : string)
        decompose_code_point(code_point, decomposed, UseCompatibility::Yes);

    canonical_ordering_algorithm(decomposed);
    return decomposed;
}
|
||||
|
||||
// NFKC: the NFKD form of the input, followed by canonical composition.
static Vector<u32> normalize_nfkc(Utf8View string)
{
    auto code_points = normalize_nfkd(string);
    canonical_composition_algorithm(code_points);
    return code_points;
}
|
||||
|
||||
// Dispatches to the normalization routine that matches `form` and returns the resulting code
// points. Reaching the end of the function means `form` held a value outside the enum.
static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form)
{
    // clang-format off
    switch (form) {
    case NormalizationForm::NFD:  return normalize_nfd(string);
    case NormalizationForm::NFC:  return normalize_nfc(string);
    case NormalizationForm::NFKD: return normalize_nfkd(string);
    case NormalizationForm::NFKC: return normalize_nfkc(string);
    }
    // clang-format on

    VERIFY_NOT_REACHED();
}
|
||||
|
||||
// Normalizes the UTF-8 text in `string` to the requested normalization form using ICU.
//
// The ICU normalizer for `form` is looked up and the result is written straight into a
// StringBuilder through an ICU byte sink. If ICU reports a failure, either while obtaining the
// normalizer or while normalizing, the input is returned unchanged.
//
// This is the only implementation in this function: the earlier hand-rolled pipeline that
// computed code points via normalize_implementation() is not run here, since its result was
// never used and it declared a second, conflicting `builder`.
String normalize(StringView string, NormalizationForm form)
{
    UErrorCode status = U_ZERO_ERROR;
    icu::Normalizer2 const* normalizer = nullptr;

    switch (form) {
    case NormalizationForm::NFD:
        normalizer = icu::Normalizer2::getNFDInstance(status);
        break;
    case NormalizationForm::NFC:
        normalizer = icu::Normalizer2::getNFCInstance(status);
        break;
    case NormalizationForm::NFKD:
        normalizer = icu::Normalizer2::getNFKDInstance(status);
        break;
    case NormalizationForm::NFKC:
        normalizer = icu::Normalizer2::getNFKCInstance(status);
        break;
    }

    if (Locale::icu_failure(status))
        return MUST(String::from_utf8(string));

    VERIFY(normalizer);

    // Reserve room for the input length up front.
    StringBuilder builder { string.length() };
    icu::StringByteSink sink { &builder };

    normalizer->normalizeUTF8(0, Locale::icu_string_piece(string), sink, nullptr, status);
    if (Locale::icu_failure(status))
        return MUST(String::from_utf8(string));

    return MUST(builder.to_string());
}
|
||||
|
|
|
@ -1,33 +1,25 @@
|
|||
/*
|
||||
* Copyright (c) 2022, mat
|
||||
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Error.h>
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/Span.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <LibUnicode/Forward.h>
|
||||
|
||||
namespace Unicode {
|
||||
|
||||
Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point);
|
||||
Optional<u32> code_point_composition(u32 first_code_point, u32 second_code_point);
|
||||
|
||||
// The Unicode normalization forms accepted by normalize().
enum class NormalizationForm {
    NFD,  // Canonical decomposition.
    NFC,  // Canonical decomposition, followed by canonical composition.
    NFKD, // Compatibility decomposition.
    NFKC, // Compatibility decomposition, followed by canonical composition.
};
|
||||
|
||||
NormalizationForm normalization_form_from_string(StringView form);
|
||||
StringView normalization_form_to_string(NormalizationForm form);
|
||||
NormalizationForm normalization_form_from_string(StringView);
|
||||
StringView normalization_form_to_string(NormalizationForm);
|
||||
|
||||
String normalize(StringView string, NormalizationForm form);
|
||||
|
||||
|
|
Loading…
Reference in a new issue