0ct0pu5
/
ladybird


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
							/*
 * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Platform.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>

#if ENABLE_UNICODE_DATA
#    include <LibUnicode/UnicodeData.h>
#else
#    include <AK/CharacterTypes.h>
#endif

// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf

namespace Unicode {

#if ENABLE_UNICODE_DATA

// Returns true when every bit of `property` is also set in the code point's property mask.
static bool has_property(UnicodeData const& unicode_data, Property property)
{
    auto const matching_bits = unicode_data.properties & property;
    return matching_bits == property;
}

// Implements the "cased" definition from Section 3.13 of the Unicode standard:
// a character C is cased if and only if C has the Lowercase or Uppercase property or has a
// General_Category value of Titlecase_Letter.
static bool is_cased_letter(UnicodeData const& unicode_data)
{
    auto const category = unicode_data.general_category;

    // Letters whose general category already marks them as lowercase, uppercase, or titlecase.
    if (category == GeneralCategory::Ll)
        return true;
    if (category == GeneralCategory::Lu)
        return true;
    if (category == GeneralCategory::Lt)
        return true;

    // Otherwise, fall back to the Other_Lowercase / Other_Uppercase properties.
    if (has_property(unicode_data, Property::OtherLowercase))
        return true;
    return has_property(unicode_data, Property::OtherUppercase);
}

// Implements the "case-ignorable" definition from Section 3.13 of the Unicode standard:
// a character C is case-ignorable if its General_Category is one of Nonspacing_Mark (Mn),
// Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk), or if it has
// the value MidLetter (ML), MidNumLet (MB), or Single_Quote (SQ) for the Word_Break property.
static bool is_case_ignorable(UnicodeData const& unicode_data)
{
    auto const category = unicode_data.general_category;
    bool const ignorable_by_category = (category == GeneralCategory::Mn)
        || (category == GeneralCategory::Me)
        || (category == GeneralCategory::Cf)
        || (category == GeneralCategory::Lm)
        || (category == GeneralCategory::Sk);
    if (ignorable_by_category)
        return true;

    auto const word_break = unicode_data.word_break_property;
    return (word_break == WordBreakProperty::MidLetter)
        || (word_break == WordBreakProperty::MidNumLet)
        || (word_break == WordBreakProperty::SingleQuote);
}

// Evaluates the Final_Sigma style condition for the code point that starts at byte offset `index`
// and spans `byte_length` bytes of `string`:
//
// C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
// characters, and C is not followed by a sequence consisting of zero or more case-ignorable
// characters and then a cased letter.
//
// Returns false if any inspected code point has no Unicode data available.
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
{
    size_t const end_of_code_point = index + byte_length;

    auto before = string.substring_view(0, index);
    auto after = (end_of_code_point < string.byte_length())
        ? string.substring_view(end_of_code_point)
        : Utf8View {};

    // Tracks whether the most recent non-case-ignorable code point before C is a cased letter.
    bool preceded_by_cased_letter = false;

    for (auto code_point : before) {
        auto data = unicode_data_for_code_point(code_point);
        if (!data.has_value())
            return false;

        // Case-ignorable code points neither establish nor break the preceding sequence.
        if (is_case_ignorable(*data))
            continue;

        preceded_by_cased_letter = is_cased_letter(*data);
    }

    if (!preceded_by_cased_letter)
        return false;

    for (auto code_point : after) {
        auto data = unicode_data_for_code_point(code_point);
        if (!data.has_value())
            return false;

        if (is_case_ignorable(*data))
            continue;

        // The first non-case-ignorable code point after C decides the outcome.
        return !is_cased_letter(*data);
    }

    // Only case-ignorable code points (or nothing at all) follow C.
    return true;
}

// Walks the special casing entries attached to `unicode_data` in order and returns the first one
// that applies to the code point located at byte offset `index` (spanning `byte_length` bytes) of
// `string`. Returns nullptr when no entry applies.
static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
{
    auto const entry_count = unicode_data.special_casing_size;

    for (size_t entry = 0; entry < entry_count; ++entry) {
        auto const* candidate = unicode_data.special_casing[entry];

        // FIXME: Handle locale.
        if (candidate->locale != Locale::None)
            continue;

        // An entry without a condition always applies.
        if (candidate->condition == Condition::None)
            return candidate;

        // Final_Sigma is the only condition evaluated here; any other condition never matches.
        if (candidate->condition == Condition::FinalSigma) {
            if (is_final_code_point(string, index, byte_length))
                return candidate;
        }
    }

    return nullptr;
}

#endif

// Maps a single code point to lowercase. With Unicode data enabled, this uses the code point's
// simple lowercase mapping and returns the code point unchanged when no data exists for it.
// Without Unicode data, this falls back to AK's ASCII-only lowercase conversion.
u32 to_unicode_lowercase(u32 code_point)
{
#if ENABLE_UNICODE_DATA
    auto data = unicode_data_for_code_point(code_point);
    return data.has_value() ? data->simple_lowercase_mapping : code_point;
#else
    return AK::to_ascii_lowercase(code_point);
#endif
}

// Maps a single code point to uppercase. With Unicode data enabled, this uses the code point's
// simple uppercase mapping and returns the code point unchanged when no data exists for it.
// Without Unicode data, this falls back to AK's ASCII-only uppercase conversion.
u32 to_unicode_uppercase(u32 code_point)
{
#if ENABLE_UNICODE_DATA
    auto data = unicode_data_for_code_point(code_point);
    return data.has_value() ? data->simple_uppercase_mapping : code_point;
#else
    return AK::to_ascii_uppercase(code_point);
#endif
}

// Lowercases `string` (interpreted as UTF-8) code point by code point.
//
// With Unicode data enabled, each code point is handled as follows:
//   - no Unicode data available: the code point is copied through unchanged;
//   - a matching special casing entry exists: its (possibly multi code point) lowercase mapping
//     is appended;
//   - otherwise: the simple lowercase mapping is appended.
//
// Without Unicode data, this defers to StringView::to_lowercase_string().
String to_unicode_lowercase_full(StringView const& string)
{
#if ENABLE_UNICODE_DATA
    Utf8View view { string };
    StringBuilder builder;

    // `index` is the byte offset of the current code point within `view`. It is advanced in the
    // loop header so that it stays in sync on every path through the body, including the special
    // casing path. Context-sensitive conditions (e.g. Final_Sigma) slice the view using this
    // offset, so letting it lag behind would make them inspect the wrong neighbors.
    size_t index = 0;
    size_t byte_length = 0;

    for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
        u32 code_point = *it;
        byte_length = it.underlying_code_point_length_in_bytes();

        auto unicode_data = unicode_data_for_code_point(code_point);
        if (!unicode_data.has_value()) {
            builder.append_code_point(code_point);
            continue;
        }

        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
        if (!special_casing) {
            builder.append_code_point(unicode_data->simple_lowercase_mapping);
            continue;
        }

        for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
            builder.append_code_point(special_casing->lowercase_mapping[i]);
    }

    return builder.build();
#else
    return string.to_lowercase_string();
#endif
}

// Uppercases `string` (interpreted as UTF-8) code point by code point.
//
// With Unicode data enabled, each code point is handled as follows:
//   - no Unicode data available: the code point is copied through unchanged;
//   - a matching special casing entry exists: its (possibly multi code point) uppercase mapping
//     is appended;
//   - otherwise: the simple uppercase mapping is appended.
//
// Without Unicode data, this defers to StringView::to_uppercase_string().
String to_unicode_uppercase_full(StringView const& string)
{
#if ENABLE_UNICODE_DATA
    Utf8View view { string };
    StringBuilder builder;

    // `index` is the byte offset of the current code point within `view`. It is advanced in the
    // loop header so that it stays in sync on every path through the body, including the special
    // casing path. Context-sensitive conditions slice the view using this offset, so letting it
    // lag behind would make them inspect the wrong neighbors.
    size_t index = 0;
    size_t byte_length = 0;

    for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
        u32 code_point = *it;
        byte_length = it.underlying_code_point_length_in_bytes();

        auto unicode_data = unicode_data_for_code_point(code_point);
        if (!unicode_data.has_value()) {
            builder.append_code_point(code_point);
            continue;
        }

        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
        if (!special_casing) {
            builder.append_code_point(unicode_data->simple_uppercase_mapping);
            continue;
        }

        for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
            builder.append_code_point(special_casing->uppercase_mapping[i]);
    }

    return builder.build();
#else
    return string.to_uppercase_string();
#endif
}

}