mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-22 07:30:19 +00:00
LibUnicode: Replace case transformations and comparison with ICU's
There are a couple of differences here due to using ICU:

1. Titlecasing behaves slightly differently. We previously transformed "123dollars" to "123Dollars", as we would use word segmentation to split a string into words, then transform the first cased character to titlecase. ICU doesn't go quite that far, and leaves the string as "123dollars". While this is a behavior change, the only user of this API is the `text-transform: capitalize;` CSS rule, and we now match the behavior of other browsers.

2. There isn't an API to compare strings with case insensitivity without allocating case-folded strings for both the left- and right-hand-side strings. Our implementation was previously allocation-free; however, in a benchmark, ICU is still ~1.4x faster.
This commit is contained in:
parent
a3a7a65b1c
commit
5cf818e305
Notes:
sideshowbarker
2024-07-16 23:17:55 +09:00
Author: https://github.com/trflynn89
Commit: https://github.com/LadybirdBrowser/ladybird/commit/5cf818e305
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/219
12 changed files with 111 additions and 1092 deletions
|
@ -13,12 +13,6 @@ set(UCD_ZIP_PATH "${UCD_PATH}/UCD.zip")
|
|||
set(UNICODE_DATA_SOURCE "UnicodeData.txt")
|
||||
set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}")
|
||||
|
||||
set(SPECIAL_CASING_SOURCE "SpecialCasing.txt")
|
||||
set(SPECIAL_CASING_PATH "${UCD_PATH}/${SPECIAL_CASING_SOURCE}")
|
||||
|
||||
set(CASE_FOLDING_SOURCE "CaseFolding.txt")
|
||||
set(CASE_FOLDING_PATH "${UCD_PATH}/${CASE_FOLDING_SOURCE}")
|
||||
|
||||
set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt")
|
||||
set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}")
|
||||
|
||||
|
@ -72,8 +66,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
|||
if (ENABLE_NETWORK_DOWNLOADS)
|
||||
download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SPECIAL_CASING_SOURCE}" "${SPECIAL_CASING_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${CASE_FOLDING_SOURCE}" "${CASE_FOLDING_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_LIST_SOURCE}" "${PROP_LIST_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_CORE_PROP_SOURCE}" "${DERIVED_CORE_PROP_PATH}")
|
||||
|
@ -111,7 +103,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
|||
"${UCD_VERSION_FILE}"
|
||||
"${UNICODE_DATA_HEADER}"
|
||||
"${UNICODE_DATA_IMPLEMENTATION}"
|
||||
arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -o "${CASE_FOLDING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
|
||||
arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
|
||||
)
|
||||
invoke_generator(
|
||||
"EmojiData"
|
||||
|
|
|
@ -21,24 +21,6 @@
|
|||
#include <LibCore/ArgsParser.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#SpecialCasing.txt
|
||||
struct SpecialCasing {
|
||||
u32 index { 0 };
|
||||
u32 code_point { 0 };
|
||||
Vector<u32> lowercase_mapping;
|
||||
Vector<u32> uppercase_mapping;
|
||||
Vector<u32> titlecase_mapping;
|
||||
ByteString locale;
|
||||
ByteString condition;
|
||||
};
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#CaseFolding.txt
|
||||
struct CaseFolding {
|
||||
u32 code_point { 0 };
|
||||
StringView status { "Common"sv };
|
||||
Vector<u32> mapping { 0 };
|
||||
};
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#PropList.txt
|
||||
using PropList = HashMap<ByteString, Vector<Unicode::CodePointRange>>;
|
||||
|
||||
|
@ -57,25 +39,6 @@ struct Normalization {
|
|||
|
||||
using NormalizationProps = HashMap<ByteString, Vector<Normalization>>;
|
||||
|
||||
struct CasingTable {
|
||||
bool operator==(CasingTable const& other) const
|
||||
{
|
||||
return canonical_combining_class == other.canonical_combining_class
|
||||
&& simple_lowercase_mapping == other.simple_lowercase_mapping
|
||||
&& simple_uppercase_mapping == other.simple_uppercase_mapping
|
||||
&& simple_titlecase_mapping == other.simple_titlecase_mapping
|
||||
&& special_casing_indices == other.special_casing_indices
|
||||
&& case_folding_indices == other.case_folding_indices;
|
||||
}
|
||||
|
||||
u8 canonical_combining_class { 0 };
|
||||
Optional<u32> simple_uppercase_mapping;
|
||||
Optional<u32> simple_lowercase_mapping;
|
||||
Optional<u32> simple_titlecase_mapping;
|
||||
Vector<u32> special_casing_indices;
|
||||
Vector<u32> case_folding_indices;
|
||||
};
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#UnicodeData.txt
|
||||
struct CodePointData {
|
||||
u32 code_point { 0 };
|
||||
|
@ -87,7 +50,6 @@ struct CodePointData {
|
|||
bool bidi_mirrored { false };
|
||||
ByteString unicode_1_name;
|
||||
ByteString iso_comment;
|
||||
CasingTable casing;
|
||||
};
|
||||
|
||||
using PropertyTable = Vector<bool>;
|
||||
|
@ -111,15 +73,6 @@ struct CodePointBidiClass {
|
|||
};
|
||||
|
||||
struct UnicodeData {
|
||||
Vector<SpecialCasing> special_casing;
|
||||
u32 largest_special_casing_mapping_size { 0 };
|
||||
Vector<ByteString> conditions;
|
||||
Vector<ByteString> locales;
|
||||
|
||||
Vector<CaseFolding> case_folding;
|
||||
u32 largest_case_folding_mapping_size { 0 };
|
||||
Vector<StringView> statuses;
|
||||
|
||||
Vector<CodePointData> code_point_data;
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#General_Category_Values
|
||||
|
@ -149,7 +102,6 @@ struct UnicodeData {
|
|||
PropList word_break_props;
|
||||
PropList sentence_break_props;
|
||||
|
||||
CodePointTables<CasingTable> casing_tables;
|
||||
CodePointTables<PropertyTable> general_category_tables;
|
||||
CodePointTables<PropertyTable> property_tables;
|
||||
CodePointTables<PropertyTable> script_tables;
|
||||
|
@ -180,125 +132,6 @@ static ByteString sanitize_entry(ByteString const& entry)
|
|||
return builder.to_byte_string();
|
||||
}
|
||||
|
||||
static ErrorOr<void> parse_special_casing(Core::InputBufferedFile& file, UnicodeData& unicode_data)
|
||||
{
|
||||
Array<u8, 1024> buffer;
|
||||
|
||||
while (TRY(file.can_read_line())) {
|
||||
auto line = TRY(file.read_line(buffer));
|
||||
|
||||
if (line.is_empty() || line.starts_with('#'))
|
||||
continue;
|
||||
|
||||
if (auto index = line.find('#'); index.has_value())
|
||||
line = line.substring_view(0, *index);
|
||||
|
||||
auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
|
||||
VERIFY(segments.size() == 5 || segments.size() == 6);
|
||||
|
||||
SpecialCasing casing {};
|
||||
casing.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
|
||||
casing.lowercase_mapping = parse_code_point_list(segments[1]);
|
||||
casing.titlecase_mapping = parse_code_point_list(segments[2]);
|
||||
casing.uppercase_mapping = parse_code_point_list(segments[3]);
|
||||
|
||||
if (auto condition = segments[4].trim_whitespace(); !condition.is_empty()) {
|
||||
auto conditions = condition.split_view(' ', SplitBehavior::KeepEmpty);
|
||||
VERIFY(conditions.size() == 1 || conditions.size() == 2);
|
||||
|
||||
if (conditions.size() == 2) {
|
||||
casing.locale = conditions[0];
|
||||
casing.condition = conditions[1];
|
||||
} else if (all_of(conditions[0], is_ascii_lower_alpha)) {
|
||||
casing.locale = conditions[0];
|
||||
} else {
|
||||
casing.condition = conditions[0];
|
||||
}
|
||||
|
||||
if (!casing.locale.is_empty()) {
|
||||
casing.locale = ByteString::formatted("{:c}{}", to_ascii_uppercase(casing.locale[0]), casing.locale.substring_view(1));
|
||||
|
||||
if (!unicode_data.locales.contains_slow(casing.locale))
|
||||
unicode_data.locales.append(casing.locale);
|
||||
}
|
||||
|
||||
casing.condition = casing.condition.replace("_"sv, ""sv, ReplaceMode::All);
|
||||
|
||||
if (!casing.condition.is_empty() && !unicode_data.conditions.contains_slow(casing.condition))
|
||||
unicode_data.conditions.append(casing.condition);
|
||||
}
|
||||
|
||||
unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, casing.lowercase_mapping.size());
|
||||
unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, casing.titlecase_mapping.size());
|
||||
unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, casing.uppercase_mapping.size());
|
||||
|
||||
unicode_data.special_casing.append(move(casing));
|
||||
}
|
||||
|
||||
quick_sort(unicode_data.special_casing, [](auto const& lhs, auto const& rhs) {
|
||||
if (lhs.code_point != rhs.code_point)
|
||||
return lhs.code_point < rhs.code_point;
|
||||
if (lhs.locale.is_empty() && !rhs.locale.is_empty())
|
||||
return false;
|
||||
if (!lhs.locale.is_empty() && rhs.locale.is_empty())
|
||||
return true;
|
||||
return lhs.locale < rhs.locale;
|
||||
});
|
||||
|
||||
for (u32 i = 0; i < unicode_data.special_casing.size(); ++i)
|
||||
unicode_data.special_casing[i].index = i;
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
static ErrorOr<void> parse_case_folding(Core::InputBufferedFile& file, UnicodeData& unicode_data)
|
||||
{
|
||||
Array<u8, 1024> buffer;
|
||||
|
||||
while (TRY(file.can_read_line())) {
|
||||
auto line = TRY(file.read_line(buffer));
|
||||
if (line.is_empty() || line.starts_with('#'))
|
||||
continue;
|
||||
|
||||
auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
|
||||
VERIFY(segments.size() == 4);
|
||||
|
||||
CaseFolding folding {};
|
||||
folding.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
|
||||
folding.mapping = parse_code_point_list(segments[2]);
|
||||
|
||||
switch (segments[1].trim_whitespace()[0]) {
|
||||
case 'C':
|
||||
folding.status = "Common"sv;
|
||||
break;
|
||||
case 'F':
|
||||
folding.status = "Full"sv;
|
||||
break;
|
||||
case 'S':
|
||||
folding.status = "Simple"sv;
|
||||
break;
|
||||
case 'T':
|
||||
folding.status = "Special"sv;
|
||||
break;
|
||||
}
|
||||
|
||||
unicode_data.largest_case_folding_mapping_size = max(unicode_data.largest_case_folding_mapping_size, folding.mapping.size());
|
||||
|
||||
if (!unicode_data.statuses.contains_slow(folding.status))
|
||||
unicode_data.statuses.append(folding.status);
|
||||
|
||||
unicode_data.case_folding.append(move(folding));
|
||||
}
|
||||
|
||||
quick_sort(unicode_data.case_folding, [](auto const& lhs, auto const& rhs) {
|
||||
if (lhs.code_point != rhs.code_point)
|
||||
return lhs.code_point < rhs.code_point;
|
||||
return lhs.status < rhs.status;
|
||||
});
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
static ErrorOr<void> parse_prop_list(Core::InputBufferedFile& file, PropList& prop_list, bool multi_value_property = false, bool sanitize_property = false)
|
||||
{
|
||||
Array<u8, 1024> buffer;
|
||||
|
@ -503,7 +336,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
|
|||
CodePointData data {};
|
||||
data.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
|
||||
data.name = segments[1];
|
||||
data.casing.canonical_combining_class = AK::StringUtils::convert_to_uint<u8>(segments[3]).value();
|
||||
data.bidi_class = segments[4];
|
||||
data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]);
|
||||
data.numeric_value_digit = AK::StringUtils::convert_to_int<i8>(segments[7]);
|
||||
|
@ -511,9 +343,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
|
|||
data.bidi_mirrored = segments[9] == "Y"sv;
|
||||
data.unicode_1_name = segments[10];
|
||||
data.iso_comment = segments[11];
|
||||
data.casing.simple_uppercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[12]);
|
||||
data.casing.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
|
||||
data.casing.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
|
||||
|
||||
if (!assigned_code_point_range_start.has_value())
|
||||
assigned_code_point_range_start = data.code_point;
|
||||
|
@ -547,16 +376,6 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
|
|||
}
|
||||
}
|
||||
|
||||
for (auto const& casing : unicode_data.special_casing) {
|
||||
if (casing.code_point == data.code_point)
|
||||
data.casing.special_casing_indices.append(casing.index);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) {
|
||||
if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point)
|
||||
data.casing.case_folding_indices.append(i);
|
||||
}
|
||||
|
||||
unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep);
|
||||
|
||||
previous_code_point = data.code_point;
|
||||
|
@ -570,8 +389,6 @@ static ErrorOr<void> generate_unicode_data_header(Core::InputBufferedFile& file,
|
|||
{
|
||||
StringBuilder builder;
|
||||
SourceGenerator generator { builder };
|
||||
generator.set("special_casing_mapping_size", ByteString::number(unicode_data.largest_special_casing_mapping_size));
|
||||
generator.set("case_folding_mapping_size", ByteString::number(unicode_data.largest_case_folding_mapping_size));
|
||||
|
||||
auto generate_enum = [&](StringView name, StringView default_, auto values, Vector<Alias> aliases = {}) {
|
||||
quick_sort(values);
|
||||
|
@ -619,9 +436,6 @@ enum class @name@ : @underlying@ {)~~~");
|
|||
namespace Unicode {
|
||||
)~~~");
|
||||
|
||||
generate_enum("Locale"sv, "None"sv, unicode_data.locales);
|
||||
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
|
||||
generate_enum("CaseFoldingStatus"sv, {}, move(unicode_data.statuses));
|
||||
generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
|
||||
generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
|
||||
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
|
||||
|
@ -631,35 +445,6 @@ namespace Unicode {
|
|||
generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
|
||||
|
||||
generator.append(R"~~~(
|
||||
struct SpecialCasing {
|
||||
u32 code_point { 0 };
|
||||
|
||||
u32 lowercase_mapping[@special_casing_mapping_size@];
|
||||
u32 lowercase_mapping_size { 0 };
|
||||
|
||||
u32 uppercase_mapping[@special_casing_mapping_size@];
|
||||
u32 uppercase_mapping_size { 0 };
|
||||
|
||||
u32 titlecase_mapping[@special_casing_mapping_size@];
|
||||
u32 titlecase_mapping_size { 0 };
|
||||
|
||||
Locale locale { Locale::None };
|
||||
Condition condition { Condition::None };
|
||||
};
|
||||
|
||||
struct CaseFolding {
|
||||
u32 code_point { 0 };
|
||||
CaseFoldingStatus status { CaseFoldingStatus::Common };
|
||||
|
||||
u32 mapping[@case_folding_mapping_size@];
|
||||
u32 mapping_size { 0 };
|
||||
};
|
||||
|
||||
Optional<Locale> locale_from_string(StringView locale);
|
||||
|
||||
ReadonlySpan<SpecialCasing> special_case_mapping(u32 code_point);
|
||||
ReadonlySpan<CaseFolding> case_folding_mapping(u32 code_point);
|
||||
|
||||
}
|
||||
)~~~");
|
||||
|
||||
|
@ -672,9 +457,6 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::InputBufferedFil
|
|||
StringBuilder builder;
|
||||
SourceGenerator generator { builder };
|
||||
|
||||
generator.set("special_casing_size", ByteString::number(unicode_data.special_casing.size()));
|
||||
generator.set("case_folding_size", ByteString::number(unicode_data.case_folding.size()));
|
||||
|
||||
generator.set("CODE_POINT_TABLES_LSB_COUNT", TRY(String::number(CODE_POINT_TABLES_LSB_COUNT)));
|
||||
generator.set("CODE_POINT_TABLES_LSB_MASK", TRY(String::formatted("{:#x}", CODE_POINT_TABLES_LSB_MASK)));
|
||||
|
||||
|
@ -693,83 +475,7 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::InputBufferedFil
|
|||
namespace Unicode {
|
||||
)~~~");
|
||||
|
||||
auto append_list_and_size = [&](auto const& list, StringView format) {
|
||||
if (list.is_empty()) {
|
||||
generator.append(", {}, 0");
|
||||
return;
|
||||
}
|
||||
|
||||
bool first = true;
|
||||
generator.append(", {");
|
||||
for (auto const& item : list) {
|
||||
generator.append(first ? " "sv : ", "sv);
|
||||
generator.append(ByteString::formatted(format, item));
|
||||
first = false;
|
||||
}
|
||||
generator.append(ByteString::formatted(" }}, {}", list.size()));
|
||||
};
|
||||
|
||||
generator.append(R"~~~(
|
||||
static constexpr Array<SpecialCasing, @special_casing_size@> s_special_case { {)~~~");
|
||||
|
||||
for (auto const& casing : unicode_data.special_casing) {
|
||||
generator.set("code_point", ByteString::formatted("{:#x}", casing.code_point));
|
||||
generator.append(R"~~~(
|
||||
{ @code_point@)~~~");
|
||||
|
||||
constexpr auto format = "{:#x}"sv;
|
||||
append_list_and_size(casing.lowercase_mapping, format);
|
||||
append_list_and_size(casing.uppercase_mapping, format);
|
||||
append_list_and_size(casing.titlecase_mapping, format);
|
||||
|
||||
generator.set("locale", casing.locale.is_empty() ? "None" : casing.locale);
|
||||
generator.append(", Locale::@locale@");
|
||||
|
||||
generator.set("condition", casing.condition.is_empty() ? "None" : casing.condition);
|
||||
generator.append(", Condition::@condition@");
|
||||
|
||||
generator.append(" },");
|
||||
}
|
||||
|
||||
generator.append(R"~~~(
|
||||
} };
|
||||
|
||||
static constexpr Array<CaseFolding, @case_folding_size@> s_case_folding { {)~~~");
|
||||
|
||||
for (auto const& folding : unicode_data.case_folding) {
|
||||
generator.set("code_point", ByteString::formatted("{:#x}", folding.code_point));
|
||||
generator.set("status", folding.status);
|
||||
generator.append(R"~~~(
|
||||
{ @code_point@, CaseFoldingStatus::@status@)~~~");
|
||||
|
||||
append_list_and_size(folding.mapping, "{:#x}"sv);
|
||||
generator.append(" },");
|
||||
}
|
||||
|
||||
generator.append(R"~~~(
|
||||
} };
|
||||
|
||||
struct CasingTable {
|
||||
u8 canonical_combining_class { 0 };
|
||||
i32 simple_uppercase_mapping { -1 };
|
||||
i32 simple_lowercase_mapping { -1 };
|
||||
i32 simple_titlecase_mapping { -1 };
|
||||
|
||||
u32 special_casing_start_index { 0 };
|
||||
u32 special_casing_size { 0 };
|
||||
|
||||
u32 case_folding_start_index { 0 };
|
||||
u32 case_folding_size { 0 };
|
||||
};
|
||||
|
||||
template<typename MappingType>
|
||||
struct CodePointComparator {
|
||||
constexpr int operator()(u32 code_point, MappingType const& mapping)
|
||||
{
|
||||
return code_point - mapping.code_point;
|
||||
}
|
||||
};
|
||||
|
||||
struct BidiClassData {
|
||||
CodePointRange code_point_range {};
|
||||
BidirectionalClass bidi_class {};
|
||||
|
@ -784,45 +490,6 @@ struct CodePointBidiClassComparator : public CodePointRangeComparator {
|
|||
|
||||
)~~~");
|
||||
|
||||
auto append_casing_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr<void> {
|
||||
generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)));
|
||||
generator.set("size", TRY(String::number(unique_properties.size())));
|
||||
|
||||
auto optional_code_point_to_string = [](auto const& code_point) -> ErrorOr<String> {
|
||||
if (!code_point.has_value())
|
||||
return "-1"_string;
|
||||
return String::number(*code_point);
|
||||
};
|
||||
auto first_index_to_string = [](auto const& list) -> ErrorOr<String> {
|
||||
if (list.is_empty())
|
||||
return "0"_string;
|
||||
return String::number(list.first());
|
||||
};
|
||||
|
||||
generator.append(R"~~~(
|
||||
static constexpr Array<CasingTable, @size@> @name@ { {)~~~");
|
||||
|
||||
for (auto const& casing : unique_properties) {
|
||||
generator.set("canonical_combining_class", TRY(String::number(casing.canonical_combining_class)));
|
||||
generator.set("simple_uppercase_mapping", TRY(optional_code_point_to_string(casing.simple_uppercase_mapping)));
|
||||
generator.set("simple_lowercase_mapping", TRY(optional_code_point_to_string(casing.simple_lowercase_mapping)));
|
||||
generator.set("simple_titlecase_mapping", TRY(optional_code_point_to_string(casing.simple_titlecase_mapping)));
|
||||
generator.set("special_casing_start_index", TRY(first_index_to_string(casing.special_casing_indices)));
|
||||
generator.set("special_casing_size", TRY(String::number(casing.special_casing_indices.size())));
|
||||
generator.set("case_folding_start_index", TRY(first_index_to_string(casing.case_folding_indices)));
|
||||
generator.set("case_folding_size", TRY(String::number(casing.case_folding_indices.size())));
|
||||
|
||||
generator.append(R"~~~(
|
||||
{ @canonical_combining_class@, @simple_uppercase_mapping@, @simple_lowercase_mapping@, @simple_titlecase_mapping@, @special_casing_start_index@, @special_casing_size@, @case_folding_start_index@, @case_folding_size@ },)~~~");
|
||||
}
|
||||
|
||||
generator.append(R"~~~(
|
||||
} };
|
||||
)~~~");
|
||||
|
||||
return {};
|
||||
};
|
||||
|
||||
auto append_property_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr<void> {
|
||||
generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)));
|
||||
generator.set("outer_size", TRY(String::number(unique_properties.size())));
|
||||
|
@ -889,7 +556,6 @@ static constexpr Array<@type@, @size@> @name@ { {
|
|||
return {};
|
||||
};
|
||||
|
||||
TRY(append_code_point_tables("s_casings"sv, unicode_data.casing_tables, append_casing_table));
|
||||
TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables, append_property_table));
|
||||
TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table));
|
||||
TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table));
|
||||
|
@ -926,64 +592,6 @@ static constexpr Array<BidiClassData, @size@> s_bidirectional_classes { {
|
|||
}
|
||||
|
||||
generator.append(R"~~~(
|
||||
static CasingTable const& casing_table_for_code_point(u32 code_point)
|
||||
{
|
||||
auto stage1_index = code_point >> @CODE_POINT_TABLES_LSB_COUNT@;
|
||||
auto stage2_index = s_casings_stage1[stage1_index] + (code_point & @CODE_POINT_TABLES_LSB_MASK@);
|
||||
auto unique_properties_index = s_casings_stage2[stage2_index];
|
||||
|
||||
return s_casings_unique_properties[unique_properties_index];
|
||||
}
|
||||
)~~~");
|
||||
|
||||
auto append_code_point_mapping_search = [&](StringView method, StringView mapping, Optional<StringView> const& fallback = {}) {
|
||||
generator.set("method", method);
|
||||
generator.set("mapping", mapping);
|
||||
generator.append(R"~~~(
|
||||
u32 @method@(u32 code_point)
|
||||
{
|
||||
auto const& casing_table = casing_table_for_code_point(code_point);
|
||||
auto mapping = casing_table.@mapping@;
|
||||
)~~~");
|
||||
|
||||
if (fallback.has_value()) {
|
||||
generator.set("fallback", *fallback);
|
||||
generator.append(R"~~~(
|
||||
return mapping == -1 ? @fallback@ : static_cast<u32>(mapping);)~~~");
|
||||
} else {
|
||||
generator.append(R"~~~(
|
||||
return mapping;)~~~");
|
||||
}
|
||||
|
||||
generator.append(R"~~~(
|
||||
}
|
||||
)~~~");
|
||||
};
|
||||
|
||||
append_code_point_mapping_search("canonical_combining_class"sv, "canonical_combining_class"sv);
|
||||
append_code_point_mapping_search("to_unicode_uppercase"sv, "simple_uppercase_mapping"sv, "code_point"sv);
|
||||
append_code_point_mapping_search("to_unicode_lowercase"sv, "simple_lowercase_mapping"sv, "code_point"sv);
|
||||
append_code_point_mapping_search("to_unicode_titlecase"sv, "simple_titlecase_mapping"sv, "code_point"sv);
|
||||
|
||||
generator.append(R"~~~(
|
||||
ReadonlySpan<SpecialCasing> special_case_mapping(u32 code_point)
|
||||
{
|
||||
auto const& casing_table = casing_table_for_code_point(code_point);
|
||||
if (casing_table.special_casing_size == 0)
|
||||
return {};
|
||||
|
||||
return s_special_case.span().slice(casing_table.special_casing_start_index, casing_table.special_casing_size);
|
||||
}
|
||||
|
||||
ReadonlySpan<CaseFolding> case_folding_mapping(u32 code_point)
|
||||
{
|
||||
auto const& casing_table = casing_table_for_code_point(code_point);
|
||||
if (casing_table.case_folding_size == 0)
|
||||
return {};
|
||||
|
||||
return s_case_folding.span().slice(casing_table.case_folding_start_index, casing_table.case_folding_size);
|
||||
}
|
||||
|
||||
Optional<BidirectionalClass> bidirectional_class(u32 code_point)
|
||||
{
|
||||
if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {}))
|
||||
|
@ -1036,8 +644,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
|
|||
return {};
|
||||
};
|
||||
|
||||
TRY(append_from_string("Locale"sv, "locale"sv, unicode_data.locales, {}));
|
||||
|
||||
TRY(append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv));
|
||||
TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases));
|
||||
|
||||
|
@ -1188,22 +794,6 @@ static ErrorOr<void> normalize_script_extensions(PropList& script_extensions, Pr
|
|||
return {};
|
||||
}
|
||||
|
||||
struct CasingMetadata {
|
||||
using ConstIterator = typename Vector<CodePointData>::ConstIterator;
|
||||
|
||||
CasingMetadata(Vector<CodePointData> const& code_point_data)
|
||||
: iterator(code_point_data.begin())
|
||||
, end(code_point_data.end())
|
||||
{
|
||||
}
|
||||
|
||||
ConstIterator iterator;
|
||||
ConstIterator const end;
|
||||
|
||||
Vector<size_t> current_block;
|
||||
HashMap<decltype(current_block), size_t> unique_blocks;
|
||||
};
|
||||
|
||||
struct PropertyMetadata {
|
||||
static ErrorOr<PropertyMetadata> create(PropList& property_list)
|
||||
{
|
||||
|
@ -1301,25 +891,6 @@ static ErrorOr<void> update_tables(u32 code_point, CodePointTables<T>& tables, a
|
|||
|
||||
static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
|
||||
{
|
||||
auto update_casing_tables = [&]<typename T>(u32 code_point, CodePointTables<T>& tables, CasingMetadata& metadata) -> ErrorOr<void> {
|
||||
CasingTable casing {};
|
||||
|
||||
while (metadata.iterator != metadata.end) {
|
||||
if (code_point < metadata.iterator->code_point)
|
||||
break;
|
||||
|
||||
if (code_point == metadata.iterator->code_point) {
|
||||
casing = move(metadata.iterator->casing);
|
||||
break;
|
||||
}
|
||||
|
||||
++metadata.iterator;
|
||||
}
|
||||
|
||||
TRY(update_tables(code_point, tables, metadata, casing));
|
||||
return {};
|
||||
};
|
||||
|
||||
auto update_property_tables = [&]<typename T>(u32 code_point, CodePointTables<T>& tables, PropertyMetadata& metadata) -> ErrorOr<void> {
|
||||
static Unicode::CodePointRangeComparator comparator {};
|
||||
|
||||
|
@ -1346,7 +917,6 @@ static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
|
|||
return {};
|
||||
};
|
||||
|
||||
CasingMetadata casing_metadata { unicode_data.code_point_data };
|
||||
auto general_category_metadata = TRY(PropertyMetadata::create(unicode_data.general_categories));
|
||||
auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list));
|
||||
auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list));
|
||||
|
@ -1356,7 +926,6 @@ static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
|
|||
auto sentence_break_metadata = TRY(PropertyMetadata::create(unicode_data.sentence_break_props));
|
||||
|
||||
for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) {
|
||||
TRY(update_casing_tables(code_point, unicode_data.casing_tables, casing_metadata));
|
||||
TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata));
|
||||
TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata));
|
||||
TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata));
|
||||
|
@ -1374,8 +943,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
StringView generated_header_path;
|
||||
StringView generated_implementation_path;
|
||||
StringView unicode_data_path;
|
||||
StringView special_casing_path;
|
||||
StringView case_folding_path;
|
||||
StringView derived_general_category_path;
|
||||
StringView prop_list_path;
|
||||
StringView derived_core_prop_path;
|
||||
|
@ -1394,8 +961,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
|
||||
args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
|
||||
args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
|
||||
args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
|
||||
args_parser.add_option(case_folding_path, "Path to CaseFolding.txt file", "case-folding-path", 'o', "case-folding-path");
|
||||
args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path");
|
||||
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
|
||||
args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
|
||||
|
@ -1415,8 +980,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
|
||||
auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read));
|
||||
auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::File::OpenMode::Read));
|
||||
auto special_casing_file = TRY(open_file(special_casing_path, Core::File::OpenMode::Read));
|
||||
auto case_folding_file = TRY(open_file(case_folding_path, Core::File::OpenMode::Read));
|
||||
auto prop_list_file = TRY(open_file(prop_list_path, Core::File::OpenMode::Read));
|
||||
auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::File::OpenMode::Read));
|
||||
auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::File::OpenMode::Read));
|
||||
|
@ -1431,8 +994,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
auto sentence_break_file = TRY(open_file(sentence_break_path, Core::File::OpenMode::Read));
|
||||
|
||||
UnicodeData unicode_data {};
|
||||
TRY(parse_special_casing(*special_casing_file, unicode_data));
|
||||
TRY(parse_case_folding(*case_folding_file, unicode_data));
|
||||
TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
|
||||
TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list));
|
||||
TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list));
|
||||
|
|
|
@ -592,7 +592,6 @@ TEST_CASE(to_titlecase)
|
|||
EXPECT_EQ(MUST("foo bar baz"_string.to_titlecase()), "Foo Bar Baz"sv);
|
||||
EXPECT_EQ(MUST("foo \n \r bar \t baz"_string.to_titlecase()), "Foo \n \r Bar \t Baz"sv);
|
||||
EXPECT_EQ(MUST("f\"oo\" b'ar'"_string.to_titlecase()), "F\"Oo\" B'ar'"sv);
|
||||
EXPECT_EQ(MUST("123dollars"_string.to_titlecase()), "123Dollars"sv);
|
||||
}
|
||||
|
||||
TEST_CASE(to_casefold)
|
||||
|
|
|
@ -8,82 +8,6 @@
|
|||
|
||||
#include <AK/StringView.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <ctype.h>
|
||||
|
||||
static void compare_to_ascii(auto& old_function, auto& new_function)
|
||||
{
|
||||
i64 result1 = 0;
|
||||
i64 result2 = 0;
|
||||
|
||||
for (u32 i = 0; i < 0x80; ++i) {
|
||||
EXPECT_EQ(result1 = old_function(i), result2 = new_function(i));
|
||||
if (result1 != result2)
|
||||
dbgln("Function input value was {}.", i);
|
||||
}
|
||||
}
|
||||
|
||||
// Verifies Unicode::to_unicode_lowercase(): ASCII parity with tolower(), a Greek pair, and
// code points that UnicodeData.txt only lists as ranges (expected to map to themselves).
TEST_CASE(to_unicode_lowercase)
|
||||
{
|
||||
// Every code point below 0x80 must produce the same result as the C library's tolower().
compare_to_ascii(tolower, Unicode::to_unicode_lowercase);
|
||||
|
||||
EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω"
|
||||
EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω"
|
||||
|
||||
// Code points encoded by ranges in UnicodeData.txt
|
||||
EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u);
|
||||
EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u);
|
||||
EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 0x3402u);
|
||||
EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu);
|
||||
}
|
||||
|
||||
// Verifies Unicode::to_unicode_uppercase(): ASCII parity with toupper(), a Greek pair,
// range-encoded code points (expected to map to themselves), and the digraphs whose
// uppercase form differs from their titlecase form.
TEST_CASE(to_unicode_uppercase)
|
||||
{
|
||||
// Every code point below 0x80 must produce the same result as the C library's toupper().
compare_to_ascii(toupper, Unicode::to_unicode_uppercase);
|
||||
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω"
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
|
||||
|
||||
// Code points encoded by ranges in UnicodeData.txt
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u);
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u);
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
|
||||
|
||||
// Code points whose uppercase and titlecase mappings actually differ.
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c6u), 0x01c4u); // "dž" to "DŽ"
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c9u), 0x01c7u); // "lj" to "LJ"
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01ccu), 0x01cau); // "nj" to "NJ"
|
||||
EXPECT_EQ(Unicode::to_unicode_uppercase(0x01f3u), 0x01f1u); // "dz" to "DZ"
|
||||
}
|
||||
|
||||
// Verifies Unicode::to_unicode_titlecase(): ASCII parity with toupper(), a Greek pair,
// range-encoded code points (expected to map to themselves), and the digraphs whose
// titlecase form differs from their uppercase form.
TEST_CASE(to_unicode_titlecase)
|
||||
{
|
||||
// For ASCII, titlecase is expected to coincide with the C library's toupper().
compare_to_ascii(toupper, Unicode::to_unicode_titlecase);
|
||||
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x03c9u), 0x03a9u); // "ω" to "Ω"
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
|
||||
|
||||
// Code points encoded by ranges in UnicodeData.txt
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x3400u), 0x3400u);
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x3401u), 0x3401u);
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x3402u), 0x3402u);
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x4dbfu), 0x4dbfu);
|
||||
|
||||
// Code points whose uppercase and titlecase mappings actually differ.
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c6u), 0x01c5u); // "dž" to "Dž"
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj"
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj"
|
||||
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz"
|
||||
}
|
||||
|
||||
// Benchmark: repeatedly re-runs the three single-code-point casing test bodies.
// The __test_* functions are the ones emitted by the TEST_CASE() macros in this file.
BENCHMARK_CASE(casing)
{
    constexpr size_t iterations = 50'000;

    for (size_t remaining = iterations; remaining > 0; --remaining) {
        __test_to_unicode_lowercase();
        __test_to_unicode_uppercase();
        __test_to_unicode_titlecase();
    }
}
|
||||
|
||||
TEST_CASE(general_category)
|
||||
{
|
||||
|
|
|
@ -713,10 +713,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
|
|||
|
||||
bool equal;
|
||||
if (input.regex_options & AllFlags::Insensitive) {
|
||||
if (input.view.unicode())
|
||||
equal = Unicode::equals_ignoring_case(Utf32View { &input_view, 1 }, Utf32View { &ch1, 1 });
|
||||
else
|
||||
if (input.view.unicode()) {
|
||||
auto lhs = String::from_code_point(input_view);
|
||||
auto rhs = String::from_code_point(ch1);
|
||||
|
||||
equal = lhs.equals_ignoring_case(rhs);
|
||||
} else {
|
||||
equal = to_ascii_lowercase(input_view) == to_ascii_lowercase(ch1);
|
||||
}
|
||||
} else {
|
||||
equal = input_view == ch1;
|
||||
}
|
||||
|
|
|
@ -8,7 +8,6 @@ set(SOURCES
|
|||
Normalize.cpp
|
||||
Segmentation.cpp
|
||||
String.cpp
|
||||
UnicodeUtils.cpp
|
||||
${UNICODE_DATA_SOURCES}
|
||||
)
|
||||
set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED})
|
||||
|
|
|
@ -1,19 +1,10 @@
|
|||
/*
|
||||
* Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
|
||||
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Platform.h>
|
||||
#include <AK/ScopeGuard.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <LibUnicode/UnicodeUtils.h>
|
||||
|
||||
#if ENABLE_UNICODE_DATA
|
||||
# include <LibUnicode/UnicodeData.h>
|
||||
|
@ -21,129 +12,6 @@
|
|||
|
||||
namespace Unicode {
|
||||
|
||||
// Weak default definition: returns 0 for every code point.
// NOTE(review): presumably overridden by a strong definition from the generated Unicode data — confirm.
u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; }
|
||||
|
||||
// Weak default definition: only performs the ASCII lowercase mapping.
// NOTE(review): presumably overridden by a strong definition from the generated Unicode data — confirm.
u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point)
{
    auto const ascii_lowercased = to_ascii_lowercase(code_point);
    return ascii_lowercased;
}
|
||||
|
||||
// Weak default definition: only performs the ASCII uppercase mapping.
// NOTE(review): presumably overridden by a strong definition from the generated Unicode data — confirm.
u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point)
{
    auto const ascii_uppercased = to_ascii_uppercase(code_point);
    return ascii_uppercased;
}
|
||||
|
||||
// Weak default definition: titlecase falls back to the ASCII uppercase mapping.
// NOTE(review): presumably overridden by a strong definition from the generated Unicode data — confirm.
u32 __attribute__((weak)) to_unicode_titlecase(u32 code_point)
{
    auto const ascii_uppercased = to_ascii_uppercase(code_point);
    return ascii_uppercased;
}
|
||||
|
||||
template<typename ViewType>
|
||||
class CasefoldStringComparator {
|
||||
public:
|
||||
explicit CasefoldStringComparator(ViewType string)
|
||||
: m_string(string)
|
||||
, m_it(m_string.begin())
|
||||
{
|
||||
}
|
||||
|
||||
bool has_more_data() const
|
||||
{
|
||||
return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
|
||||
}
|
||||
|
||||
size_t index() const
|
||||
{
|
||||
if constexpr (IsSame<ViewType, Utf8View>)
|
||||
return m_string.byte_offset_of(m_it);
|
||||
else if constexpr (IsSame<ViewType, Utf16View>)
|
||||
return m_string.code_unit_offset_of(m_it);
|
||||
else if constexpr (IsSame<ViewType, Utf32View>)
|
||||
return m_string.iterator_offset(m_it);
|
||||
else
|
||||
static_assert(DependentFalse<ViewType>);
|
||||
}
|
||||
|
||||
u32 next_code_point()
|
||||
{
|
||||
VERIFY(has_more_data());
|
||||
|
||||
if (m_casefolded_code_points.is_empty()) {
|
||||
m_current_code_point = *m_it;
|
||||
++m_it;
|
||||
|
||||
m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
|
||||
VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
|
||||
}
|
||||
|
||||
auto code_point = m_casefolded_code_points[0];
|
||||
m_casefolded_code_points = m_casefolded_code_points.substring_view(1);
|
||||
|
||||
return code_point;
|
||||
}
|
||||
|
||||
private:
|
||||
ViewType m_string;
|
||||
typename ViewType::Iterator m_it;
|
||||
|
||||
u32 m_current_code_point { 0 };
|
||||
Utf32View m_casefolded_code_points;
|
||||
};
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
|
||||
template<typename ViewType>
|
||||
bool equals_ignoring_case(ViewType lhs, ViewType rhs)
|
||||
{
|
||||
// A string X is a caseless match for a string Y if and only if:
|
||||
// toCasefold(X) = toCasefold(Y)
|
||||
|
||||
CasefoldStringComparator lhs_comparator { lhs };
|
||||
CasefoldStringComparator rhs_comparator { rhs };
|
||||
|
||||
while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
|
||||
if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point())
|
||||
return false;
|
||||
}
|
||||
|
||||
return !lhs_comparator.has_more_data() && !rhs_comparator.has_more_data();
|
||||
}
|
||||
|
||||
template bool equals_ignoring_case(Utf8View, Utf8View);
|
||||
template bool equals_ignoring_case(Utf16View, Utf16View);
|
||||
template bool equals_ignoring_case(Utf32View, Utf32View);
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> find_ignoring_case(ViewType lhs, ViewType rhs)
|
||||
{
|
||||
CasefoldStringComparator lhs_comparator { lhs };
|
||||
|
||||
while (lhs_comparator.has_more_data()) {
|
||||
CasefoldStringComparator rhs_comparator { rhs };
|
||||
|
||||
auto saved_state = lhs_comparator;
|
||||
auto matches = true;
|
||||
|
||||
while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
|
||||
if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point()) {
|
||||
matches = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (matches && !rhs_comparator.has_more_data())
|
||||
return saved_state.index();
|
||||
|
||||
lhs_comparator = move(saved_state);
|
||||
lhs_comparator.next_code_point();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
template Optional<size_t> find_ignoring_case(Utf8View, Utf8View);
|
||||
template Optional<size_t> find_ignoring_case(Utf16View, Utf16View);
|
||||
template Optional<size_t> find_ignoring_case(Utf32View, Utf32View);
|
||||
|
||||
// Weak default definition: knows no general categories, so every lookup yields an empty Optional.
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
|
||||
// Weak default definition: reports false for every code point / category combination.
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
|
||||
// Weak default definition: knows no properties, so every lookup yields an empty Optional.
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
|
||||
|
|
|
@ -1,18 +1,15 @@
|
|||
/*
|
||||
* Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
|
||||
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/ByteString.h>
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/Span.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibUnicode/Forward.h>
|
||||
|
||||
namespace Unicode {
|
||||
|
@ -29,20 +26,6 @@ struct CodePointRangeComparator {
|
|||
}
|
||||
};
|
||||
|
||||
u32 canonical_combining_class(u32 code_point);
|
||||
|
||||
// Note: The single code point case conversions only perform simple case folding.
|
||||
// Use the full-string transformations for full case folding.
|
||||
u32 to_unicode_lowercase(u32 code_point);
|
||||
u32 to_unicode_uppercase(u32 code_point);
|
||||
u32 to_unicode_titlecase(u32 code_point);
|
||||
|
||||
template<typename ViewType>
|
||||
bool equals_ignoring_case(ViewType, ViewType);
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> find_ignoring_case(ViewType, ViewType);
|
||||
|
||||
Optional<GeneralCategory> general_category_from_string(StringView);
|
||||
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2021-2022, Tim Flynn <trflynn89@serenityos.org>
|
||||
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
@ -11,7 +11,6 @@
|
|||
namespace Unicode {
|
||||
|
||||
enum class BidirectionalClass : u8;
|
||||
enum class Block : u16;
|
||||
enum class EmojiGroup : u8;
|
||||
enum class GeneralCategory : u8;
|
||||
enum class GraphemeBreakProperty : u8;
|
||||
|
@ -20,9 +19,7 @@ enum class Script : u8;
|
|||
enum class SentenceBreakProperty : u8;
|
||||
enum class WordBreakProperty : u8;
|
||||
|
||||
struct CodePointDecomposition;
|
||||
struct CurrencyCode;
|
||||
struct Emoji;
|
||||
struct SpecialCasing;
|
||||
|
||||
}
|
||||
|
|
|
@ -1,57 +1,141 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
|
||||
* Copyright (c) 2023-2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#define AK_DONT_REPLACE_STD
|
||||
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <LibUnicode/UnicodeUtils.h>
|
||||
#include <LibLocale/ICU.h>
|
||||
|
||||
#include <unicode/bytestream.h>
|
||||
#include <unicode/casemap.h>
|
||||
#include <unicode/stringoptions.h>
|
||||
|
||||
// This file contains definitions of AK::String methods which require UCD data.
|
||||
|
||||
namespace AK {
|
||||
|
||||
struct ResolvedLocale {
|
||||
ByteString buffer;
|
||||
char const* locale { nullptr };
|
||||
};
|
||||
|
||||
// Converts an optional locale into the C string form expected by icu::CaseMap.
//
// When no locale is given, the root locale ("") is selected explicitly. ICU documents a NULL
// locale as "use the default locale", which would make locale-less case conversions depend on
// the process environment (e.g. a Turkish default locale maps "I" to dotless "ı").
//
// The returned object owns the copied locale text; `locale` points into `buffer`, so the
// result must be kept alive for as long as the pointer is in use.
static ResolvedLocale resolve_locale(Optional<StringView> const& locale)
{
    ResolvedLocale resolved_locale;

    if (!locale.has_value()) {
        // String literal: static storage, no buffer needed.
        resolved_locale.locale = "";
        return resolved_locale;
    }

    resolved_locale.buffer = *locale;
    resolved_locale.locale = resolved_locale.buffer.characters();

    return resolved_locale;
}
|
||||
|
||||
// Returns a lowercased copy of this string using ICU's full (locale-aware) case mapping.
//
// locale: optional locale tag forwarded to ICU; see resolve_locale() for how it is resolved.
// Returns an error if ICU reports a failure.
ErrorOr<String> String::to_lowercase(Optional<StringView> const& locale) const
{
    UErrorCode status = U_ZERO_ERROR;

    // ICU writes its UTF-8 output straight into the builder; the input's byte length is passed
    // as the builder's initial size.
    StringBuilder builder { bytes_as_string_view().length() };
    icu::StringByteSink sink { &builder };

    // Must stay alive across the ICU call: `locale` points into the resolved object's buffer.
    auto resolved_locale = resolve_locale(locale);

    icu::CaseMap::utf8ToLower(resolved_locale.locale, 0, Locale::icu_string_piece(*this), sink, nullptr, status);
    if (Locale::icu_failure(status))
        return Error::from_string_literal("Unable to convert string to lowercase");

    return builder.to_string_without_validation();
}
|
||||
|
||||
// Returns an uppercased copy of this string using ICU's full (locale-aware) case mapping.
//
// locale: optional locale tag forwarded to ICU; see resolve_locale() for how it is resolved.
// Returns an error if ICU reports a failure.
ErrorOr<String> String::to_uppercase(Optional<StringView> const& locale) const
{
    UErrorCode status = U_ZERO_ERROR;

    // ICU writes its UTF-8 output straight into the builder; the input's byte length is passed
    // as the builder's initial size.
    StringBuilder builder { bytes_as_string_view().length() };
    icu::StringByteSink sink { &builder };

    // Must stay alive across the ICU call: `locale` points into the resolved object's buffer.
    auto resolved_locale = resolve_locale(locale);

    icu::CaseMap::utf8ToUpper(resolved_locale.locale, 0, Locale::icu_string_piece(*this), sink, nullptr, status);
    if (Locale::icu_failure(status))
        return Error::from_string_literal("Unable to convert string to uppercase");

    return builder.to_string_without_validation();
}
|
||||
|
||||
// Returns a titlecased copy of this string using ICU's case mapping.
//
// locale: optional locale tag forwarded to ICU; see resolve_locale() for how it is resolved.
// trailing_code_point_transformation: PreserveExisting sets U_TITLECASE_NO_LOWERCASE so ICU
//     leaves the code points after each titlecased one untouched; otherwise ICU's default
//     options (0) are used.
// Returns an error if ICU reports a failure.
ErrorOr<String> String::to_titlecase(Optional<StringView> const& locale, TrailingCodePointTransformation trailing_code_point_transformation) const
{
    UErrorCode status = U_ZERO_ERROR;

    // ICU writes its UTF-8 output straight into the builder; the input's byte length is passed
    // as the builder's initial size.
    StringBuilder builder { bytes_as_string_view().length() };
    icu::StringByteSink sink { &builder };

    // Must stay alive across the ICU call: `locale` points into the resolved object's buffer.
    auto resolved_locale = resolve_locale(locale);

    u32 options = 0;
    if (trailing_code_point_transformation == TrailingCodePointTransformation::PreserveExisting)
        options |= U_TITLECASE_NO_LOWERCASE;

    // A null break iterator is passed, leaving segmentation to ICU.
    icu::CaseMap::utf8ToTitle(resolved_locale.locale, options, nullptr, Locale::icu_string_piece(*this), sink, nullptr, status);
    if (Locale::icu_failure(status))
        return Error::from_string_literal("Unable to convert string to titlecase");

    return builder.to_string_without_validation();
}
|
||||
|
||||
// Appends the case-folded form of `string` to `builder` using ICU (default folding options).
// Returns an error if ICU reports a failure.
static ErrorOr<void> build_casefold_string(StringView string, StringBuilder& builder)
{
    icu::StringByteSink sink { &builder };
    UErrorCode status = U_ZERO_ERROR;

    icu::CaseMap::utf8Fold(0, Locale::icu_string_piece(string), sink, nullptr, status);

    if (!Locale::icu_failure(status))
        return {};

    return Error::from_string_literal("Unable to casefold string");
}
|
||||
|
||||
// Returns the case-folded form of this string (suitable for caseless comparison).
// Returns an error if the underlying ICU folding fails.
ErrorOr<String> String::to_casefold() const
{
    // The input's byte length is passed as the builder's initial size.
    StringBuilder builder { bytes_as_string_view().length() };
    TRY(build_casefold_string(*this, builder));

    return builder.to_string_without_validation();
}
|
||||
|
||||
// Caseless equality: true if both strings have identical case-folded forms.
// If either string fails to fold, the strings are reported as not equal.
bool String::equals_ignoring_case(String const& other) const
{
    StringBuilder lhs_builder { bytes_as_string_view().length() };
    if (build_casefold_string(*this, lhs_builder).is_error())
        return false;

    StringBuilder rhs_builder { other.bytes_as_string_view().length() };
    if (build_casefold_string(other, rhs_builder).is_error())
        return false;

    return lhs_builder.string_view() == rhs_builder.string_view();
}
|
||||
|
||||
// Caseless substring search starting at `from_byte_offset`.
//
// Returns the offset of the first caseless occurrence of `needle`, or an empty Optional when
// there is no match, the remaining haystack is empty, or case folding fails.
Optional<size_t> String::find_byte_offset_ignoring_case(StringView needle, size_t from_byte_offset) const
{
    auto haystack = bytes_as_string_view().substring_view(from_byte_offset);
    if (haystack.is_empty())
        return {};

    StringBuilder lhs_builder { haystack.length() };
    if (build_casefold_string(haystack, lhs_builder).is_error())
        return {};

    StringBuilder rhs_builder { needle.length() };
    // This must be an empty Optional: `return false` would implicitly convert to size_t 0 and
    // report a bogus match at offset 0.
    if (build_casefold_string(needle, rhs_builder).is_error())
        return {};

    // NOTE(review): `index` is an offset into the case-folded haystack. Folding can change the
    // encoded length of a code point, so for such inputs this may not equal the byte offset in
    // the original string — confirm whether callers rely on exact offsets.
    if (auto index = lhs_builder.string_view().find(rhs_builder.string_view()); index.has_value())
        return *index + from_byte_offset;

    return {};
|
||||
|
|
|
@ -1,368 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Platform.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Types.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <LibUnicode/Segmentation.h>
|
||||
#include <LibUnicode/UnicodeUtils.h>
|
||||
|
||||
#if ENABLE_UNICODE_DATA
|
||||
# include <LibUnicode/UnicodeData.h>
|
||||
#endif
|
||||
|
||||
// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
|
||||
|
||||
namespace Unicode::Detail {
|
||||
|
||||
#if ENABLE_UNICODE_DATA
|
||||
|
||||
// Context check used for Condition::AfterI, for the code point C at byte offset `index`:
// There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
static bool is_after_uppercase_i(Utf8View const& string, size_t index)
{
    bool uppercase_i_is_pending = false;

    // FIXME: Would be better if Utf8View supported reverse iteration.
    // Until then, walk everything before C and track whether the most recent 'I' is still
    // unbroken by a code point of combining class 0 or 230.
    for (auto code_point : string.substring_view(0, index)) {
        if (code_point == 'I') {
            uppercase_i_is_pending = true;
        } else if (auto combining_class = canonical_combining_class(code_point); combining_class == 0 || combining_class == 230) {
            uppercase_i_is_pending = false;
        }
    }

    return uppercase_i_is_pending;
}
|
||||
|
||||
// Context check used for Condition::AfterSoftDotted, for the code point C at byte offset `index`:
// There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
{
    bool soft_dotted_is_pending = false;

    // FIXME: Would be better if Utf8View supported reverse iteration.
    // Until then, walk everything before C and track whether the most recent Soft_Dotted code
    // point is still unbroken by a code point of combining class 0 or 230.
    for (auto code_point : string.substring_view(0, index)) {
        if (code_point_has_property(code_point, Property::Soft_Dotted)) {
            soft_dotted_is_pending = true;
        } else if (auto combining_class = canonical_combining_class(code_point); combining_class == 0 || combining_class == 230) {
            soft_dotted_is_pending = false;
        }
    }

    return soft_dotted_is_pending;
}
|
||||
|
||||
// Context check used for Condition::FinalSigma, for the code point C occupying `byte_length`
// bytes at byte offset `index`:
// C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
// characters, and C is not followed by a sequence consisting of zero or more case-ignorable
// characters and then a cased letter.
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
{
    // Before C: the last code point that is not case-ignorable decides whether C is preceded
    // by a cased letter.
    bool preceded_by_cased_letter = false;

    for (auto code_point : string.substring_view(0, index)) {
        if (code_point_has_property(code_point, Property::Case_Ignorable))
            continue;

        preceded_by_cased_letter = code_point_has_property(code_point, Property::Cased);
    }

    if (!preceded_by_cased_letter)
        return false;

    // After C: nothing follows, so no cased letter can follow.
    auto const following_offset = index + byte_length;
    if (following_offset >= string.byte_length())
        return true;

    // The first code point that is not case-ignorable decides the outcome.
    for (auto code_point : string.substring_view(following_offset)) {
        if (code_point_has_property(code_point, Property::Case_Ignorable))
            continue;

        return !code_point_has_property(code_point, Property::Cased);
    }

    return true;
}
|
||||
|
||||
// Context check used for Condition::MoreAbove, for the code point C occupying `byte_length`
// bytes at byte offset `index`:
// C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
{
    auto const following_offset = index + byte_length;
    if (following_offset >= string.byte_length())
        return false;

    for (auto code_point : string.substring_view(following_offset)) {
        switch (canonical_combining_class(code_point)) {
        case 0:
            return false;
        case 230:
            return true;
        default:
            break;
        }
    }

    return false;
}
|
||||
|
||||
// Context check used (negated) for Condition::NotBeforeDot, for the code point C occupying
// `byte_length` bytes at byte offset `index`:
// C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
// intervene between the current character and the combining dot above.
static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
{
    constexpr u32 combining_dot_above = 0x307;

    auto const following_offset = index + byte_length;
    if (following_offset >= string.byte_length())
        return false;

    for (auto code_point : string.substring_view(following_offset)) {
        // Checked before the combining class, which would otherwise end the scan.
        if (code_point == combining_dot_above)
            return true;

        auto const combining_class = canonical_combining_class(code_point);
        if (combining_class == 0 || combining_class == 230)
            return false;
    }

    return false;
}
|
||||
|
||||
// Finds the first special casing entry for `code_point` that applies at its position in `string`.
//
// code_point:  the code point C being mapped.
// string:      the full string C is part of (needed for the context conditions).
// locale:      optional locale name; an absent or unrecognized locale is treated as Locale::None.
// index:       byte offset of C within `string`.
// byte_length: encoded length of C in bytes.
//
// An entry applies when it is locale-independent or matches the requested locale, and its
// context condition holds. Returns an empty Optional when no entry applies.
static Optional<SpecialCasing const&> find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
{
    auto requested_locale = Locale::None;

    if (locale.has_value()) {
        auto parsed_locale = locale_from_string(*locale);
        if (parsed_locale.has_value())
            requested_locale = parsed_locale.value();
    }

    auto condition_is_met = [&](auto condition) {
        switch (condition) {
        case Condition::None:
            return true;
        case Condition::AfterI:
            return is_after_uppercase_i(string, index);
        case Condition::AfterSoftDotted:
            return is_after_soft_dotted_code_point(string, index);
        case Condition::FinalSigma:
            return is_final_code_point(string, index, byte_length);
        case Condition::MoreAbove:
            return is_followed_by_combining_class_above(string, index, byte_length);
        case Condition::NotBeforeDot:
            return !is_followed_by_combining_dot_above(string, index, byte_length);
        }

        // Any condition not listed above never applies.
        return false;
    };

    auto special_casings = special_case_mapping(code_point);

    for (auto const& special_casing : special_casings) {
        bool const locale_applies = special_casing.locale == Locale::None || special_casing.locale == requested_locale;

        if (locale_applies && condition_is_met(special_casing.condition))
            return special_casing;
    }

    return {};
}
|
||||
|
||||
template<CaseFoldingStatus... StatusFilter>
|
||||
static Optional<CaseFolding const&> find_matching_case_folding(u32 code_point)
|
||||
{
|
||||
auto case_foldings = case_folding_mapping(code_point);
|
||||
|
||||
for (auto const& case_folding : case_foldings) {
|
||||
if (((case_folding.status == StatusFilter) || ...))
|
||||
return case_folding;
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
|
||||
ErrorOr<void> build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
size_t index = 0;
|
||||
size_t byte_length = 0;
|
||||
|
||||
for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
|
||||
u32 code_point = *it;
|
||||
byte_length = it.underlying_code_point_length_in_bytes();
|
||||
|
||||
auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
|
||||
if (!special_casing.has_value()) {
|
||||
TRY(builder.try_append_code_point(to_unicode_lowercase(code_point)));
|
||||
continue;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
|
||||
TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i]));
|
||||
}
|
||||
|
||||
return {};
|
||||
#else
|
||||
return Error::from_string_literal("Unicode data has been disabled");
|
||||
#endif
|
||||
}
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
|
||||
ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
size_t index = 0;
|
||||
size_t byte_length = 0;
|
||||
|
||||
for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
|
||||
u32 code_point = *it;
|
||||
byte_length = it.underlying_code_point_length_in_bytes();
|
||||
|
||||
auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
|
||||
if (!special_casing.has_value()) {
|
||||
TRY(builder.try_append_code_point(to_unicode_uppercase(code_point)));
|
||||
continue;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
|
||||
TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i]));
|
||||
}
|
||||
|
||||
return {};
|
||||
#else
|
||||
return Error::from_string_literal("Unicode data has been disabled");
|
||||
#endif
|
||||
}
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
|
||||
ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale, [[maybe_unused]] TrailingCodePointTransformation trailing_code_point_transformation)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29,
|
||||
// “Unicode Text Segmentation.” For each word boundary, find the first cased character F following
|
||||
// the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between
|
||||
// F and the following word boundary to Lowercase_Mapping(C).
|
||||
|
||||
auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> {
|
||||
auto it = code_points.iterator_at_byte_offset_without_validation(boundary);
|
||||
auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary);
|
||||
|
||||
for (; it != end; ++it) {
|
||||
if (code_point_has_property(*it, Property::Cased))
|
||||
return it;
|
||||
}
|
||||
|
||||
return {};
|
||||
};
|
||||
|
||||
auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> {
|
||||
auto special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length);
|
||||
if (!special_casing.has_value()) {
|
||||
TRY(builder.try_append_code_point(to_unicode_titlecase(code_point)));
|
||||
return {};
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i)
|
||||
TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i]));
|
||||
return {};
|
||||
};
|
||||
|
||||
size_t boundary = 0;
|
||||
|
||||
while (true) {
|
||||
auto next_boundary = next_word_segmentation_boundary(code_points, boundary);
|
||||
if (!next_boundary.has_value())
|
||||
break;
|
||||
|
||||
if (auto it = first_cased_code_point_after_boundary(boundary, *next_boundary); it.has_value()) {
|
||||
auto code_point = *it.value();
|
||||
auto code_point_offset = code_points.byte_offset_of(*it);
|
||||
auto code_point_length = it->underlying_code_point_length_in_bytes();
|
||||
|
||||
auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary);
|
||||
TRY(builder.try_append(caseless_code_points.as_string()));
|
||||
|
||||
TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length));
|
||||
boundary = code_point_offset + code_point_length;
|
||||
}
|
||||
|
||||
auto remaining_code_points = code_points.substring_view(boundary, *next_boundary - boundary);
|
||||
switch (trailing_code_point_transformation) {
|
||||
case TrailingCodePointTransformation::Lowercase:
|
||||
TRY(build_lowercase_string(remaining_code_points, builder, locale));
|
||||
break;
|
||||
case TrailingCodePointTransformation::PreserveExisting:
|
||||
TRY(builder.try_append(remaining_code_points.as_string()));
|
||||
break;
|
||||
}
|
||||
|
||||
boundary = *next_boundary;
|
||||
}
|
||||
|
||||
return {};
|
||||
#else
|
||||
return Error::from_string_literal("Unicode data has been disabled");
|
||||
#endif
|
||||
}
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
|
||||
ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder)
|
||||
{
|
||||
// toCasefold(X): Map each character C in X to Case_Folding(C).
|
||||
for (auto code_point : code_points) {
|
||||
auto case_folding = casefold_code_point(code_point);
|
||||
TRY(builder.try_append(case_folding));
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#CaseFolding.txt
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
|
||||
Utf32View casefold_code_point(u32 const& code_point)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
// Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file
|
||||
// CaseFolding.txt in the Unicode Character Database.
|
||||
using enum CaseFoldingStatus;
|
||||
|
||||
if (auto case_folding = find_matching_case_folding<Common, Full>(code_point); case_folding.has_value())
|
||||
return Utf32View { case_folding->mapping, case_folding->mapping_size };
|
||||
#endif
|
||||
|
||||
// The case foldings are omitted in the data file if they are the same as the code point itself.
|
||||
return Utf32View { &code_point, 1 };
|
||||
}
|
||||
|
||||
}
|
|
@ -1,24 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Error.h>
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/Forward.h>
|
||||
|
||||
namespace Unicode::Detail {
|
||||
|
||||
ErrorOr<void> build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
|
||||
ErrorOr<void> build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
|
||||
ErrorOr<void> build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale, TrailingCodePointTransformation trailing_code_point_transformation);
|
||||
ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder);
|
||||
Utf32View casefold_code_point(u32 const& code_point);
|
||||
|
||||
}
|
Loading…
Reference in a new issue