LibUnicode: Remove the locale-unaware text segmentation implementation

This commit is contained in:
Timothy Flynn 2024-06-19 13:36:37 -04:00 committed by Andreas Kling
parent 3974996e95
commit ab56b8c8dc
Notes: sideshowbarker 2024-07-17 20:33:50 +09:00
10 changed files with 1 additions and 808 deletions

View file

@ -43,15 +43,6 @@ set(EMOJI_DATA_PATH "${UCD_PATH}/${EMOJI_DATA_SOURCE}")
set(NORM_PROPS_SOURCE "DerivedNormalizationProps.txt")
set(NORM_PROPS_PATH "${UCD_PATH}/${NORM_PROPS_SOURCE}")
set(GRAPHEME_BREAK_PROP_SOURCE "auxiliary/GraphemeBreakProperty.txt")
set(GRAPHEME_BREAK_PROP_PATH "${UCD_PATH}/${GRAPHEME_BREAK_PROP_SOURCE}")
set(WORD_BREAK_PROP_SOURCE "auxiliary/WordBreakProperty.txt")
set(WORD_BREAK_PROP_PATH "${UCD_PATH}/${WORD_BREAK_PROP_SOURCE}")
set(SENTENCE_BREAK_PROP_SOURCE "auxiliary/SentenceBreakProperty.txt")
set(SENTENCE_BREAK_PROP_PATH "${UCD_PATH}/${SENTENCE_BREAK_PROP_SOURCE}")
string(REGEX REPLACE "([0-9]+\\.[0-9]+)\\.[0-9]+" "\\1" EMOJI_VERSION "${UCD_VERSION}")
set(EMOJI_TEST_URL "https://www.unicode.org/Public/emoji/${EMOJI_VERSION}/emoji-test.txt")
set(EMOJI_TEST_PATH "${UCD_PATH}/emoji-test.txt")
@ -76,9 +67,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPT_EXTENSIONS_SOURCE}" "${SCRIPT_EXTENSIONS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${EMOJI_DATA_SOURCE}" "${EMOJI_DATA_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${NORM_PROPS_SOURCE}" "${NORM_PROPS_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${GRAPHEME_BREAK_PROP_SOURCE}" "${GRAPHEME_BREAK_PROP_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${WORD_BREAK_PROP_SOURCE}" "${WORD_BREAK_PROP_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SENTENCE_BREAK_PROP_SOURCE}" "${SENTENCE_BREAK_PROP_PATH}")
download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}")
else()
@ -103,7 +91,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
"${UCD_VERSION_FILE}"
"${UNICODE_DATA_HEADER}"
"${UNICODE_DATA_IMPLEMENTATION}"
arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}"
)
invoke_generator(
"EmojiData"

View file

@ -98,17 +98,10 @@ struct UnicodeData {
// FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
NormalizationProps normalization_props;
PropList grapheme_break_props;
PropList word_break_props;
PropList sentence_break_props;
CodePointTables<PropertyTable> general_category_tables;
CodePointTables<PropertyTable> property_tables;
CodePointTables<PropertyTable> script_tables;
CodePointTables<PropertyTable> script_extension_tables;
CodePointTables<PropertyTable> grapheme_break_tables;
CodePointTables<PropertyTable> word_break_tables;
CodePointTables<PropertyTable> sentence_break_tables;
HashTable<ByteString> bidirectional_classes;
Vector<CodePointBidiClass> code_point_bidirectional_classes;
@ -439,9 +432,6 @@ namespace Unicode {
generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys());
generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
generator.append(R"~~~(
@ -560,9 +550,6 @@ static constexpr Array<@type@, @size@> @name@ { {
TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table));
TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table));
TRY(append_code_point_tables("s_script_extensions"sv, unicode_data.script_extension_tables, append_property_table));
TRY(append_code_point_tables("s_grapheme_break_properties"sv, unicode_data.grapheme_break_tables, append_property_table));
TRY(append_code_point_tables("s_word_break_properties"sv, unicode_data.word_break_tables, append_property_table));
TRY(append_code_point_tables("s_sentence_break_properties"sv, unicode_data.sentence_break_tables, append_property_table));
{
constexpr size_t max_bidi_classes_per_row = 20;
@ -654,10 +641,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
TRY(append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv));
TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases));
TRY(append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv));
TRY(append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv));
TRY(append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv));
TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {}));
generator.append(R"~~~(
@ -921,18 +904,12 @@ static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list));
auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list));
auto script_extension_metadata = TRY(PropertyMetadata::create(unicode_data.script_extensions));
auto grapheme_break_metadata = TRY(PropertyMetadata::create(unicode_data.grapheme_break_props));
auto word_break_metadata = TRY(PropertyMetadata::create(unicode_data.word_break_props));
auto sentence_break_metadata = TRY(PropertyMetadata::create(unicode_data.sentence_break_props));
for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) {
TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata));
TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata));
TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata));
TRY(update_property_tables(code_point, unicode_data.script_extension_tables, script_extension_metadata));
TRY(update_property_tables(code_point, unicode_data.grapheme_break_tables, grapheme_break_metadata));
TRY(update_property_tables(code_point, unicode_data.word_break_tables, word_break_metadata));
TRY(update_property_tables(code_point, unicode_data.sentence_break_tables, sentence_break_metadata));
}
return {};
@ -953,9 +930,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
StringView script_extensions_path;
StringView emoji_data_path;
StringView normalization_path;
StringView grapheme_break_path;
StringView word_break_path;
StringView sentence_break_path;
Core::ArgsParser args_parser;
args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
@ -971,9 +945,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path");
args_parser.add_option(grapheme_break_path, "Path to GraphemeBreakProperty.txt file", "grapheme-break-path", 'f', "grapheme-break-path");
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
args_parser.add_option(sentence_break_path, "Path to SentenceBreakProperty.txt file", "sentence-break-path", 'i', "sentence-break-path");
args_parser.parse(arguments);
auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
@ -989,9 +960,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read));
auto emoji_data_file = TRY(open_file(emoji_data_path, Core::File::OpenMode::Read));
auto normalization_file = TRY(open_file(normalization_path, Core::File::OpenMode::Read));
auto grapheme_break_file = TRY(open_file(grapheme_break_path, Core::File::OpenMode::Read));
auto word_break_file = TRY(open_file(word_break_path, Core::File::OpenMode::Read));
auto sentence_break_file = TRY(open_file(sentence_break_path, Core::File::OpenMode::Read));
UnicodeData unicode_data {};
TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
@ -1003,9 +971,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
TRY(parse_alias_list(*prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases));
TRY(parse_prop_list(*scripts_file, unicode_data.script_list));
TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true));
TRY(parse_prop_list(*grapheme_break_file, unicode_data.grapheme_break_props));
TRY(parse_prop_list(*word_break_file, unicode_data.word_break_props));
TRY(parse_prop_list(*sentence_break_file, unicode_data.sentence_break_props));
populate_general_category_unions(unicode_data.general_categories);
TRY(parse_unicode_data(*unicode_data_file, unicode_data));

View file

@ -1,7 +1,6 @@
set(TEST_SOURCES
TestEmoji.cpp
TestIDNA.cpp
TestSegmentation.cpp
TestUnicodeCharacterTypes.cpp
TestUnicodeNormalization.cpp
)

View file

@ -1,122 +0,0 @@
/*
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Array.h>
#include <AK/StringView.h>
#include <AK/Utf8View.h>
#include <AK/Vector.h>
#include <LibTest/TestCase.h>
#include <LibUnicode/Segmentation.h>
// Test helper: collects every grapheme boundary reported for `string` (interpreted as UTF-8) and
// expects the collected offsets to equal `expected_boundaries`, in order.
template<size_t N>
static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
{
    Utf8View utf8_view { string };
    Vector<size_t> boundaries;

    // Never stop early; the test wants the complete list of boundaries.
    auto collect_boundary = [&boundaries](auto offset) {
        boundaries.append(offset);
        return IterationDecision::Continue;
    };
    Unicode::for_each_grapheme_segmentation_boundary(utf8_view, collect_boundary);

    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
}
// Grapheme cluster segmentation over UTF-8 input. Every expected value is a byte offset into the
// string, so a multi-byte code point advances the offset by its encoded length.
TEST_CASE(grapheme_segmentation)
{
    // An empty view has no boundaries at all, so the callback must never be invoked.
    Unicode::for_each_grapheme_segmentation_boundary(Utf8View {}, [&](auto) {
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_grapheme_segmentation("a"sv, { 0u, 1u });
    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });

    // Line endings: only CR followed by LF stays together as one cluster.
    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });

    // Hangul sequences; each of these code points is 3 bytes in UTF-8.
    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });

    // A single 4-byte emoji.
    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // Emoji ZWJ sequences are spelled with explicit escapes so that the invisible code points
    // (U+200D ZERO WIDTH JOINER, U+FE0F VARIATION SELECTOR-16) cannot be lost by editors or renderers.
    // The expected offsets below only add up when those code points are present.

    // Family: man, ZWJ, woman, ZWJ, girl, ZWJ, boy = 4 + 3 + 4 + 3 + 4 + 3 + 4 = 25 bytes.
    test_grapheme_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });

    // Couple with heart: woman + skin tone, ZWJ, heart + VS16, ZWJ, man + skin tone
    // = 4 + 4 + 3 + 3 + 3 + 3 + 4 + 4 = 28 bytes.
    test_grapheme_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });
}
// Grapheme segmentation of sequences built from U+0915, U+0916, U+094D, U+09BC and U+09CD.
// All of these code points encode to 3 bytes in UTF-8, so every expected offset is a multiple of 3
// (the one exception is the case that appends an ASCII 'a').
// NOTE(review): judging by the test name these inputs target the Indic_Conjunct_Break handling
// (rule GB9c) of the grapheme segmenter - confirm against UAX #29.
TEST_CASE(grapheme_segmentation_indic_conjunct_break)
{
// Baseline: without U+094D between them, the two letters are reported as separate clusters.
test_grapheme_segmentation("\u0915"sv, { 0u, 3u });
test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u });
// From here on every input is expected to form one single cluster (only start and end boundaries),
// with U+094D placed before, between or after a varying number of U+09BC U+09CD pairs.
test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
}
// Test helper: collects every word boundary reported for `string` (interpreted as UTF-8) and
// expects the collected offsets to equal `expected_boundaries`, in order.
template<size_t N>
static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
{
    Utf8View utf8_view { string };
    Vector<size_t> boundaries;

    // Never stop early; the test wants the complete list of boundaries.
    auto collect_boundary = [&boundaries](auto offset) {
        boundaries.append(offset);
        return IterationDecision::Continue;
    };
    Unicode::for_each_word_segmentation_boundary(utf8_view, collect_boundary);

    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
}
// Word segmentation over UTF-8 input. Every expected value is a byte offset into the string, so a
// multi-byte code point advances the offset by its encoded length.
TEST_CASE(word_segmentation)
{
    // An empty view has no boundaries at all, so the callback must never be invoked.
    Unicode::for_each_word_segmentation_boundary(Utf8View {}, [&](auto) {
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_word_segmentation("a"sv, { 0u, 1u });
    test_word_segmentation("ab"sv, { 0u, 2u });
    test_word_segmentation("abc"sv, { 0u, 3u });

    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
    // Two consecutive spaces stay together (WB3d). The literal is split in two so that the double
    // space survives tools that collapse runs of whitespace; the string is 6 bytes long.
    test_word_segmentation("ab " " cd"sv, { 0u, 2u, 4u, 6u });
    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });

    // Line endings: only CR followed by LF stays together.
    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });

    // A single 4-byte emoji.
    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // Emoji ZWJ sequences are spelled with explicit escapes so that the invisible code points
    // (U+200D ZERO WIDTH JOINER, U+FE0F VARIATION SELECTOR-16) cannot be lost by editors or renderers.
    // The expected offsets below only add up when those code points are present.

    // Family: man, ZWJ, woman, ZWJ, girl, ZWJ, boy = 25 bytes.
    test_word_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });

    // Couple with heart: woman + skin tone, ZWJ, heart + VS16, ZWJ, man + skin tone = 28 bytes.
    test_word_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });

    // Numbers and interior punctuation.
    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });

    // Apostrophes inside a word, with and without surrounding quotes.
    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });

    // Sample sentence using typographic quotes: U+201C / U+201D around "brown" and U+2019 as the
    // apostrophe in "can't". Each of those is 3 bytes in UTF-8, which is what makes the quoted word
    // span 14..19 and the contraction span 28..35.
    test_word_segmentation(
        "The quick (\u201Cbrown\u201D) fox can\u2019t jump 32.3 feet, right?"sv,
        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
}

View file

@ -6,7 +6,6 @@ set(SOURCES
Emoji.cpp
IDNA.cpp
Normalize.cpp
Segmentation.cpp
String.cpp
${UNICODE_DATA_SOURCES}
)

View file

@ -88,10 +88,6 @@ Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {
bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; }
bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; }
Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class(u32) { return {}; }

View file

@ -37,10 +37,6 @@ Optional<Script> script_from_string(StringView);
bool code_point_has_script(u32 code_point, Script script);
bool code_point_has_script_extension(u32 code_point, Script script);
bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakProperty property);
bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
Optional<BidirectionalClass> bidirectional_class_from_string(StringView);
Optional<BidirectionalClass> bidirectional_class(u32 code_point);

View file

@ -13,11 +13,8 @@ namespace Unicode {
enum class BidirectionalClass : u8;
enum class EmojiGroup : u8;
enum class GeneralCategory : u8;
enum class GraphemeBreakProperty : u8;
enum class Property : u8;
enum class Script : u8;
enum class SentenceBreakProperty : u8;
enum class WordBreakProperty : u8;
struct CurrencyCode;
struct Emoji;

View file

@ -1,491 +0,0 @@
/*
* Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/Segmentation.h>
#if ENABLE_UNICODE_DATA
# include <LibUnicode/UnicodeData.h>
#endif
namespace Unicode {
// Returns the length of `view` measured in the view's own code units: bytes for Utf8View, 16-bit
// code units for Utf16View and code points for Utf32View. Any other view type fails to compile.
template<typename ViewType>
static size_t code_unit_length(ViewType const& view)
{
    constexpr bool is_utf8_view = IsSame<ViewType, Utf8View>;
    constexpr bool is_utf16_view = IsSame<ViewType, Utf16View>;
    constexpr bool is_utf32_view = IsSame<ViewType, Utf32View>;

    if constexpr (is_utf8_view) {
        return view.byte_length();
    } else if constexpr (is_utf16_view) {
        return view.length_in_code_units();
    } else if constexpr (is_utf32_view) {
        return view.length();
    } else {
        static_assert(DependentFalse<ViewType>);
    }
}
// Returns the position of iterator `it` inside `view`, expressed in the view's own code units, by
// forwarding to the offset query each view type provides. Any other view type fails to compile.
template<typename ViewType, typename CodeUnitIterator>
static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
{
    constexpr bool is_utf8_view = IsSame<ViewType, Utf8View>;
    constexpr bool is_utf16_view = IsSame<ViewType, Utf16View>;
    constexpr bool is_utf32_view = IsSame<ViewType, Utf32View>;

    if constexpr (is_utf8_view) {
        return view.byte_offset_of(it);
    } else if constexpr (is_utf16_view) {
        return view.code_unit_offset_of(it);
    } else if constexpr (is_utf32_view) {
        return view.iterator_offset(it);
    } else {
        static_assert(DependentFalse<ViewType>);
    }
}
// Walks `view` one code point at a time and invokes `callback` with the code unit offset of every
// grapheme cluster boundary it finds, applying the UAX #29 rules in the order they appear below
// (GB1 first, GB999 as the fallback).
//
// Iteration stops as soon as `callback` returns IterationDecision::Break. An empty view produces no
// callback invocations at all. When ENABLE_UNICODE_DATA is not set, the whole body is compiled out
// and `callback` is never invoked.
template<typename ViewType>
static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
{
#if ENABLE_UNICODE_DATA
using GBP = GraphemeBreakProperty;
// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
if (view.is_empty())
return;
// True if `code_point` has at least one of the listed grapheme break properties.
auto has_any_gbp = [](u32 code_point, auto&&... properties) {
return (code_point_has_grapheme_break_property(code_point, properties) || ...);
};
// Advances `it` past consecutive (InCB_Extend, InCB_Linker) pairs. `it` is left untouched at the
// first position that does not start such a pair (or at the end of the view).
auto skip_incb_extend_linker_sequence = [&](auto& it) {
while (true) {
if (it == view.end() || !code_point_has_property(*it, Property::InCB_Extend))
return;
auto next_it = it;
++next_it;
if (next_it == view.end() || !code_point_has_property(*next_it, Property::InCB_Linker))
return;
it = next_it;
++it;
}
};
// GB1
if (callback(0) == IterationDecision::Break)
return;
// A view that is a single code unit long has no interior positions to examine.
if (code_unit_length(view) > 1) {
// Within the loop, `code_point` is the code point before the candidate boundary and
// `next_code_point` (the one `it` points at) is the code point after it.
auto it = view.begin();
auto code_point = *it;
u32 next_code_point = 0;
// Updated only when control reaches the GB12/GB13 check below; earlier `continue`s skip the update.
auto current_ri_chain = 0;
for (++it; it != view.end(); ++it, code_point = next_code_point) {
next_code_point = *it;
// GB9c
if (code_point_has_property(code_point, Property::InCB_Consonant)) {
// Look ahead on a copy so that `it` only moves if the whole pattern matches.
auto it_copy = it;
skip_incb_extend_linker_sequence(it_copy);
if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) {
++it_copy;
skip_incb_extend_linker_sequence(it_copy);
if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) {
// Jump straight to the trailing consonant; no boundary is reported for anything skipped.
next_code_point = *it_copy;
it = it_copy;
continue;
}
}
}
// GB11
if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
// Look ahead on a copy: any number of Extend, then ZWJ, then another Extended_Pictographic.
auto it_copy = it;
while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend))
++it_copy;
if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) {
++it_copy;
if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) {
// Jump to the second pictograph; no boundary is reported for anything skipped.
next_code_point = *it_copy;
it = it_copy;
continue;
}
}
}
auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
// GB3
if (code_point_is_cr && next_code_point_is_lf)
continue;
// GB4, GB5
if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
continue;
}
auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
// GB6
if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
continue;
// GB7
if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
continue;
// GB8
if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
continue;
// GB9
if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
continue;
// GB9a
if (has_any_gbp(next_code_point, GBP::SpacingMark))
continue;
// GB9b
if (has_any_gbp(code_point, GBP::Prepend))
continue;
auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
// GB12, GB13
// An odd chain length means `code_point` is the first half of a pair, so the pair is kept together.
if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
continue;
// GB999
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
}
}
// GB2
// The return value is ignored here: this is the last boundary, so there is nothing left to stop.
callback(code_unit_length(view));
#endif
}
// Public entry points for grapheme cluster segmentation. Each overload hands its view and callback
// to the shared template implementation, instantiated for that view type.

// UTF-8 overload.
void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{
    for_each_grapheme_segmentation_boundary_impl<Utf8View>(view, move(callback));
}

// UTF-16 overload.
void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{
    for_each_grapheme_segmentation_boundary_impl<Utf16View>(view, move(callback));
}

// UTF-32 overload.
void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{
    for_each_grapheme_segmentation_boundary_impl<Utf32View>(view, move(callback));
}
// Walks `view` one code point at a time and invokes `callback` with the code unit offset of every
// word boundary it finds, applying the UAX #29 word rules in the order they appear below
// (WB1 first, WB999 as the fallback).
//
// Iteration stops as soon as `callback` returns IterationDecision::Break. An empty view produces no
// callback invocations at all. When ENABLE_UNICODE_DATA is not set, the whole body is compiled out
// and `callback` is never invoked.
template<typename ViewType>
static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
{
#if ENABLE_UNICODE_DATA
using WBP = WordBreakProperty;
// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
if (view.is_empty())
return;
// True if `code_point` has at least one of the listed word break properties.
auto has_any_wbp = [](u32 code_point, auto&&... properties) {
return (code_point_has_word_break_property(code_point, properties) || ...);
};
// WB1
if (callback(0) == IterationDecision::Break)
return;
// A view that is a single code unit long has no interior positions to examine.
if (code_unit_length(view) > 1) {
// Within the loop the candidate boundary sits between `code_point` and `next_code_point` (the
// one `it` points at). `previous_code_point` is the code point before `code_point` and is empty
// during the first iteration.
auto it = view.begin();
auto code_point = *it;
u32 next_code_point;
Optional<u32> previous_code_point;
// Updated only when control reaches the WB15/WB16 check below; earlier `continue`s skip the update.
auto current_ri_chain = 0;
for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
next_code_point = *it;
auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
// WB3
if (code_point_is_cr && next_code_point_is_lf)
continue;
// WB3a, WB3b
if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
continue;
}
// WB3c
if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
continue;
// WB3d
if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
continue;
// WB4
if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
continue;
// "AHLetter" in the rules below means ALetter or Hebrew_Letter.
auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
// WB5
if (code_point_is_ah_letter && next_code_point_is_ah_letter)
continue;
// One code point of look-ahead beyond `next_code_point`; stays empty at the end of the view.
// The outer check always holds here because the loop condition already guarantees it.
Optional<u32> next_next_code_point;
if (it != view.end()) {
auto it_copy = it;
++it_copy;
if (it_copy != view.end())
next_next_code_point = *it_copy;
}
bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
// WB6
if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
continue;
auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
// WB7
if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
continue;
// WB7a
if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
continue;
// WB7b
if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
continue;
// WB7c
if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
continue;
auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
// WB8
if (code_point_is_numeric && next_code_point_is_numeric)
continue;
// WB9
if (code_point_is_ah_letter && next_code_point_is_numeric)
continue;
// WB10
if (code_point_is_numeric && next_code_point_is_ah_letter)
continue;
auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
// WB11
if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
continue;
bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
// WB12
if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
continue;
auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
// WB13
if (code_point_is_katakana && next_code_point_is_katakana)
continue;
auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
// WB13a
if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
continue;
// WB13b
if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
continue;
auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
// WB15, WB16
// An odd chain length means `code_point` is the first half of a pair, so the pair is kept together.
if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
continue;
// WB999
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
}
}
// WB2
// The return value is ignored here: this is the last boundary, so there is nothing left to stop.
callback(code_unit_length(view));
#endif
}
// Public entry points for word segmentation. Each overload hands its view and callback to the
// shared template implementation, instantiated for that view type.

// UTF-8 overload.
void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{
    for_each_word_segmentation_boundary_impl<Utf8View>(view, move(callback));
}

// UTF-16 overload.
void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{
    for_each_word_segmentation_boundary_impl<Utf16View>(view, move(callback));
}

// UTF-32 overload.
void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{
    for_each_word_segmentation_boundary_impl<Utf32View>(view, move(callback));
}
// Walks `view` one code point at a time and invokes `callback` with the code unit offset of every
// sentence boundary it finds, applying the UAX #29 sentence rules in the order they appear below
// (SB1 first; SB998, "do not break", is the implicit fallback).
//
// Iteration stops as soon as `callback` returns IterationDecision::Break. An empty view produces no
// callback invocations at all. When ENABLE_UNICODE_DATA is not set, the whole body is compiled out
// and `callback` is never invoked.
template<typename ViewType>
static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
{
#if ENABLE_UNICODE_DATA
    using SBP = SentenceBreakProperty;

    // https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules
    if (view.is_empty())
        return;

    // True if `code_point` has at least one of the listed sentence break properties.
    auto has_any_sbp = [](u32 code_point, auto&&... properties) {
        return (code_point_has_sentence_break_property(code_point, properties) || ...);
    };

    // SB1
    if (callback(0) == IterationDecision::Break)
        return;

    // A view that is a single code unit long has no interior positions to examine.
    if (code_unit_length(view) > 1) {
        // Within the loop the candidate boundary sits between `code_point` and `next_code_point`
        // (the one `it` points at). `previous_code_point` is empty during the first iteration.
        auto it = view.begin();
        auto code_point = *it;
        u32 next_code_point = 0;
        Optional<u32> previous_code_point;

        // Tracks how far into a "terminator Close* Sp*" sequence `code_point` is. The enumerators
        // are ordered so that range comparisons (>= Term, <= Close) select prefixes of that sequence.
        enum class TerminatorSequenceState {
            None,
            Term,
            Close,
            Sp
        } terminator_sequence_state { TerminatorSequenceState::None };
        // Whether the terminator that started the current sequence was an ATerm (as opposed to an STerm).
        auto term_was_a_term = false;

        for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
            next_code_point = *it;

            auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
            auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);

            // SB3
            if (code_point_is_cr && next_code_point_is_lf)
                continue;

            auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);

            // SB4
            if (code_point_is_para_sep) {
                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
                    return;
                continue;
            }

            // SB5
            if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
                continue;

            auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);

            // SB6
            if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
                continue;

            // SB7
            if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
                continue;

            // Advance the terminator sequence state machine with `code_point`.
            if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
                terminator_sequence_state = TerminatorSequenceState::Term;
                term_was_a_term = code_point_is_a_term;
            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
                terminator_sequence_state = TerminatorSequenceState::Close;
            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
                terminator_sequence_state = TerminatorSequenceState::Sp;
            } else {
                terminator_sequence_state = TerminatorSequenceState::None;
            }

            // SB8
            if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
                // Scan forward over code points that may sit between the terminator sequence and a
                // following lowercase letter. The code point has to be re-read on every step and the
                // scan has to stop at the first one that is not skippable; that code point alone
                // decides whether the break is suppressed.
                bool illegal_sequence = false;
                for (auto it_copy = it; it_copy != view.end(); ++it_copy) {
                    auto sequence_code_point = *it_copy;
                    if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
                        continue;
                    illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
                    break;
                }
                if (illegal_sequence)
                    continue;
            }

            // SB8a
            if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
                continue;

            auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
            auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);

            // SB9
            if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
                continue;

            // SB10
            if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
                continue;

            // SB11
            if (terminator_sequence_state >= TerminatorSequenceState::Term)
                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
                    return;

            // SB998
        }
    }

    // SB2
    // The return value is ignored here: this is the last boundary, so there is nothing left to stop.
    callback(code_unit_length(view));
#endif
}
// Public entry points for sentence segmentation. Each overload hands its view and callback to the
// shared template implementation, instantiated for that view type.

// UTF-8 overload.
void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{
    for_each_sentence_segmentation_boundary_impl<Utf8View>(view, move(callback));
}

// UTF-16 overload.
void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{
    for_each_sentence_segmentation_boundary_impl<Utf16View>(view, move(callback));
}

// UTF-32 overload.
void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{
    for_each_sentence_segmentation_boundary_impl<Utf32View>(view, move(callback));
}
}

View file

@ -1,134 +0,0 @@
/*
* Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Forward.h>
#include <AK/Function.h>
#include <AK/IterationDecision.h>
#include <AK/Optional.h>
#include <AK/Types.h>
namespace Unicode {
// Callback type taken by the for_each_*_segmentation_boundary functions. It receives one boundary
// as a size_t offset and returns IterationDecision::Continue to keep receiving boundaries or
// IterationDecision::Break to stop.
using SegmentationCallback = Function<IterationDecision(size_t)>;
// Invoke the callback once for every grapheme cluster boundary of the given view, until the
// callback returns IterationDecision::Break. One overload per supported view type.
void for_each_grapheme_segmentation_boundary(Utf8View const&, SegmentationCallback);
void for_each_grapheme_segmentation_boundary(Utf16View const&, SegmentationCallback);
void for_each_grapheme_segmentation_boundary(Utf32View const&, SegmentationCallback);
// Returns the first grapheme cluster boundary of `view` that is strictly greater than `index`, or
// an empty Optional when no such boundary is reported. Every call segments `view` from its start.
template<typename ViewType>
Optional<size_t> next_grapheme_segmentation_boundary(ViewType const& view, size_t index)
{
    Optional<size_t> boundary_after_index;

    for_each_grapheme_segmentation_boundary(view, [&](auto candidate) {
        // Boundaries at or before `index` are of no interest; keep going.
        if (!(candidate > index))
            return IterationDecision::Continue;

        boundary_after_index = candidate;
        return IterationDecision::Break;
    });

    return boundary_after_index;
}
// Returns the last grapheme cluster boundary of `view` reported before the first one that is not
// strictly less than `index`, or an empty Optional when there is none. Every call segments `view`
// from its start.
template<typename ViewType>
Optional<size_t> previous_grapheme_segmentation_boundary(ViewType const& view, size_t index)
{
    Optional<size_t> boundary_before_index;

    for_each_grapheme_segmentation_boundary(view, [&](auto candidate) {
        // Stop at the first boundary that has reached `index`; the previously stored one is the answer.
        if (!(candidate < index))
            return IterationDecision::Break;

        boundary_before_index = candidate;
        return IterationDecision::Continue;
    });

    return boundary_before_index;
}
// Invoke the callback once for every word boundary of the given view, until the callback returns
// IterationDecision::Break. One overload per supported view type.
void for_each_word_segmentation_boundary(Utf8View const&, SegmentationCallback);
void for_each_word_segmentation_boundary(Utf16View const&, SegmentationCallback);
void for_each_word_segmentation_boundary(Utf32View const&, SegmentationCallback);
// Returns the first word boundary of `view` that is strictly greater than `index`, or an empty
// Optional when no such boundary is reported. Every call segments `view` from its start.
template<typename ViewType>
Optional<size_t> next_word_segmentation_boundary(ViewType const& view, size_t index)
{
    Optional<size_t> boundary_after_index;

    for_each_word_segmentation_boundary(view, [&](auto candidate) {
        // Boundaries at or before `index` are of no interest; keep going.
        if (!(candidate > index))
            return IterationDecision::Continue;

        boundary_after_index = candidate;
        return IterationDecision::Break;
    });

    return boundary_after_index;
}
// Returns the last word boundary of `view` reported before the first one that is not strictly less
// than `index`, or an empty Optional when there is none. Every call segments `view` from its start.
template<typename ViewType>
Optional<size_t> previous_word_segmentation_boundary(ViewType const& view, size_t index)
{
    Optional<size_t> boundary_before_index;

    for_each_word_segmentation_boundary(view, [&](auto candidate) {
        // Stop at the first boundary that has reached `index`; the previously stored one is the answer.
        if (!(candidate < index))
            return IterationDecision::Break;

        boundary_before_index = candidate;
        return IterationDecision::Continue;
    });

    return boundary_before_index;
}
// Invoke the callback once for every sentence boundary of the given view, until the callback
// returns IterationDecision::Break. One overload per supported view type.
void for_each_sentence_segmentation_boundary(Utf8View const&, SegmentationCallback);
void for_each_sentence_segmentation_boundary(Utf16View const&, SegmentationCallback);
void for_each_sentence_segmentation_boundary(Utf32View const&, SegmentationCallback);
// Returns the first sentence boundary of `view` that is strictly greater than `index`, or an empty
// Optional when no such boundary is reported. Every call segments `view` from its start.
template<typename ViewType>
Optional<size_t> next_sentence_segmentation_boundary(ViewType const& view, size_t index)
{
    Optional<size_t> boundary_after_index;

    for_each_sentence_segmentation_boundary(view, [&](auto candidate) {
        // Boundaries at or before `index` are of no interest; keep going.
        if (!(candidate > index))
            return IterationDecision::Continue;

        boundary_after_index = candidate;
        return IterationDecision::Break;
    });

    return boundary_after_index;
}
// Returns the last sentence boundary of `view` reported before the first one that is not strictly
// less than `index`, or an empty Optional when there is none. Every call segments `view` from its
// start.
template<typename ViewType>
Optional<size_t> previous_sentence_segmentation_boundary(ViewType const& view, size_t index)
{
    Optional<size_t> boundary_before_index;

    for_each_sentence_segmentation_boundary(view, [&](auto candidate) {
        // Stop at the first boundary that has reached `index`; the previously stored one is the answer.
        if (!(candidate < index))
            return IterationDecision::Break;

        boundary_before_index = candidate;
        return IterationDecision::Continue;
    });

    return boundary_before_index;
}
}