mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-21 23:20:20 +00:00
LibUnicode: Remove the locale-unaware text segmentation implementation
This commit is contained in:
parent
3974996e95
commit
ab56b8c8dc
Notes:
sideshowbarker
2024-07-17 20:33:50 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/ab56b8c8dc Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/218
10 changed files with 1 additions and 808 deletions
|
@ -43,15 +43,6 @@ set(EMOJI_DATA_PATH "${UCD_PATH}/${EMOJI_DATA_SOURCE}")
|
|||
set(NORM_PROPS_SOURCE "DerivedNormalizationProps.txt")
|
||||
set(NORM_PROPS_PATH "${UCD_PATH}/${NORM_PROPS_SOURCE}")
|
||||
|
||||
set(GRAPHEME_BREAK_PROP_SOURCE "auxiliary/GraphemeBreakProperty.txt")
|
||||
set(GRAPHEME_BREAK_PROP_PATH "${UCD_PATH}/${GRAPHEME_BREAK_PROP_SOURCE}")
|
||||
|
||||
set(WORD_BREAK_PROP_SOURCE "auxiliary/WordBreakProperty.txt")
|
||||
set(WORD_BREAK_PROP_PATH "${UCD_PATH}/${WORD_BREAK_PROP_SOURCE}")
|
||||
|
||||
set(SENTENCE_BREAK_PROP_SOURCE "auxiliary/SentenceBreakProperty.txt")
|
||||
set(SENTENCE_BREAK_PROP_PATH "${UCD_PATH}/${SENTENCE_BREAK_PROP_SOURCE}")
|
||||
|
||||
string(REGEX REPLACE "([0-9]+\\.[0-9]+)\\.[0-9]+" "\\1" EMOJI_VERSION "${UCD_VERSION}")
|
||||
set(EMOJI_TEST_URL "https://www.unicode.org/Public/emoji/${EMOJI_VERSION}/emoji-test.txt")
|
||||
set(EMOJI_TEST_PATH "${UCD_PATH}/emoji-test.txt")
|
||||
|
@ -76,9 +67,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
|||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPT_EXTENSIONS_SOURCE}" "${SCRIPT_EXTENSIONS_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${EMOJI_DATA_SOURCE}" "${EMOJI_DATA_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${NORM_PROPS_SOURCE}" "${NORM_PROPS_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${GRAPHEME_BREAK_PROP_SOURCE}" "${GRAPHEME_BREAK_PROP_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${WORD_BREAK_PROP_SOURCE}" "${WORD_BREAK_PROP_PATH}")
|
||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SENTENCE_BREAK_PROP_SOURCE}" "${SENTENCE_BREAK_PROP_PATH}")
|
||||
|
||||
download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}")
|
||||
else()
|
||||
|
@ -103,7 +91,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
|||
"${UCD_VERSION_FILE}"
|
||||
"${UNICODE_DATA_HEADER}"
|
||||
"${UNICODE_DATA_IMPLEMENTATION}"
|
||||
arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
|
||||
arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}"
|
||||
)
|
||||
invoke_generator(
|
||||
"EmojiData"
|
||||
|
|
|
@ -98,17 +98,10 @@ struct UnicodeData {
|
|||
// FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
|
||||
NormalizationProps normalization_props;
|
||||
|
||||
PropList grapheme_break_props;
|
||||
PropList word_break_props;
|
||||
PropList sentence_break_props;
|
||||
|
||||
CodePointTables<PropertyTable> general_category_tables;
|
||||
CodePointTables<PropertyTable> property_tables;
|
||||
CodePointTables<PropertyTable> script_tables;
|
||||
CodePointTables<PropertyTable> script_extension_tables;
|
||||
CodePointTables<PropertyTable> grapheme_break_tables;
|
||||
CodePointTables<PropertyTable> word_break_tables;
|
||||
CodePointTables<PropertyTable> sentence_break_tables;
|
||||
|
||||
HashTable<ByteString> bidirectional_classes;
|
||||
Vector<CodePointBidiClass> code_point_bidirectional_classes;
|
||||
|
@ -439,9 +432,6 @@ namespace Unicode {
|
|||
generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
|
||||
generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
|
||||
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
|
||||
generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys());
|
||||
generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
|
||||
generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
|
||||
generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
|
||||
|
||||
generator.append(R"~~~(
|
||||
|
@ -560,9 +550,6 @@ static constexpr Array<@type@, @size@> @name@ { {
|
|||
TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table));
|
||||
TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table));
|
||||
TRY(append_code_point_tables("s_script_extensions"sv, unicode_data.script_extension_tables, append_property_table));
|
||||
TRY(append_code_point_tables("s_grapheme_break_properties"sv, unicode_data.grapheme_break_tables, append_property_table));
|
||||
TRY(append_code_point_tables("s_word_break_properties"sv, unicode_data.word_break_tables, append_property_table));
|
||||
TRY(append_code_point_tables("s_sentence_break_properties"sv, unicode_data.sentence_break_tables, append_property_table));
|
||||
|
||||
{
|
||||
constexpr size_t max_bidi_classes_per_row = 20;
|
||||
|
@ -654,10 +641,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
|
|||
TRY(append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv));
|
||||
TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases));
|
||||
|
||||
TRY(append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv));
|
||||
TRY(append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv));
|
||||
TRY(append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv));
|
||||
|
||||
TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {}));
|
||||
|
||||
generator.append(R"~~~(
|
||||
|
@ -921,18 +904,12 @@ static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data)
|
|||
auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list));
|
||||
auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list));
|
||||
auto script_extension_metadata = TRY(PropertyMetadata::create(unicode_data.script_extensions));
|
||||
auto grapheme_break_metadata = TRY(PropertyMetadata::create(unicode_data.grapheme_break_props));
|
||||
auto word_break_metadata = TRY(PropertyMetadata::create(unicode_data.word_break_props));
|
||||
auto sentence_break_metadata = TRY(PropertyMetadata::create(unicode_data.sentence_break_props));
|
||||
|
||||
for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) {
|
||||
TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata));
|
||||
TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata));
|
||||
TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata));
|
||||
TRY(update_property_tables(code_point, unicode_data.script_extension_tables, script_extension_metadata));
|
||||
TRY(update_property_tables(code_point, unicode_data.grapheme_break_tables, grapheme_break_metadata));
|
||||
TRY(update_property_tables(code_point, unicode_data.word_break_tables, word_break_metadata));
|
||||
TRY(update_property_tables(code_point, unicode_data.sentence_break_tables, sentence_break_metadata));
|
||||
}
|
||||
|
||||
return {};
|
||||
|
@ -953,9 +930,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
StringView script_extensions_path;
|
||||
StringView emoji_data_path;
|
||||
StringView normalization_path;
|
||||
StringView grapheme_break_path;
|
||||
StringView word_break_path;
|
||||
StringView sentence_break_path;
|
||||
|
||||
Core::ArgsParser args_parser;
|
||||
args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
|
||||
|
@ -971,9 +945,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
|
||||
args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
|
||||
args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path");
|
||||
args_parser.add_option(grapheme_break_path, "Path to GraphemeBreakProperty.txt file", "grapheme-break-path", 'f', "grapheme-break-path");
|
||||
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
|
||||
args_parser.add_option(sentence_break_path, "Path to SentenceBreakProperty.txt file", "sentence-break-path", 'i', "sentence-break-path");
|
||||
args_parser.parse(arguments);
|
||||
|
||||
auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
|
||||
|
@ -989,9 +960,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read));
|
||||
auto emoji_data_file = TRY(open_file(emoji_data_path, Core::File::OpenMode::Read));
|
||||
auto normalization_file = TRY(open_file(normalization_path, Core::File::OpenMode::Read));
|
||||
auto grapheme_break_file = TRY(open_file(grapheme_break_path, Core::File::OpenMode::Read));
|
||||
auto word_break_file = TRY(open_file(word_break_path, Core::File::OpenMode::Read));
|
||||
auto sentence_break_file = TRY(open_file(sentence_break_path, Core::File::OpenMode::Read));
|
||||
|
||||
UnicodeData unicode_data {};
|
||||
TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
|
||||
|
@ -1003,9 +971,6 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
TRY(parse_alias_list(*prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases));
|
||||
TRY(parse_prop_list(*scripts_file, unicode_data.script_list));
|
||||
TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true));
|
||||
TRY(parse_prop_list(*grapheme_break_file, unicode_data.grapheme_break_props));
|
||||
TRY(parse_prop_list(*word_break_file, unicode_data.word_break_props));
|
||||
TRY(parse_prop_list(*sentence_break_file, unicode_data.sentence_break_props));
|
||||
|
||||
populate_general_category_unions(unicode_data.general_categories);
|
||||
TRY(parse_unicode_data(*unicode_data_file, unicode_data));
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
set(TEST_SOURCES
|
||||
TestEmoji.cpp
|
||||
TestIDNA.cpp
|
||||
TestSegmentation.cpp
|
||||
TestUnicodeCharacterTypes.cpp
|
||||
TestUnicodeNormalization.cpp
|
||||
)
|
||||
|
|
|
@ -1,122 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Array.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibTest/TestCase.h>
|
||||
#include <LibUnicode/Segmentation.h>
|
||||
|
||||
template<size_t N>
|
||||
static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
|
||||
{
|
||||
Vector<size_t> boundaries;
|
||||
Utf8View view { string };
|
||||
|
||||
Unicode::for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
|
||||
boundaries.append(boundary);
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
|
||||
}
|
||||
|
||||
TEST_CASE(grapheme_segmentation)
{
    // An empty view must not produce any boundary, so the callback may never run.
    Unicode::for_each_grapheme_segmentation_boundary(Utf8View {}, [&](auto) {
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    // Plain ASCII: a boundary before and after every character.
    test_grapheme_segmentation("a"sv, { 0u, 1u });
    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });

    // Line breaks: only the CR LF pair is kept together.
    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });

    // Hangul syllable sequences (each jamo / syllable below is 3 bytes in UTF-8).
    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });

    // Emoji. The joined sequences are spelled with escapes so that the invisible ZERO WIDTH
    // JOINER (U+200D) and VARIATION SELECTOR-16 (U+FE0F) code points are visible in the source
    // and the expected byte offsets can be verified by counting.
    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
    // man ZWJ woman ZWJ girl ZWJ boy: 4 * 4 bytes + 3 * 3 bytes = 25 bytes.
    test_grapheme_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });
    // woman + skin tone, ZWJ, heavy black heart + VS16, ZWJ, man + skin tone:
    // 4 + 4 + 3 + 3 + 3 + 3 + 4 + 4 = 28 bytes.
    test_grapheme_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });
}
|
||||
|
||||
TEST_CASE(grapheme_segmentation_indic_conjunct_break)
{
    // Every code point used below occupies 3 bytes in UTF-8, so each expected offset is a
    // multiple of 3 (plus 1 for the trailing ASCII 'a' in the second case).

    // Without a linking U+094D between them, the leading U+0915 stands on its own.
    test_grapheme_segmentation("\u0915"sv, { 0u, 3u });
    test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
    test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u });

    // U+0915 U+094D U+0916 is expected to form a single cluster.
    test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });

    // One U+09BC U+09CD pair before or after the U+094D: still a single cluster.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });

    // Two U+09BC U+09CD pairs in every position relative to the U+094D.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });

    // Three pairs split around the U+094D.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u });
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });

    // Four pairs, two on either side of the U+094D.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
}
|
||||
|
||||
template<size_t N>
|
||||
static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
|
||||
{
|
||||
Vector<size_t> boundaries;
|
||||
Utf8View view { string };
|
||||
|
||||
Unicode::for_each_word_segmentation_boundary(view, [&](auto boundary) {
|
||||
boundaries.append(boundary);
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
|
||||
}
|
||||
|
||||
TEST_CASE(word_segmentation)
{
    // An empty view must not produce any boundary, so the callback may never run.
    Unicode::for_each_word_segmentation_boundary(Utf8View {}, [&](auto) {
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    // A run of letters is one word.
    test_word_segmentation("a"sv, { 0u, 1u });
    test_word_segmentation("ab"sv, { 0u, 2u });
    test_word_segmentation("abc"sv, { 0u, 3u });

    // Whitespace and line breaks between words.
    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
    // Two consecutive spaces: the 6-byte string yields a single 2-byte whitespace segment.
    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });

    // Emoji. The joined sequences are spelled with escapes so that the invisible ZERO WIDTH
    // JOINER (U+200D) and VARIATION SELECTOR-16 (U+FE0F) code points are visible in the source
    // and the expected byte offsets can be verified by counting.
    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
    // man ZWJ woman ZWJ girl ZWJ boy: 4 * 4 bytes + 3 * 3 bytes = 25 bytes.
    test_word_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });
    // woman + skin tone, ZWJ, heavy black heart + VS16, ZWJ, man + skin tone:
    // 4 + 4 + 3 + 3 + 3 + 3 + 4 + 4 = 28 bytes.
    test_word_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });

    // Numbers and dotted tokens stay together.
    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });

    // Apostrophes inside a word do not split it; surrounding double quotes are separate.
    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });

    // Mixed sentence with typographic quotes (3 bytes each in UTF-8).
    test_word_segmentation(
        "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv,
        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
}
|
|
@ -6,7 +6,6 @@ set(SOURCES
|
|||
Emoji.cpp
|
||||
IDNA.cpp
|
||||
Normalize.cpp
|
||||
Segmentation.cpp
|
||||
String.cpp
|
||||
${UNICODE_DATA_SOURCES}
|
||||
)
|
||||
|
|
|
@ -88,10 +88,6 @@ Optional<Script> __attribute__((weak)) script_from_string(StringView) { return {
|
|||
bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; }
|
||||
bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; }
|
||||
|
||||
bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; }
|
||||
bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
|
||||
bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
|
||||
|
||||
Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; }
|
||||
Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class(u32) { return {}; }
|
||||
|
||||
|
|
|
@ -37,10 +37,6 @@ Optional<Script> script_from_string(StringView);
|
|||
bool code_point_has_script(u32 code_point, Script script);
|
||||
bool code_point_has_script_extension(u32 code_point, Script script);
|
||||
|
||||
bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakProperty property);
|
||||
bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
|
||||
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
|
||||
|
||||
Optional<BidirectionalClass> bidirectional_class_from_string(StringView);
|
||||
Optional<BidirectionalClass> bidirectional_class(u32 code_point);
|
||||
|
||||
|
|
|
@ -13,11 +13,8 @@ namespace Unicode {
|
|||
enum class BidirectionalClass : u8;
|
||||
enum class EmojiGroup : u8;
|
||||
enum class GeneralCategory : u8;
|
||||
enum class GraphemeBreakProperty : u8;
|
||||
enum class Property : u8;
|
||||
enum class Script : u8;
|
||||
enum class SentenceBreakProperty : u8;
|
||||
enum class WordBreakProperty : u8;
|
||||
|
||||
struct CurrencyCode;
|
||||
struct Emoji;
|
||||
|
|
|
@ -1,491 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
|
||||
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <LibUnicode/Segmentation.h>
|
||||
|
||||
#if ENABLE_UNICODE_DATA
|
||||
# include <LibUnicode/UnicodeData.h>
|
||||
#endif
|
||||
|
||||
namespace Unicode {
|
||||
|
||||
template<typename ViewType>
|
||||
static size_t code_unit_length(ViewType const& view)
|
||||
{
|
||||
if constexpr (IsSame<ViewType, Utf8View>)
|
||||
return view.byte_length();
|
||||
else if constexpr (IsSame<ViewType, Utf16View>)
|
||||
return view.length_in_code_units();
|
||||
else if constexpr (IsSame<ViewType, Utf32View>)
|
||||
return view.length();
|
||||
else
|
||||
static_assert(DependentFalse<ViewType>);
|
||||
}
|
||||
|
||||
template<typename ViewType, typename CodeUnitIterator>
|
||||
static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
|
||||
{
|
||||
if constexpr (IsSame<ViewType, Utf8View>)
|
||||
return view.byte_offset_of(it);
|
||||
else if constexpr (IsSame<ViewType, Utf16View>)
|
||||
return view.code_unit_offset_of(it);
|
||||
else if constexpr (IsSame<ViewType, Utf32View>)
|
||||
return view.iterator_offset(it);
|
||||
else
|
||||
static_assert(DependentFalse<ViewType>);
|
||||
}
|
||||
|
||||
template<typename ViewType>
|
||||
static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
using GBP = GraphemeBreakProperty;
|
||||
|
||||
// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
|
||||
if (view.is_empty())
|
||||
return;
|
||||
|
||||
auto has_any_gbp = [](u32 code_point, auto&&... properties) {
|
||||
return (code_point_has_grapheme_break_property(code_point, properties) || ...);
|
||||
};
|
||||
|
||||
auto skip_incb_extend_linker_sequence = [&](auto& it) {
|
||||
while (true) {
|
||||
if (it == view.end() || !code_point_has_property(*it, Property::InCB_Extend))
|
||||
return;
|
||||
|
||||
auto next_it = it;
|
||||
++next_it;
|
||||
|
||||
if (next_it == view.end() || !code_point_has_property(*next_it, Property::InCB_Linker))
|
||||
return;
|
||||
|
||||
it = next_it;
|
||||
++it;
|
||||
}
|
||||
};
|
||||
|
||||
// GB1
|
||||
if (callback(0) == IterationDecision::Break)
|
||||
return;
|
||||
|
||||
if (code_unit_length(view) > 1) {
|
||||
auto it = view.begin();
|
||||
auto code_point = *it;
|
||||
u32 next_code_point = 0;
|
||||
auto current_ri_chain = 0;
|
||||
|
||||
for (++it; it != view.end(); ++it, code_point = next_code_point) {
|
||||
next_code_point = *it;
|
||||
|
||||
// GB9c
|
||||
if (code_point_has_property(code_point, Property::InCB_Consonant)) {
|
||||
auto it_copy = it;
|
||||
skip_incb_extend_linker_sequence(it_copy);
|
||||
|
||||
if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) {
|
||||
++it_copy;
|
||||
skip_incb_extend_linker_sequence(it_copy);
|
||||
|
||||
if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) {
|
||||
next_code_point = *it_copy;
|
||||
it = it_copy;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GB11
|
||||
if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
|
||||
auto it_copy = it;
|
||||
|
||||
while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend))
|
||||
++it_copy;
|
||||
|
||||
if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) {
|
||||
++it_copy;
|
||||
|
||||
if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) {
|
||||
next_code_point = *it_copy;
|
||||
it = it_copy;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
|
||||
auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
|
||||
|
||||
// GB3
|
||||
if (code_point_is_cr && next_code_point_is_lf)
|
||||
continue;
|
||||
// GB4, GB5
|
||||
if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
|
||||
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
|
||||
auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
|
||||
|
||||
// GB6
|
||||
if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
|
||||
continue;
|
||||
// GB7
|
||||
if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
|
||||
continue;
|
||||
// GB8
|
||||
if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
|
||||
continue;
|
||||
|
||||
// GB9
|
||||
if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
|
||||
continue;
|
||||
// GB9a
|
||||
if (has_any_gbp(next_code_point, GBP::SpacingMark))
|
||||
continue;
|
||||
// GB9b
|
||||
if (has_any_gbp(code_point, GBP::Prepend))
|
||||
continue;
|
||||
|
||||
auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
|
||||
current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
|
||||
|
||||
// GB12, GB13
|
||||
if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
|
||||
continue;
|
||||
|
||||
// GB999
|
||||
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// GB2
|
||||
callback(code_unit_length(view));
|
||||
#endif
|
||||
}
|
||||
|
||||
void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
|
||||
{
|
||||
for_each_grapheme_segmentation_boundary_impl(view, move(callback));
|
||||
}
|
||||
|
||||
void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
|
||||
{
|
||||
for_each_grapheme_segmentation_boundary_impl(view, move(callback));
|
||||
}
|
||||
|
||||
void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
|
||||
{
|
||||
for_each_grapheme_segmentation_boundary_impl(view, move(callback));
|
||||
}
|
||||
|
||||
template<typename ViewType>
|
||||
static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
using WBP = WordBreakProperty;
|
||||
|
||||
// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
|
||||
if (view.is_empty())
|
||||
return;
|
||||
|
||||
auto has_any_wbp = [](u32 code_point, auto&&... properties) {
|
||||
return (code_point_has_word_break_property(code_point, properties) || ...);
|
||||
};
|
||||
|
||||
// WB1
|
||||
if (callback(0) == IterationDecision::Break)
|
||||
return;
|
||||
|
||||
if (code_unit_length(view) > 1) {
|
||||
auto it = view.begin();
|
||||
auto code_point = *it;
|
||||
u32 next_code_point;
|
||||
Optional<u32> previous_code_point;
|
||||
auto current_ri_chain = 0;
|
||||
|
||||
for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
|
||||
next_code_point = *it;
|
||||
|
||||
auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
|
||||
auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
|
||||
|
||||
// WB3
|
||||
if (code_point_is_cr && next_code_point_is_lf)
|
||||
continue;
|
||||
// WB3a, WB3b
|
||||
if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
|
||||
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
// WB3c
|
||||
if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
|
||||
continue;
|
||||
// WB3d
|
||||
if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
|
||||
continue;
|
||||
|
||||
// WB4
|
||||
if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
|
||||
continue;
|
||||
|
||||
auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
|
||||
auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
|
||||
auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
|
||||
auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
|
||||
|
||||
// WB5
|
||||
if (code_point_is_ah_letter && next_code_point_is_ah_letter)
|
||||
continue;
|
||||
|
||||
Optional<u32> next_next_code_point;
|
||||
if (it != view.end()) {
|
||||
auto it_copy = it;
|
||||
++it_copy;
|
||||
if (it_copy != view.end())
|
||||
next_next_code_point = *it_copy;
|
||||
}
|
||||
bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
|
||||
bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
|
||||
|
||||
auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
|
||||
|
||||
// WB6
|
||||
if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
|
||||
continue;
|
||||
|
||||
auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
|
||||
auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
|
||||
auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
|
||||
|
||||
// WB7
|
||||
if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
|
||||
continue;
|
||||
// WB7a
|
||||
if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
|
||||
continue;
|
||||
// WB7b
|
||||
if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
|
||||
continue;
|
||||
// WB7c
|
||||
if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
|
||||
continue;
|
||||
|
||||
auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
|
||||
auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
|
||||
|
||||
// WB8
|
||||
if (code_point_is_numeric && next_code_point_is_numeric)
|
||||
continue;
|
||||
// WB9
|
||||
if (code_point_is_ah_letter && next_code_point_is_numeric)
|
||||
continue;
|
||||
// WB10
|
||||
if (code_point_is_numeric && next_code_point_is_ah_letter)
|
||||
continue;
|
||||
|
||||
auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
|
||||
|
||||
// WB11
|
||||
if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
|
||||
continue;
|
||||
|
||||
bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
|
||||
|
||||
// WB12
|
||||
if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
|
||||
continue;
|
||||
|
||||
auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
|
||||
auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
|
||||
|
||||
// WB13
|
||||
if (code_point_is_katakana && next_code_point_is_katakana)
|
||||
continue;
|
||||
|
||||
auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
|
||||
|
||||
// WB13a
|
||||
if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
|
||||
continue;
|
||||
// WB13b
|
||||
if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
|
||||
continue;
|
||||
|
||||
auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
|
||||
current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
|
||||
|
||||
// WB15, WB16
|
||||
if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
|
||||
continue;
|
||||
|
||||
// WB999
|
||||
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// WB2
|
||||
callback(code_unit_length(view));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Word segmentation entry point for UTF-8 views. All of the work happens in the shared templated
// implementation; the callback is moved into it.
void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{
    for_each_word_segmentation_boundary_impl<Utf8View>(view, move(callback));
}
|
||||
|
||||
// Word segmentation entry point for UTF-16 views. All of the work happens in the shared templated
// implementation; the callback is moved into it.
void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{
    for_each_word_segmentation_boundary_impl<Utf16View>(view, move(callback));
}
|
||||
|
||||
// Word segmentation entry point for UTF-32 views. All of the work happens in the shared templated
// implementation; the callback is moved into it.
void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{
    for_each_word_segmentation_boundary_impl<Utf32View>(view, move(callback));
}
|
||||
|
||||
template<typename ViewType>
|
||||
static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
using SBP = SentenceBreakProperty;
|
||||
|
||||
// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
|
||||
if (view.is_empty())
|
||||
return;
|
||||
|
||||
auto has_any_sbp = [](u32 code_point, auto&&... properties) {
|
||||
return (code_point_has_sentence_break_property(code_point, properties) || ...);
|
||||
};
|
||||
|
||||
// SB1
|
||||
if (callback(0) == IterationDecision::Break)
|
||||
return;
|
||||
|
||||
if (code_unit_length(view) > 1) {
|
||||
auto it = view.begin();
|
||||
auto code_point = *it;
|
||||
u32 next_code_point;
|
||||
Optional<u32> previous_code_point;
|
||||
enum class TerminatorSequenceState {
|
||||
None,
|
||||
Term,
|
||||
Close,
|
||||
Sp
|
||||
} terminator_sequence_state { TerminatorSequenceState::None };
|
||||
auto term_was_a_term = false;
|
||||
|
||||
for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
|
||||
next_code_point = *it;
|
||||
|
||||
auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
|
||||
auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
|
||||
|
||||
// SB3
|
||||
if (code_point_is_cr && next_code_point_is_lf)
|
||||
continue;
|
||||
|
||||
auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
|
||||
|
||||
// SB4
|
||||
if (code_point_is_para_sep) {
|
||||
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
|
||||
// SB5
|
||||
if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
|
||||
continue;
|
||||
|
||||
auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
|
||||
|
||||
// SB6
|
||||
if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
|
||||
continue;
|
||||
// SB7
|
||||
if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
|
||||
continue;
|
||||
|
||||
if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
|
||||
terminator_sequence_state = TerminatorSequenceState::Term;
|
||||
term_was_a_term = code_point_is_a_term;
|
||||
} else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
|
||||
terminator_sequence_state = TerminatorSequenceState::Close;
|
||||
} else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
|
||||
terminator_sequence_state = TerminatorSequenceState::Sp;
|
||||
} else {
|
||||
terminator_sequence_state = TerminatorSequenceState::None;
|
||||
}
|
||||
|
||||
// SB8
|
||||
if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
|
||||
auto it_copy = it;
|
||||
bool illegal_sequence = false;
|
||||
for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
|
||||
if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
|
||||
continue;
|
||||
illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
|
||||
}
|
||||
if (illegal_sequence)
|
||||
continue;
|
||||
}
|
||||
|
||||
// SB8a
|
||||
if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
|
||||
continue;
|
||||
|
||||
auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
|
||||
auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
|
||||
|
||||
// SB9
|
||||
if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
|
||||
continue;
|
||||
|
||||
// SB10
|
||||
if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
|
||||
continue;
|
||||
|
||||
// SB11
|
||||
if (terminator_sequence_state >= TerminatorSequenceState::Term)
|
||||
if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
|
||||
return;
|
||||
|
||||
// SB998
|
||||
}
|
||||
}
|
||||
|
||||
// SB2
|
||||
callback(code_unit_length(view));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Sentence segmentation entry point for UTF-8 views. All of the work happens in the shared
// templated implementation; the callback is moved into it.
void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{
    for_each_sentence_segmentation_boundary_impl<Utf8View>(view, move(callback));
}
|
||||
|
||||
// Sentence segmentation entry point for UTF-16 views. All of the work happens in the shared
// templated implementation; the callback is moved into it.
void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{
    for_each_sentence_segmentation_boundary_impl<Utf16View>(view, move(callback));
}
|
||||
|
||||
// Sentence segmentation entry point for UTF-32 views. All of the work happens in the shared
// templated implementation; the callback is moved into it.
void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{
    for_each_sentence_segmentation_boundary_impl<Utf32View>(view, move(callback));
}
|
||||
|
||||
}
|
|
@ -1,134 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
|
||||
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/Function.h>
|
||||
#include <AK/IterationDecision.h>
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/Types.h>
|
||||
|
||||
namespace Unicode {
|
||||
|
||||
using SegmentationCallback = Function<IterationDecision(size_t)>;
|
||||
|
||||
void for_each_grapheme_segmentation_boundary(Utf8View const&, SegmentationCallback);
|
||||
void for_each_grapheme_segmentation_boundary(Utf16View const&, SegmentationCallback);
|
||||
void for_each_grapheme_segmentation_boundary(Utf32View const&, SegmentationCallback);
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> next_grapheme_segmentation_boundary(ViewType const& view, size_t index)
|
||||
{
|
||||
Optional<size_t> result;
|
||||
|
||||
for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
|
||||
if (boundary > index) {
|
||||
result = boundary;
|
||||
return IterationDecision::Break;
|
||||
}
|
||||
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> previous_grapheme_segmentation_boundary(ViewType const& view, size_t index)
|
||||
{
|
||||
Optional<size_t> result;
|
||||
|
||||
for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
|
||||
if (boundary < index) {
|
||||
result = boundary;
|
||||
return IterationDecision::Continue;
|
||||
}
|
||||
|
||||
return IterationDecision::Break;
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void for_each_word_segmentation_boundary(Utf8View const&, SegmentationCallback);
|
||||
void for_each_word_segmentation_boundary(Utf16View const&, SegmentationCallback);
|
||||
void for_each_word_segmentation_boundary(Utf32View const&, SegmentationCallback);
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> next_word_segmentation_boundary(ViewType const& view, size_t index)
|
||||
{
|
||||
Optional<size_t> result;
|
||||
|
||||
for_each_word_segmentation_boundary(view, [&](auto boundary) {
|
||||
if (boundary > index) {
|
||||
result = boundary;
|
||||
return IterationDecision::Break;
|
||||
}
|
||||
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> previous_word_segmentation_boundary(ViewType const& view, size_t index)
|
||||
{
|
||||
Optional<size_t> result;
|
||||
|
||||
for_each_word_segmentation_boundary(view, [&](auto boundary) {
|
||||
if (boundary < index) {
|
||||
result = boundary;
|
||||
return IterationDecision::Continue;
|
||||
}
|
||||
|
||||
return IterationDecision::Break;
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void for_each_sentence_segmentation_boundary(Utf8View const&, SegmentationCallback);
|
||||
void for_each_sentence_segmentation_boundary(Utf16View const&, SegmentationCallback);
|
||||
void for_each_sentence_segmentation_boundary(Utf32View const&, SegmentationCallback);
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> next_sentence_segmentation_boundary(ViewType const& view, size_t index)
|
||||
{
|
||||
Optional<size_t> result;
|
||||
|
||||
for_each_sentence_segmentation_boundary(view, [&](auto boundary) {
|
||||
if (boundary > index) {
|
||||
result = boundary;
|
||||
return IterationDecision::Break;
|
||||
}
|
||||
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> previous_sentence_segmentation_boundary(ViewType const& view, size_t index)
|
||||
{
|
||||
Optional<size_t> result;
|
||||
|
||||
for_each_sentence_segmentation_boundary(view, [&](auto boundary) {
|
||||
if (boundary < index) {
|
||||
result = boundary;
|
||||
return IterationDecision::Continue;
|
||||
}
|
||||
|
||||
return IterationDecision::Break;
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in a new issue