LibUnicode: Add an overload of word segmentation for UTF-8 strings

This commit is contained in:
Timothy Flynn 2023-01-16 09:39:12 -05:00 committed by Tim Flynn
parent 40cb41a16c
commit 6d710eeb43
Notes: sideshowbarker 2024-07-17 02:57:43 +09:00
2 changed files with 40 additions and 6 deletions

View file

@ -227,24 +227,46 @@ Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View
#endif
}
Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
template<typename ViewType>
static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
{
#if ENABLE_UNICODE_DATA
using WBP = WordBreakProperty;
Vector<size_t> boundaries;
// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
if (view.length_in_code_points() == 0)
if (view.is_empty())
return boundaries;
auto has_any_wbp = [](u32 code_point, auto&&... properties) {
return (code_point_has_word_break_property(code_point, properties) || ...);
};
size_t code_unit_length = 0;
size_t code_point_length = 0;
if constexpr (requires { view.byte_length(); }) {
code_unit_length = view.byte_length();
code_point_length = view.length();
} else if constexpr (requires { view.length_in_code_units(); }) {
code_unit_length = view.length_in_code_units();
code_point_length = view.length_in_code_points();
} else {
static_assert(DependentFalse<ViewType>);
}
auto code_unit_offset_of = [&](auto it) {
if constexpr (requires { view.byte_offset_of(it); })
return view.byte_offset_of(it);
else if constexpr (requires { view.code_unit_offset_of(it); })
return view.code_unit_offset_of(it);
VERIFY_NOT_REACHED();
};
// WB1
boundaries.append(0);
if (view.length_in_code_points() > 1) {
if (code_point_length > 1) {
auto it = view.begin();
auto code_point = *it;
u32 next_code_point;
@ -262,7 +284,7 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
continue;
// WB3a, WB3b
if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
boundaries.append(view.code_unit_offset_of(it));
boundaries.append(code_unit_offset_of(it));
continue;
}
// WB3c
@ -367,18 +389,28 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
continue;
// WB999
boundaries.append(view.code_unit_offset_of(it));
boundaries.append(code_unit_offset_of(it));
}
}
// WB2
boundaries.append(view.length_in_code_units());
boundaries.append(code_unit_length);
return boundaries;
#else
return {};
#endif
}
Vector<size_t> find_word_segmentation_boundaries(Utf8View const& view)
{
return find_word_segmentation_boundaries_impl(view);
}
Vector<size_t> find_word_segmentation_boundaries(Utf16View const& view)
{
return find_word_segmentation_boundaries_impl(view);
}
Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
{
#if ENABLE_UNICODE_DATA

View file

@ -11,6 +11,7 @@
#include <AK/Optional.h>
#include <AK/Span.h>
#include <AK/Types.h>
#include <AK/Vector.h>
#include <LibUnicode/Forward.h>
namespace Unicode {
@ -60,6 +61,7 @@ bool code_point_has_word_break_property(u32 code_point, WordBreakProperty proper
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
Vector<size_t> find_word_segmentation_boundaries(Utf8View const&);
Vector<size_t> find_word_segmentation_boundaries(Utf16View const&);
Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&);