2 年前 · dd4c47456e
--- a/Userland/Libraries/LibUnicode/Segmentation.cpp
+++ b/Userland/Libraries/LibUnicode/Segmentation.cpp
@@ -6,6 +6,7 @@
 
				  */
			
 
				 
			
 
				 #include <AK/Utf16View.h>
			
 
				+#include <AK/Utf32View.h>
			
 
				 #include <AK/Utf8View.h>
			
 
				 #include <LibUnicode/CharacterTypes.h>
			
 
				 #include <LibUnicode/Segmentation.h>
			
@@ -16,14 +17,41 @@
 
				 
			
 
				 namespace Unicode {
			
 
				 
			
 
				-Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
			
 
				+template<typename ViewType>
			
 
				+static size_t code_unit_length(ViewType const& view)
			
 
				+{
			
 
				+    if constexpr (IsSame<ViewType, Utf8View>)
			
 
				+        return view.byte_length();
			
 
				+    else if constexpr (IsSame<ViewType, Utf16View>)
			
 
				+        return view.length_in_code_units();
			
 
				+    else if constexpr (IsSame<ViewType, Utf32View>)
			
 
				+        return view.length();
			
 
				+    else
			
 
				+        static_assert(DependentFalse<ViewType>);
			
 
				+}
			
 
				+
			
 
				+template<typename ViewType, typename CodeUnitIterator>
			
 
				+static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
			
 
				+{
			
 
				+    if constexpr (IsSame<ViewType, Utf8View>)
			
 
				+        return view.byte_offset_of(it);
			
 
				+    else if constexpr (IsSame<ViewType, Utf16View>)
			
 
				+        return view.code_unit_offset_of(it);
			
 
				+    else if constexpr (IsSame<ViewType, Utf32View>)
			
 
				+        return view.iterator_offset(it);
			
 
				+    else
			
 
				+        static_assert(DependentFalse<ViewType>);
			
 
				+}
			
 
				+
			
 
				+template<typename ViewType>
			
 
				+static Vector<size_t> find_grapheme_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
			
 
				 {
			
 
				 #if ENABLE_UNICODE_DATA
			
 
				     using GBP = GraphemeBreakProperty;
			
 
				     Vector<size_t> boundaries;
			
 
				 
			
 
				     // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
			
 
				-    if (view.length_in_code_points() == 0)
			
 
				+    if (view.is_empty())
			
 
				         return boundaries;
			
 
				 
			
 
				     auto has_any_gbp = [](u32 code_point, auto&&... properties) {
			
@@ -33,7 +61,7 @@ Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View
 
				     // GB1
			
 
				     boundaries.append(0);
			
 
				 
			
 
				-    if (view.length_in_code_points() > 1) {
			
 
				+    if (code_unit_length(view) > 1) {
			
 
				         auto it = view.begin();
			
 
				         auto code_point = *it;
			
 
				         u32 next_code_point;
			
@@ -51,7 +79,7 @@ Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View
 
				                 continue;
			
 
				             // GB4, GB5
			
 
				             if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
			
 
				-                boundaries.append(view.code_unit_offset_of(it));
			
 
				+                boundaries.append(code_unit_offset_of(view, it));
			
 
				                 continue;
			
 
				             }
			
 
				 
			
@@ -96,18 +124,33 @@ Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View
 
				                 continue;
			
 
				 
			
 
				             // GB999
			
 
				-            boundaries.append(view.code_unit_offset_of(it));
			
 
				+            boundaries.append(code_unit_offset_of(view, it));
			
 
				         }
			
 
				     }
			
 
				 
			
 
				     // GB2
			
 
				-    boundaries.append(view.length_in_code_units());
			
 
				+    boundaries.append(code_unit_length(view));
			
 
				     return boundaries;
			
 
				 #else
			
 
				     return {};
			
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+Vector<size_t> find_grapheme_segmentation_boundaries(Utf8View const& view)
			
 
				+{
			
 
				+    return find_grapheme_segmentation_boundaries_impl(view);
			
 
				+}
			
 
				+
			
 
				+Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const& view)
			
 
				+{
			
 
				+    return find_grapheme_segmentation_boundaries_impl(view);
			
 
				+}
			
 
				+
			
 
				+Vector<size_t> find_grapheme_segmentation_boundaries(Utf32View const& view)
			
 
				+{
			
 
				+    return find_grapheme_segmentation_boundaries_impl(view);
			
 
				+}
			
 
				+
			
 
				 template<typename ViewType>
			
 
				 static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
			
 
				 {
			
@@ -123,31 +166,10 @@ static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] Vi
 
				         return (code_point_has_word_break_property(code_point, properties) || ...);
			
 
				     };
			
 
				 
			
 
				-    size_t code_unit_length = 0;
			
 
				-    size_t code_point_length = 0;
			
 
				-
			
 
				-    if constexpr (requires { view.byte_length(); }) {
			
 
				-        code_unit_length = view.byte_length();
			
 
				-        code_point_length = view.length();
			
 
				-    } else if constexpr (requires { view.length_in_code_units(); }) {
			
 
				-        code_unit_length = view.length_in_code_units();
			
 
				-        code_point_length = view.length_in_code_points();
			
 
				-    } else {
			
 
				-        static_assert(DependentFalse<ViewType>);
			
 
				-    }
			
 
				-
			
 
				-    auto code_unit_offset_of = [&](auto it) {
			
 
				-        if constexpr (requires { view.byte_offset_of(it); })
			
 
				-            return view.byte_offset_of(it);
			
 
				-        else if constexpr (requires { view.code_unit_offset_of(it); })
			
 
				-            return view.code_unit_offset_of(it);
			
 
				-        VERIFY_NOT_REACHED();
			
 
				-    };
			
 
				-
			
 
				     // WB1
			
 
				     boundaries.append(0);
			
 
				 
			
 
				-    if (code_point_length > 1) {
			
 
				+    if (code_unit_length(view) > 1) {
			
 
				         auto it = view.begin();
			
 
				         auto code_point = *it;
			
 
				         u32 next_code_point;
			
@@ -165,7 +187,7 @@ static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] Vi
 
				                 continue;
			
 
				             // WB3a, WB3b
			
 
				             if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
			
 
				-                boundaries.append(code_unit_offset_of(it));
			
 
				+                boundaries.append(code_unit_offset_of(view, it));
			
 
				                 continue;
			
 
				             }
			
 
				             // WB3c
			
@@ -270,12 +292,12 @@ static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] Vi
 
				                 continue;
			
 
				 
			
 
				             // WB999
			
 
				-            boundaries.append(code_unit_offset_of(it));
			
 
				+            boundaries.append(code_unit_offset_of(view, it));
			
 
				         }
			
 
				     }
			
 
				 
			
 
				     // WB2
			
 
				-    boundaries.append(code_unit_length);
			
 
				+    boundaries.append(code_unit_length(view));
			
 
				     return boundaries;
			
 
				 #else
			
 
				     return {};
			
@@ -292,14 +314,20 @@ Vector<size_t> find_word_segmentation_boundaries(Utf16View const& view)
 
				     return find_word_segmentation_boundaries_impl(view);
			
 
				 }
			
 
				 
			
 
				-Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
			
 
				+Vector<size_t> find_word_segmentation_boundaries(Utf32View const& view)
			
 
				+{
			
 
				+    return find_word_segmentation_boundaries_impl(view);
			
 
				+}
			
 
				+
			
 
				+template<typename ViewType>
			
 
				+static Vector<size_t> find_sentence_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
			
 
				 {
			
 
				 #if ENABLE_UNICODE_DATA
			
 
				     using SBP = SentenceBreakProperty;
			
 
				     Vector<size_t> boundaries;
			
 
				 
			
 
				     // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
			
 
				-    if (view.length_in_code_points() == 0)
			
 
				+    if (view.is_empty())
			
 
				         return boundaries;
			
 
				 
			
 
				     auto has_any_sbp = [](u32 code_point, auto&&... properties) {
			
@@ -309,7 +337,7 @@ Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View
 
				     // SB1
			
 
				     boundaries.append(0);
			
 
				 
			
 
				-    if (view.length_in_code_points() > 1) {
			
 
				+    if (code_unit_length(view) > 1) {
			
 
				         auto it = view.begin();
			
 
				         auto code_point = *it;
			
 
				         u32 next_code_point;
			
@@ -336,7 +364,7 @@ Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View
 
				 
			
 
				             // SB4
			
 
				             if (code_point_is_para_sep) {
			
 
				-                boundaries.append(view.code_unit_offset_of(it));
			
 
				+                boundaries.append(code_unit_offset_of(view, it));
			
 
				                 continue;
			
 
				             }
			
 
				 
			
@@ -394,18 +422,33 @@ Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View
 
				 
			
 
				             // SB11
			
 
				             if (terminator_sequence_state >= TerminatorSequenceState::Term)
			
 
				-                boundaries.append(view.code_unit_offset_of(it));
			
 
				+                boundaries.append(code_unit_offset_of(view, it));
			
 
				 
			
 
				             // SB998
			
 
				         }
			
 
				     }
			
 
				 
			
 
				     // SB2
			
 
				-    boundaries.append(view.length_in_code_units());
			
 
				+    boundaries.append(code_unit_length(view));
			
 
				     return boundaries;
			
 
				 #else
			
 
				     return {};
			
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+Vector<size_t> find_sentence_segmentation_boundaries(Utf8View const& view)
			
 
				+{
			
 
				+    return find_sentence_segmentation_boundaries_impl(view);
			
 
				+}
			
 
				+
			
 
				+Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const& view)
			
 
				+{
			
 
				+    return find_sentence_segmentation_boundaries_impl(view);
			
 
				+}
			
 
				+
			
 
				+Vector<size_t> find_sentence_segmentation_boundaries(Utf32View const& view)
			
 
				+{
			
 
				+    return find_sentence_segmentation_boundaries_impl(view);
			
 
				+}
			
 
				+
			
 
				 }
			
--- a/Userland/Libraries/LibUnicode/Segmentation.h
+++ b/Userland/Libraries/LibUnicode/Segmentation.h
@@ -13,11 +13,16 @@
 
				 
			
 
				 namespace Unicode {
			
 
				 
			
 
				+Vector<size_t> find_grapheme_segmentation_boundaries(Utf8View const&);
			
 
				 Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
			
 
				+Vector<size_t> find_grapheme_segmentation_boundaries(Utf32View const&);
			
 
				 
			
 
				 Vector<size_t> find_word_segmentation_boundaries(Utf8View const&);
			
 
				 Vector<size_t> find_word_segmentation_boundaries(Utf16View const&);
			
 
				+Vector<size_t> find_word_segmentation_boundaries(Utf32View const&);
			
 
				 
			
 
				+Vector<size_t> find_sentence_segmentation_boundaries(Utf8View const&);
			
 
				 Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&);
			
 
				+Vector<size_t> find_sentence_segmentation_boundaries(Utf32View const&);
			
 
				 
			
 
				 }