Selaa lähdekoodia

LibUnicode: Implement grapheme segmentation

Idan Horowitz 3 vuotta sitten
vanhempi
commit
58b0eed6a7

+ 93 - 0
Userland/Libraries/LibUnicode/CharacterTypes.cpp

@@ -8,6 +8,7 @@
 #include <AK/Platform.h>
 #include <AK/StringBuilder.h>
 #include <AK/Types.h>
+#include <AK/Utf16View.h>
 #include <AK/Utf8View.h>
 #include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/Locale.h>
@@ -357,4 +358,96 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB
 // Weak definitions that answer `false` (`{}`) for every code point and property.
 // NOTE(review): presumably replaced at link time by stronger definitions backed by real Unicode data - confirm.
 bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
 bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
 
+// Computes the extended grapheme cluster boundaries of `view` according to
+// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+//
+// The returned values are UTF-16 code unit offsets into `view`. For non-empty input the result
+// begins with 0 (GB1) and ends with view.length_in_code_units() (GB2). An empty view yields an
+// empty vector, as does any input when ENABLE_UNICODE_DATA is not set.
+Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
+{
+#if ENABLE_UNICODE_DATA
+    using GBP = GraphemeBreakProperty;
+    Vector<size_t> boundaries;
+
+    if (view.length_in_code_points() == 0)
+        return boundaries;
+
+    // True if the code point has at least one of the listed Grapheme_Cluster_Break values.
+    auto has_any_gbp = [](u32 code_point, auto&&... properties) {
+        return (code_point_has_grapheme_break_property(code_point, properties) || ...);
+    };
+
+    // GB1
+    boundaries.append(0);
+
+    auto it = view.begin();
+    auto code_point = *it;
+    u32 next_code_point = 0;
+    // Number of consecutive Regional_Indicator code points ending at (and including) code_point.
+    auto current_ri_chain = 0;
+    // True while the code points up to and including code_point end in "Extended_Pictographic Extend*".
+    auto in_emoji_sequence = false;
+
+    // Every iteration decides whether there is a boundary between code_point and next_code_point.
+    // For a view holding a single code point the body never runs.
+    for (++it; it != view.end(); ++it, code_point = next_code_point) {
+        next_code_point = *it;
+
+        // Update the state carried between iterations *before* any rule gets a chance to
+        // `continue`. Otherwise a code point handled by an early rule (e.g. a Control, or
+        // anything followed by Extend) would leave stale state behind for GB11 / GB12 / GB13.
+        auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
+        auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
+
+        // GB11 requires exactly "ExtPict Extend* ZWJ", so remember whether this ZWJ directly
+        // follows such a prefix; the ZWJ itself then terminates the prefix.
+        auto zwj_continues_emoji_sequence = in_emoji_sequence && code_point_is_zwj;
+
+        if (code_point_has_property(code_point, Property::Extended_Pictographic))
+            in_emoji_sequence = true;
+        else if (!has_any_gbp(code_point, GBP::Extend))
+            in_emoji_sequence = false;
+
+        current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
+
+        auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
+        auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
+
+        // GB3
+        if (code_point_is_cr && next_code_point_is_lf)
+            continue;
+        // GB4, GB5
+        if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
+            boundaries.append(view.code_unit_offset_of(it));
+            continue;
+        }
+
+        auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
+        auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
+
+        // GB6
+        if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
+            continue;
+        // GB7
+        if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
+            continue;
+        // GB8
+        if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
+            continue;
+
+        // GB9
+        if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
+            continue;
+        // GB9a
+        if (has_any_gbp(next_code_point, GBP::SpacingMark))
+            continue;
+        // GB9b
+        if (has_any_gbp(code_point, GBP::Prepend))
+            continue;
+
+        // GB11
+        if (zwj_continues_emoji_sequence && code_point_has_property(next_code_point, Property::Extended_Pictographic))
+            continue;
+
+        // GB12, GB13: only join when an odd number of regional indicators precedes the break
+        // position, i.e. when next_code_point completes a pair.
+        if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
+            continue;
+
+        // GB999
+        boundaries.append(view.code_unit_offset_of(it));
+    }
+
+    // GB2
+    boundaries.append(view.length_in_code_units());
+    return boundaries;
+#else
+    return {};
+#endif
+}
+
 }

+ 2 - 0
Userland/Libraries/LibUnicode/CharacterTypes.h

@@ -44,4 +44,6 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert
 bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
 bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
 
+// Returns the grapheme cluster boundaries of the given view as UTF-16 code unit offsets.
+// For non-empty input the result starts with 0 and ends with the view's length in code units;
+// an empty view (or a build without ENABLE_UNICODE_DATA) produces an empty vector.
+Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
+
 }