Browse Source

LibJS: Implement the Intl.Segmenter FindBoundary AO

Idan Horowitz 3 years ago
parent
commit
cea6c81c77

+ 72 - 0
Userland/Libraries/LibJS/Runtime/Intl/Segmenter.cpp

@@ -4,7 +4,10 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/BinarySearch.h>
+#include <AK/Utf16View.h>
 #include <LibJS/Runtime/Intl/Segmenter.h>
+#include <LibUnicode/CharacterTypes.h>
 
 namespace JS::Intl {
 
@@ -40,4 +43,73 @@ StringView Segmenter::segmenter_granularity_string() const
     }
 }
 
+// 18.8.1 FindBoundary ( segmenter, string, startIndex, direction ), https://tc39.es/ecma402/#sec-findboundary
+//
+// Returns the code unit index of the segmentation boundary adjacent to start_index:
+//  - Direction::Before: the last boundary located at or before start_index, or 0 if there is none.
+//  - Direction::After: the first boundary located strictly after start_index, or the length of the string if
+//    there is none (+∞ if the string is empty or start_index is at or past its end).
+//
+// boundaries_cache is a non-standard in/out parameter: if it is empty, it is filled with the boundaries of `string`
+// for the segmenter's granularity; otherwise its contents are reused as-is, so a cache must only ever be paired
+// with the string and granularity it was populated for. The binary searches below require the cached boundaries
+// to be sorted in ascending order.
+double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction, Optional<Vector<size_t>>& boundaries_cache)
+{
+    // 1. Let locale be segmenter.[[Locale]].
+    auto const& locale = segmenter.locale();
+
+    // 2. Let granularity be segmenter.[[SegmenterGranularity]].
+    auto granularity = segmenter.segmenter_granularity();
+
+    // 3. Let len be the length of string.
+    auto length = string.length_in_code_units();
+
+    // Non-standard, populate boundaries cache
+    if (!boundaries_cache.has_value()) {
+        switch (granularity) {
+        case Segmenter::SegmenterGranularity::Grapheme:
+            boundaries_cache = Unicode::find_grapheme_segmentation_boundaries(string);
+            break;
+        case Segmenter::SegmenterGranularity::Word:
+            boundaries_cache = Unicode::find_word_segmentation_boundaries(string);
+            break;
+        case Segmenter::SegmenterGranularity::Sentence:
+            boundaries_cache = Unicode::find_sentence_segmentation_boundaries(string);
+            break;
+        default:
+            VERIFY_NOT_REACHED();
+        }
+    }
+    (void)locale; // TODO: Support locale-sensitive boundaries
+
+    // 4. If direction is before, then
+    if (direction == Direction::Before) {
+        // a. Assert: startIndex ≥ 0.
+        VERIFY(start_index >= 0);
+        // b. Assert: startIndex < len.
+        VERIFY(start_index < length);
+
+        // c. Search string for the last segmentation boundary that is preceded by at most startIndex code units from the beginning, using locale locale and text element granularity granularity.
+        // NOTE: Without an exact match, binary_search() only reports an index *near* start_index, which may refer
+        //       to a boundary past it. Step back so that the candidate is never located after start_index.
+        size_t boundary_index = 0;
+        binary_search(*boundaries_cache, start_index, &boundary_index);
+        while (boundary_index > 0 && boundary_index < boundaries_cache->size() && boundaries_cache->at(boundary_index) > start_index)
+            --boundary_index;
+
+        // d. If a boundary is found, return the count of code units in string preceding it.
+        if (boundary_index < boundaries_cache->size() && boundaries_cache->at(boundary_index) <= start_index)
+            return boundaries_cache->at(boundary_index);
+
+        // e. Return 0.
+        return 0;
+    }
+
+    // 5. Assert: direction is after.
+    VERIFY(direction == Direction::After);
+
+    // 6. If len is 0 or startIndex ≥ len, return +∞.
+    if (length == 0 || start_index >= length)
+        return INFINITY;
+
+    // 7. Search string for the first segmentation boundary that follows the code unit at index startIndex, using locale locale and text element granularity granularity.
+    // NOTE: The index reported by binary_search() may refer to a boundary at/before start_index (the usual case) or
+    //       to one already past it, so advance only while the candidate does not yet follow start_index instead of
+    //       unconditionally skipping one entry.
+    size_t boundary_index = 0;
+    binary_search(*boundaries_cache, start_index, &boundary_index);
+    while (boundary_index < boundaries_cache->size() && boundaries_cache->at(boundary_index) <= start_index)
+        ++boundary_index;
+
+    // 8. If a boundary is found, return the count of code units in string preceding it.
+    if (boundary_index < boundaries_cache->size())
+        return boundaries_cache->at(boundary_index);
+
+    // 9. Return len.
+    return length;
+}
+
 }

+ 6 - 0
Userland/Libraries/LibJS/Runtime/Intl/Segmenter.h

@@ -36,4 +36,10 @@ private:
     SegmenterGranularity m_segmenter_granularity { SegmenterGranularity::Grapheme }; // [[SegmenterGranularity]]
 };
 
+enum class Direction { // Search direction argument of find_boundary().
+    Before, // Look for the last boundary at or before the start index.
+    After,  // Look for the first boundary following the start index.
+};
+double find_boundary(Segmenter const&, Utf16View const&, double start_index, Direction, Optional<Vector<size_t>>& boundaries_cache); // 18.8.1 FindBoundary; boundaries_cache is a non-standard in/out cache that is populated on first use.
+
 }

+ 4 - 0
Userland/Libraries/LibJS/Runtime/Intl/Segments.h

@@ -25,11 +25,15 @@ public:
 
     Utf16View segments_string() const { return m_segments_string.view(); } // View over [[SegmentsString]].
 
+    Optional<Vector<size_t>>& boundaries_cache() const { return m_boundaries_cache; } // Non-standard: writable reference to the boundaries cache, even on const objects (the member is mutable).
+
 private:
     virtual void visit_edges(Cell::Visitor&) override;
 
     Segmenter& m_segments_segmenter; // [[SegmentsSegmenter]]
     Utf16String m_segments_string;   // [[SegmentsString]]
+
+    mutable Optional<Vector<size_t>> m_boundaries_cache; // Non-standard boundaries cache; mutable so that the const accessor boundaries_cache() can expose it for writing.
 };
 
 }