Segmenter.cpp 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. /*
  2. * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/BinarySearch.h>
  7. #include <AK/Utf16View.h>
  8. #include <LibJS/Runtime/GlobalObject.h>
  9. #include <LibJS/Runtime/Intl/Segmenter.h>
  10. #include <LibUnicode/Segmentation.h>
  11. namespace JS::Intl {
  12. // 18 Segmenter Objects, https://tc39.es/ecma402/#segmenter-objects
  13. Segmenter::Segmenter(Object& prototype)
  14. : Object(ConstructWithPrototypeTag::Tag, prototype)
  15. {
  16. }
  17. void Segmenter::set_segmenter_granularity(StringView segmenter_granularity)
  18. {
  19. if (segmenter_granularity == "grapheme"sv)
  20. m_segmenter_granularity = SegmenterGranularity::Grapheme;
  21. else if (segmenter_granularity == "word"sv)
  22. m_segmenter_granularity = SegmenterGranularity::Word;
  23. else if (segmenter_granularity == "sentence"sv)
  24. m_segmenter_granularity = SegmenterGranularity::Sentence;
  25. else
  26. VERIFY_NOT_REACHED();
  27. }
  28. StringView Segmenter::segmenter_granularity_string() const
  29. {
  30. switch (m_segmenter_granularity) {
  31. case SegmenterGranularity::Grapheme:
  32. return "grapheme"sv;
  33. case SegmenterGranularity::Word:
  34. return "word"sv;
  35. case SegmenterGranularity::Sentence:
  36. return "sentence"sv;
  37. default:
  38. VERIFY_NOT_REACHED();
  39. }
  40. }
  41. // 18.7.1 CreateSegmentDataObject ( segmenter, string, startIndex, endIndex ), https://tc39.es/ecma402/#sec-createsegmentdataobject
  42. ThrowCompletionOr<NonnullGCPtr<Object>> create_segment_data_object(VM& vm, Segmenter const& segmenter, Utf16View const& string, double start_index, double end_index)
  43. {
  44. auto& realm = *vm.current_realm();
  45. // 1. Let len be the length of string.
  46. auto length = string.length_in_code_units();
  47. // 2. Assert: startIndex ≥ 0.
  48. VERIFY(start_index >= 0);
  49. // 3. Assert: endIndex ≤ len.
  50. VERIFY(end_index <= length);
  51. // 4. Assert: startIndex < endIndex.
  52. VERIFY(start_index < end_index);
  53. // 5. Let result be OrdinaryObjectCreate(%Object.prototype%).
  54. auto result = Object::create(realm, realm.intrinsics().object_prototype());
  55. // 6. Let segment be the substring of string from startIndex to endIndex.
  56. auto segment = string.substring_view(start_index, end_index - start_index);
  57. // 7. Perform ! CreateDataPropertyOrThrow(result, "segment", segment).
  58. MUST(result->create_data_property_or_throw(vm.names.segment, PrimitiveString::create(vm, TRY(Utf16String::create(vm, segment)))));
  59. // 8. Perform ! CreateDataPropertyOrThrow(result, "index", 𝔽(startIndex)).
  60. MUST(result->create_data_property_or_throw(vm.names.index, Value(start_index)));
  61. // 9. Perform ! CreateDataPropertyOrThrow(result, "input", string).
  62. MUST(result->create_data_property_or_throw(vm.names.input, PrimitiveString::create(vm, TRY(Utf16String::create(vm, string)))));
  63. // 10. Let granularity be segmenter.[[SegmenterGranularity]].
  64. auto granularity = segmenter.segmenter_granularity();
  65. // 11. If granularity is "word", then
  66. if (granularity == Segmenter::SegmenterGranularity::Word) {
  67. // a. Let isWordLike be a Boolean value indicating whether the segment in string is "word-like" according to locale segmenter.[[Locale]].
  68. // TODO
  69. // b. Perform ! CreateDataPropertyOrThrow(result, "isWordLike", isWordLike).
  70. MUST(result->create_data_property_or_throw(vm.names.isWordLike, Value(false)));
  71. }
  72. // 12. Return result.
  73. return result;
  74. }
  75. // 18.8.1 FindBoundary ( segmenter, string, startIndex, direction ), https://tc39.es/ecma402/#sec-findboundary
  76. double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction, Optional<Vector<size_t>>& boundaries_cache)
  77. {
  78. // 1. Let locale be segmenter.[[Locale]].
  79. auto const& locale = segmenter.locale();
  80. // 2. Let granularity be segmenter.[[SegmenterGranularity]].
  81. auto granularity = segmenter.segmenter_granularity();
  82. // 3. Let len be the length of string.
  83. auto length = string.length_in_code_units();
  84. // Non-standard, populate boundaries cache
  85. if (!boundaries_cache.has_value()) {
  86. switch (granularity) {
  87. case Segmenter::SegmenterGranularity::Grapheme:
  88. boundaries_cache = Unicode::find_grapheme_segmentation_boundaries(string);
  89. break;
  90. case Segmenter::SegmenterGranularity::Word:
  91. boundaries_cache = Unicode::find_word_segmentation_boundaries(string);
  92. break;
  93. case Segmenter::SegmenterGranularity::Sentence:
  94. boundaries_cache = Unicode::find_sentence_segmentation_boundaries(string);
  95. break;
  96. default:
  97. VERIFY_NOT_REACHED();
  98. }
  99. }
  100. (void)locale; // TODO: Support locale-sensitive boundaries
  101. // 4. If direction is before, then
  102. if (direction == Direction::Before) {
  103. // a. Assert: startIndex ≥ 0.
  104. VERIFY(start_index >= 0);
  105. // b. Assert: startIndex < len.
  106. VERIFY(start_index < length);
  107. // c. Search string for the last segmentation boundary that is preceded by at most startIndex code units from the beginning, using locale locale and text element granularity granularity.
  108. size_t boundary_index;
  109. binary_search(*boundaries_cache, start_index, &boundary_index);
  110. // d. If a boundary is found, return the count of code units in string preceding it.
  111. if (boundary_index < boundaries_cache->size())
  112. return boundaries_cache->at(boundary_index);
  113. // e. Return 0.
  114. return 0;
  115. }
  116. // 5. Assert: direction is after.
  117. VERIFY(direction == Direction::After);
  118. // 6. If len is 0 or startIndex ≥ len, return +∞.
  119. if (length == 0 || start_index >= length)
  120. return INFINITY;
  121. // 7. Search string for the first segmentation boundary that follows the code unit at index startIndex, using locale locale and text element granularity granularity.
  122. size_t boundary_index;
  123. binary_search(*boundaries_cache, start_index, &boundary_index);
  124. ++boundary_index;
  125. // 8. If a boundary is found, return the count of code units in string preceding it.
  126. if (boundary_index < boundaries_cache->size())
  127. return boundaries_cache->at(boundary_index);
  128. // 9. Return len.
  129. return length;
  130. }
  131. }