Segmenter.cpp 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. /*
  2. * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
  3. * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Utf16View.h>
  8. #include <LibJS/Runtime/GlobalObject.h>
  9. #include <LibJS/Runtime/Intl/Segmenter.h>
  10. #include <LibUnicode/Segmentation.h>
  11. namespace JS::Intl {
  12. JS_DEFINE_ALLOCATOR(Segmenter);
  13. // 18 Segmenter Objects, https://tc39.es/ecma402/#segmenter-objects
  14. Segmenter::Segmenter(Object& prototype)
  15. : Object(ConstructWithPrototypeTag::Tag, prototype)
  16. {
  17. }
  18. void Segmenter::set_segmenter_granularity(StringView segmenter_granularity)
  19. {
  20. if (segmenter_granularity == "grapheme"sv)
  21. m_segmenter_granularity = SegmenterGranularity::Grapheme;
  22. else if (segmenter_granularity == "word"sv)
  23. m_segmenter_granularity = SegmenterGranularity::Word;
  24. else if (segmenter_granularity == "sentence"sv)
  25. m_segmenter_granularity = SegmenterGranularity::Sentence;
  26. else
  27. VERIFY_NOT_REACHED();
  28. }
  29. StringView Segmenter::segmenter_granularity_string() const
  30. {
  31. switch (m_segmenter_granularity) {
  32. case SegmenterGranularity::Grapheme:
  33. return "grapheme"sv;
  34. case SegmenterGranularity::Word:
  35. return "word"sv;
  36. case SegmenterGranularity::Sentence:
  37. return "sentence"sv;
  38. default:
  39. VERIFY_NOT_REACHED();
  40. }
  41. }
  42. // 18.7.1 CreateSegmentDataObject ( segmenter, string, startIndex, endIndex ), https://tc39.es/ecma402/#sec-createsegmentdataobject
  43. ThrowCompletionOr<NonnullGCPtr<Object>> create_segment_data_object(VM& vm, Segmenter const& segmenter, Utf16View const& string, double start_index, double end_index)
  44. {
  45. auto& realm = *vm.current_realm();
  46. // 1. Let len be the length of string.
  47. auto length = string.length_in_code_units();
  48. // 2. Assert: startIndex ≥ 0.
  49. VERIFY(start_index >= 0);
  50. // 3. Assert: endIndex ≤ len.
  51. VERIFY(end_index <= length);
  52. // 4. Assert: startIndex < endIndex.
  53. VERIFY(start_index < end_index);
  54. // 5. Let result be OrdinaryObjectCreate(%Object.prototype%).
  55. auto result = Object::create(realm, realm.intrinsics().object_prototype());
  56. // 6. Let segment be the substring of string from startIndex to endIndex.
  57. auto segment = string.substring_view(start_index, end_index - start_index);
  58. // 7. Perform ! CreateDataPropertyOrThrow(result, "segment", segment).
  59. MUST(result->create_data_property_or_throw(vm.names.segment, PrimitiveString::create(vm, Utf16String::create(segment))));
  60. // 8. Perform ! CreateDataPropertyOrThrow(result, "index", 𝔽(startIndex)).
  61. MUST(result->create_data_property_or_throw(vm.names.index, Value(start_index)));
  62. // 9. Perform ! CreateDataPropertyOrThrow(result, "input", string).
  63. MUST(result->create_data_property_or_throw(vm.names.input, PrimitiveString::create(vm, Utf16String::create(string))));
  64. // 10. Let granularity be segmenter.[[SegmenterGranularity]].
  65. auto granularity = segmenter.segmenter_granularity();
  66. // 11. If granularity is "word", then
  67. if (granularity == Segmenter::SegmenterGranularity::Word) {
  68. // a. Let isWordLike be a Boolean value indicating whether the segment in string is "word-like" according to locale segmenter.[[Locale]].
  69. // TODO
  70. // b. Perform ! CreateDataPropertyOrThrow(result, "isWordLike", isWordLike).
  71. MUST(result->create_data_property_or_throw(vm.names.isWordLike, Value(false)));
  72. }
  73. // 12. Return result.
  74. return result;
  75. }
  76. static Optional<size_t> find_previous_boundary_index(Utf16View const& string, size_t index, Segmenter::SegmenterGranularity granularity)
  77. {
  78. switch (granularity) {
  79. case Segmenter::SegmenterGranularity::Grapheme:
  80. return Unicode::previous_grapheme_segmentation_boundary(string, index);
  81. case Segmenter::SegmenterGranularity::Word:
  82. return Unicode::previous_word_segmentation_boundary(string, index);
  83. case Segmenter::SegmenterGranularity::Sentence:
  84. return Unicode::previous_sentence_segmentation_boundary(string, index);
  85. }
  86. VERIFY_NOT_REACHED();
  87. }
  88. static Optional<size_t> find_next_boundary_index(Utf16View const& string, size_t index, Segmenter::SegmenterGranularity granularity)
  89. {
  90. switch (granularity) {
  91. case Segmenter::SegmenterGranularity::Grapheme:
  92. return Unicode::next_grapheme_segmentation_boundary(string, index);
  93. case Segmenter::SegmenterGranularity::Word:
  94. return Unicode::next_word_segmentation_boundary(string, index);
  95. case Segmenter::SegmenterGranularity::Sentence:
  96. return Unicode::next_sentence_segmentation_boundary(string, index);
  97. }
  98. VERIFY_NOT_REACHED();
  99. }
  100. // 18.8.1 FindBoundary ( segmenter, string, startIndex, direction ), https://tc39.es/ecma402/#sec-findboundary
  101. double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction)
  102. {
  103. // 1. Let locale be segmenter.[[Locale]].
  104. // FIXME: Support locale-sensitive boundaries
  105. // 2. Let granularity be segmenter.[[SegmenterGranularity]].
  106. auto granularity = segmenter.segmenter_granularity();
  107. // 3. Let len be the length of string.
  108. auto length = string.length_in_code_units();
  109. // 4. If direction is before, then
  110. if (direction == Direction::Before) {
  111. // a. Assert: startIndex ≥ 0.
  112. VERIFY(start_index >= 0);
  113. // b. Assert: startIndex < len.
  114. VERIFY(start_index < length);
  115. // c. Search string for the last segmentation boundary that is preceded by at most startIndex code units from the beginning, using locale locale and text element granularity granularity.
  116. auto boundary_index = find_previous_boundary_index(string, static_cast<size_t>(start_index) + 1, granularity);
  117. // d. If a boundary is found, return the count of code units in string preceding it.
  118. if (boundary_index.has_value())
  119. return static_cast<double>(*boundary_index);
  120. // e. Return 0.
  121. return 0;
  122. }
  123. // 5. Assert: direction is after.
  124. VERIFY(direction == Direction::After);
  125. // 6. If len is 0 or startIndex ≥ len, return +∞.
  126. if (length == 0 || start_index >= length)
  127. return INFINITY;
  128. // 7. Search string for the first segmentation boundary that follows the code unit at index startIndex, using locale locale and text element granularity granularity.
  129. auto boundary_index = find_next_boundary_index(string, static_cast<size_t>(start_index), granularity);
  130. // 8. If a boundary is found, return the count of code units in string preceding it.
  131. if (boundary_index.has_value())
  132. return static_cast<double>(*boundary_index);
  133. // 9. Return len.
  134. return length;
  135. }
  136. }